class
cudaFlowCapturer
class to create a cudaFlow graph using stream capture
Contents
The usage of tf::cudaFlowCapturer is illustrated by the following example, which captures two kernel tasks, task_1 and task_2, where task_1 runs before task_2.
taskflow.emplace([](tf::cudaFlowCapturer& capturer){
  // capture my_kernel_1 through the given stream managed by the capturer
  auto task_1 = capturer.on([&](cudaStream_t stream){
    my_kernel_1<<<grid_1, block_1, shm_size_1, stream>>>(my_parameters_1);
  });
  // capture my_kernel_2 through the given stream managed by the capturer
  auto task_2 = capturer.on([&](cudaStream_t stream){
    my_kernel_2<<<grid_2, block_2, shm_size_2, stream>>>(my_parameters_2);
  });
  task_1.precede(task_2);
});
Similar to tf::cudaFlow, a tf::cudaFlowCapturer is created as a task of a taskflow, as the example above shows.
Please refer to GPU Tasking (cudaFlowCapturer) for details.
Constructors, destructors, conversion operators
- cudaFlowCapturer()
- constructs a standalone cudaFlowCapturer
- ~cudaFlowCapturer() virtual
- destructs the cudaFlowCapturer
Public functions
- auto empty() const -> bool
- queries the emptiness of the graph
- auto num_tasks() const -> size_t
- queries the number of tasks
- void clear()
- clears this cudaFlow capturer
-
void dump(std::ostream& os) const
- dumps the capture graph into a DOT format through an output stream
-
template<typename OPT, typename... ArgsT>auto make_optimizer(ArgsT && ... args) -> OPT&
- selects a different optimization algorithm
-
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>auto on(C&& callable) -> cudaTask
- captures a sequence of CUDA operations from the given callable
-
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>void on(cudaTask task, C&& callable)
- updates a capture task to another sequence of CUDA operations
- auto noop() -> cudaTask
- captures a no-operation task
- void noop(cudaTask task)
- updates a task to a no-operation task
- auto memcpy(void* dst, const void* src, size_t count) -> cudaTask
- copies data between host and device asynchronously through a stream
- void memcpy(cudaTask task, void* dst, const void* src, size_t count)
- updates a capture task to a memcpy operation
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto copy(T* tgt, const T* src, size_t num) -> cudaTask
- captures a copy task of typed data
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>void copy(cudaTask task, T* tgt, const T* src, size_t num)
- updates a capture task to a copy operation
- auto memset(void* ptr, int v, size_t n) -> cudaTask
- initializes or sets GPU memory to the given value byte by byte
- void memset(cudaTask task, void* ptr, int value, size_t n)
- updates a capture task to a memset operation
-
template<typename F, typename... ArgsT>auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT && ... args) -> cudaTask
- captures a kernel
-
template<typename F, typename... ArgsT>void kernel(cudaTask task, dim3 g, dim3 b, size_t s, F f, ArgsT && ... args)
- updates a capture task to a kernel operation
-
template<typename C>auto single_task(C c) -> cudaTask
- captures a kernel that runs the given callable with only one thread
-
template<typename C>void single_task(cudaTask task, C c)
- updates a capture task to a single-threaded kernel
-
template<typename I, typename C>auto for_each(I first, I last, C callable) -> cudaTask
- captures a kernel that applies a callable to each dereferenced element of the data array
-
template<typename I, typename C>void for_each(cudaTask task, I first, I last, C callable)
- updates a capture task to a for-each kernel task
-
template<typename I, typename C>auto for_each_index(I first, I last, I step, C callable) -> cudaTask
- captures a kernel that applies a callable to each index in the range with the step size
-
template<typename I, typename C>void for_each_index(cudaTask task, I first, I last, I step, C callable)
- updates a capture task to a for-each-index kernel task
-
template<typename I, typename O, typename C>auto transform(I first, I last, O output, C op) -> cudaTask
- captures a kernel that transforms an input range to an output range
-
template<typename I, typename O, typename C>void transform(cudaTask task, I first, I last, O output, C op)
- updates a capture task to a transform kernel task
-
template<typename I1, typename I2, typename O, typename C>auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
- captures a kernel that transforms two input ranges to an output range
-
template<typename I1, typename I2, typename O, typename C>void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C op)
- updates a capture task to a transform kernel task
-
template<typename I, typename T, typename C>auto reduce(I first, I last, T* result, C op) -> cudaTask
- captures kernels that perform parallel reduction over a range of items
-
template<typename I, typename T, typename C>void reduce(cudaTask task, I first, I last, T* result, C op)
- updates a capture task to a reduction task
-
template<typename I, typename T, typename C>auto uninitialized_reduce(I first, I last, T* result, C op) -> cudaTask
- similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
-
template<typename I, typename T, typename C>void uninitialized_reduce(cudaTask task, I first, I last, T* result, C op)
- updates a capture task to an uninitialized-reduction task
-
template<typename I, typename T, typename C, typename U>auto transform_reduce(I first, I last, T* result, C bop, U uop) -> cudaTask
- captures kernels that perform parallel reduction over a range of transformed items
-
template<typename I, typename T, typename C, typename U>void transform_reduce(cudaTask task, I first, I last, T* result, C bop, U uop)
- updates a capture task to a transform-reduce task
-
template<typename I, typename T, typename C, typename U>auto transform_uninitialized_reduce(I first, I last, T* result, C bop, U uop) -> cudaTask
- similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
-
template<typename I, typename T, typename C, typename U>void transform_uninitialized_reduce(cudaTask task, I first, I last, T* result, C bop, U uop)
- updates a capture task to a transform-reduce task of no initialized value
-
template<typename I, typename O, typename C>auto inclusive_scan(I first, I last, O output, C op) -> cudaTask
- captures kernels that perform parallel inclusive scan over a range of items
-
template<typename I, typename O, typename C>void inclusive_scan(cudaTask task, I first, I last, O output, C op)
- updates a capture task to an inclusive scan task
-
template<typename I, typename O, typename C>auto exclusive_scan(I first, I last, O output, C op) -> cudaTask
- similar to cudaFlowCapturer::inclusive_scan but excludes the first value
-
template<typename I, typename O, typename C>void exclusive_scan(cudaTask task, I first, I last, O output, C op)
- updates a capture task to an exclusive scan task
-
template<typename I, typename O, typename B, typename U>auto transform_inclusive_scan(I first, I last, O output, B bop, U uop) -> cudaTask
- captures kernels that perform parallel inclusive scan over a range of transformed items
-
template<typename I, typename O, typename B, typename U>void transform_inclusive_scan(cudaTask task, I first, I last, O output, B bop, U uop)
- updates a capture task to a transform-inclusive scan task
-
template<typename I, typename O, typename B, typename U>auto transform_exclusive_scan(I first, I last, O output, B bop, U uop) -> cudaTask
- similar to cudaFlowCapturer::transform_inclusive_scan but excludes the first value
-
template<typename I, typename O, typename B, typename U>void transform_exclusive_scan(cudaTask task, I first, I last, O output, B bop, U uop)
- updates a capture task to a transform-exclusive scan task
-
template<typename A, typename B, typename C, typename Comp>auto merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp) -> cudaTask
- captures kernels that perform parallel merge on two sorted arrays
-
template<typename A, typename B, typename C, typename Comp>void merge(cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
- updates a capture task to a merge task
-
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>auto merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp) -> cudaTask
- captures kernels that perform parallel key-value merge
-
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>void merge_by_key(cudaTask task, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
- updates a capture task to a key-value merge task
-
template<typename I, typename C>auto sort(I first, I last, C comp) -> cudaTask
- captures kernels that sort the given array
-
template<typename I, typename C>void sort(cudaTask task, I first, I last, C comp)
- updates a capture task to a sort task
-
template<typename K_it, typename V_it, typename C>auto sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp) -> cudaTask
- captures kernels that sort the given key and value arrays by key
-
template<typename K_it, typename V_it, typename C>void sort_by_key(cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp)
- updates a capture task to a key-value sort task
-
template<typename I, typename U>auto find_if(I first, I last, unsigned* idx, U op) -> cudaTask
- creates a task to find the index of the first element in a range
-
template<typename I, typename U>void find_if(cudaTask task, I first, I last, unsigned* idx, U op)
- updates the parameters of a find-if task
-
template<typename I, typename O>auto min_element(I first, I last, unsigned* idx, O op) -> cudaTask
- finds the index of the minimum element in a range
-
template<typename I, typename O>void min_element(cudaTask task, I first, I last, unsigned* idx, O op)
- updates the parameters of a min-element task
-
template<typename I, typename O>auto max_element(I first, I last, unsigned* idx, O op) -> cudaTask
- finds the index of the maximum element in a range
-
template<typename I, typename O>void max_element(cudaTask task, I first, I last, unsigned* idx, O op)
- updates the parameters of a max-element task
-
template<typename P>void offload_until(P&& predicate)
- offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
- void offload_n(size_t n)
- offloads the captured cudaFlow and executes it the given number of times
- void offload()
- offloads the captured cudaFlow and executes it once
Function documentation
tf:: cudaFlowCapturer:: cudaFlowCapturer()
constructs a standalone cudaFlowCapturer
A standalone cudaFlow capturer does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::cudaFlowCapturer::offload).
template<typename OPT, typename... ArgsT>
OPT& tf:: cudaFlowCapturer:: make_optimizer(ArgsT && ... args)
selects a different optimization algorithm
Template parameters | |
---|---|
OPT | optimizer type |
ArgsT | arguments types |
Parameters | |
args | arguments to forward to construct the optimizer |
Returns | a reference to the optimizer |
We currently support the following optimization algorithms to capture a user-described cudaFlow:
By default, tf::
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
cudaTask tf:: cudaFlowCapturer:: on(C&& callable)
captures a sequence of CUDA operations from the given callable
Template parameters | |
---|---|
C | callable type constructible with std::function<void(cudaStream_t)> |
Parameters | |
callable | a callable to capture CUDA operations with the stream |
This method applies a stream created by the flow to capture a sequence of CUDA operations defined in the callable.
template<typename C, std::enable_if_t<std::is_invocable_r_v<void, C, cudaStream_t>, void>* = nullptr>
void tf:: cudaFlowCapturer:: on(cudaTask task,
C&& callable)
updates a capture task to another sequence of CUDA operations
The method is similar to cudaFlowCapturer::on but operates on an existing task.
cudaTask tf:: cudaFlowCapturer:: noop()
captures a no-operation task
Returns | a tf::cudaTask handle |
---|
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n
nodes with a barrier between them can be represented using an empty node and 2*n
dependency edges, rather than no empty node and n^2
dependency edges.
void tf:: cudaFlowCapturer:: noop(cudaTask task)
updates a task to a no-operation task
The method is similar to tf::cudaFlowCapturer::noop but operates on an existing task.
cudaTask tf:: cudaFlowCapturer:: memcpy(void* dst,
const void* src,
size_t count)
copies data between host and device asynchronously through a stream
Parameters | |
---|---|
dst | destination memory address |
src | source memory address |
count | size in bytes to copy |
The method captures a cudaMemcpyAsync
operation through an internal stream.
void tf:: cudaFlowCapturer:: memcpy(cudaTask task,
void* dst,
const void* src,
size_t count)
updates a capture task to a memcpy operation
The method is similar to cudaFlowCapturer::memcpy but operates on an existing task.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cudaFlowCapturer:: copy(T* tgt,
const T* src,
size_t num)
captures a copy task of typed data
Template parameters | |
---|---|
T | element type (non-void) |
Parameters | |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
Returns | cudaTask handle |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf:: cudaFlowCapturer:: copy(cudaTask task,
T* tgt,
const T* src,
size_t num)
updates a capture task to a copy operation
The method is similar to cudaFlowCapturer::copy but operates on an existing task.
cudaTask tf:: cudaFlowCapturer:: memset(void* ptr,
int v,
size_t n)
initializes or sets GPU memory to the given value byte by byte
Parameters | |
---|---|
ptr | pointer to GPU memory |
v | value to set for each byte of the specified memory |
n | size in bytes to set |
The method captures a cudaMemsetAsync
operation through an internal stream to fill the first n
bytes of the memory area pointed to by ptr
with the constant byte value v
.
void tf:: cudaFlowCapturer:: memset(cudaTask task,
void* ptr,
int value,
size_t n)
updates a capture task to a memset operation
The method is similar to cudaFlowCapturer::memset but operates on an existing task.
template<typename F, typename... ArgsT>
cudaTask tf:: cudaFlowCapturer:: kernel(dim3 g,
dim3 b,
size_t s,
F f,
ArgsT && ... args)
captures a kernel
Template parameters | |
---|---|
F | kernel function type |
ArgsT | kernel function parameters type |
Parameters | |
g | configured grid |
b | configured block |
s | configured shared memory size in bytes |
f | kernel function |
args | arguments to forward to the kernel function by copy |
Returns | cudaTask handle |
template<typename F, typename... ArgsT>
void tf:: cudaFlowCapturer:: kernel(cudaTask task,
dim3 g,
dim3 b,
size_t s,
F f,
ArgsT && ... args)
updates a capture task to a kernel operation
The method is similar to cudaFlowCapturer::kernel but operates on an existing task.
template<typename C>
cudaTask tf:: cudaFlowCapturer:: single_task(C c)
captures a kernel that runs the given callable with only one thread
Template parameters | |
---|---|
C | callable type |
Parameters | |
c | callable to run by a single kernel thread |
template<typename C>
void tf:: cudaFlowCapturer:: single_task(cudaTask task,
C c)
updates a capture task to a single-threaded kernel
This method is similar to cudaFlowCapturer::single_task but operates on an existing task.
template<typename I, typename C>
cudaTask tf:: cudaFlowCapturer:: for_each(I first,
I last,
C callable)
captures a kernel that applies a callable to each dereferenced element of the data array
Template parameters | |
---|---|
I | iterator type |
C | callable type |
Parameters | |
first | iterator to the beginning |
last | iterator to the end |
callable | a callable object to apply to the dereferenced iterator |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { callable(*itr); }
template<typename I, typename C>
void tf:: cudaFlowCapturer:: for_each(cudaTask task,
I first,
I last,
C callable)
updates a capture task to a for-each kernel task
This method is similar to cudaFlowCapturer::for_each but operates on an existing task.
template<typename I, typename C>
cudaTask tf:: cudaFlowCapturer:: for_each_index(I first,
I last,
I step,
C callable)
captures a kernel that applies a callable to each index in the range with the step size
Template parameters | |
---|---|
I | index type |
C | callable type |
Parameters | |
first | beginning index |
last | last index |
step | step size |
callable | the callable to apply to each index in the range |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
// step is positive [first, last)
for(auto i=first; i<last; i+=step) { callable(i); }
// step is negative [first, last)
for(auto i=first; i>last; i+=step) { callable(i); }
template<typename I, typename C>
void tf:: cudaFlowCapturer:: for_each_index(cudaTask task,
I first,
I last,
I step,
C callable)
updates a capture task to a for-each-index kernel task
This method is similar to cudaFlowCapturer::for_each_index but operates on an existing task.
template<typename I, typename O, typename C>
cudaTask tf:: cudaFlowCapturer:: transform(I first,
I last,
O output,
C op)
captures a kernel that transforms an input range to an output range
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
C | unary operator type |
Parameters | |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | unary operator to apply to transform each item in the range |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = op(*first++); }
template<typename I, typename O, typename C>
void tf:: cudaFlowCapturer:: transform(cudaTask task,
I first,
I last,
O output,
C op)
updates a capture task to a transform kernel task
This method is similar to cudaFlowCapturer::transform but operates on an existing task.
template<typename I1, typename I2, typename O, typename C>
cudaTask tf:: cudaFlowCapturer:: transform(I1 first1,
I1 last1,
I2 first2,
O output,
C op)
captures a kernel that transforms two input ranges to an output range
Template parameters | |
---|---|
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
first1 | iterator to the beginning of the input range |
last1 | iterator to the end of the input range |
first2 | iterator to the beginning of the second input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items in the two input ranges |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }
template<typename I1, typename I2, typename O, typename C>
void tf:: cudaFlowCapturer:: transform(cudaTask task,
I1 first1,
I1 last1,
I2 first2,
O output,
C op)
updates a capture task to a transform kernel task
This method is similar to cudaFlowCapturer::transform but operates on an existing task.
template<typename I, typename T, typename C>
cudaTask tf:: cudaFlowCapturer:: reduce(I first,
I last,
T* result,
C op)
captures kernels that perform parallel reduction over a range of items
Template parameters | |
---|---|
I | input iterator type |
T | value type |
C | binary operator type |
Parameters | |
first | iterator to the beginning |
last | iterator to the end |
result | pointer to the result with an initialized value |
op | binary reduction operator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = op(*result, *first++); }
template<typename I, typename T, typename C>
void tf:: cudaFlowCapturer:: reduce(cudaTask task,
I first,
I last,
T* result,
C op)
updates a capture task to a reduction task
This method is similar to cudaFlowCapturer::reduce but operates on an existing task.
template<typename I, typename T, typename C>
cudaTask tf:: cudaFlowCapturer:: uninitialized_reduce(I first,
I last,
T* result,
C op)
similar to tf::cudaFlowCapturer::reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = *first++;  // the initial value is not involved in the loop
while (first != last) { *result = op(*result, *first++); }
template<typename I, typename T, typename C>
void tf:: cudaFlowCapturer:: uninitialized_reduce(cudaTask task,
I first,
I last,
T* result,
C op)
updates a capture task to an uninitialized-reduction task
This method is similar to cudaFlowCapturer::uninitialized_reduce but operates on an existing task.
template<typename I, typename T, typename C, typename U>
cudaTask tf:: cudaFlowCapturer:: transform_reduce(I first,
I last,
T* result,
C bop,
U uop)
captures kernels that perform parallel reduction over a range of transformed items
Template parameters | |
---|---|
I | input iterator type |
T | value type |
C | binary operator type |
U | unary operator type |
Parameters | |
first | iterator to the beginning |
last | iterator to the end |
result | pointer to the result with an initialized value |
bop | binary reduce operator |
uop | unary transform operator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = bop(*result, uop(*first++)); }
template<typename I, typename T, typename C, typename U>
void tf:: cudaFlowCapturer:: transform_reduce(cudaTask task,
I first,
I last,
T* result,
C bop,
U uop)
updates a capture task to a transform-reduce task
This method is similar to cudaFlowCapturer::transform_reduce but operates on an existing task.
template<typename I, typename T, typename C, typename U>
cudaTask tf:: cudaFlowCapturer:: transform_uninitialized_reduce(I first,
I last,
T* result,
C bop,
U uop)
similar to tf::cudaFlowCapturer::transform_reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = uop(*first++);  // the initial value is not involved in the loop
while (first != last) { *result = bop(*result, uop(*first++)); }
template<typename I, typename T, typename C, typename U>
void tf:: cudaFlowCapturer:: transform_uninitialized_reduce(cudaTask task,
I first,
I last,
T* result,
C bop,
U uop)
updates a capture task to a transform-reduce task of no initialized value
This method is similar to cudaFlowCapturer::transform_uninitialized_reduce but operates on an existing task.
template<typename I, typename O, typename C>
cudaTask tf:: cudaFlowCapturer:: inclusive_scan(I first,
I last,
O output,
C op)
captures kernels that perform parallel inclusive scan over a range of items
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
op | binary operator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(size_t i=0; i<std::distance(first, last); i++) { *(output + i) = i ? op(*(first+i), *(output+i-1)) : *(first+i); }
template<typename I, typename O, typename C>
void tf:: cudaFlowCapturer:: inclusive_scan(cudaTask task,
I first,
I last,
O output,
C op)
updates a capture task to an inclusive scan task
This method is similar to cudaFlowCapturer::inclusive_scan but operates on an existing task.
template<typename I, typename O, typename C>
void tf:: cudaFlowCapturer:: exclusive_scan(cudaTask task,
I first,
I last,
O output,
C op)
updates a capture task to an exclusive scan task
This method is similar to cudaFlowCapturer::exclusive_scan but operates on an existing task.
template<typename I, typename O, typename B, typename U>
cudaTask tf:: cudaFlowCapturer:: transform_inclusive_scan(I first,
I last,
O output,
B bop,
U uop)
captures kernels that perform parallel inclusive scan over a range of transformed items
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
B | binary operator type |
U | unary operator type |
Parameters | |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
bop | binary operator |
uop | unary operator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(size_t i=0; i<std::distance(first, last); i++) { *(output + i) = i ? bop(uop(*(first+i)), *(output+i-1)) : uop(*(first+i)); }
template<typename I, typename O, typename B, typename U>
void tf:: cudaFlowCapturer:: transform_inclusive_scan(cudaTask task,
I first,
I last,
O output,
B bop,
U uop)
updates a capture task to a transform-inclusive scan task
This method is similar to cudaFlowCapturer::transform_inclusive_scan but operates on an existing task.
template<typename I, typename O, typename B, typename U>
void tf:: cudaFlowCapturer:: transform_exclusive_scan(cudaTask task,
I first,
I last,
O output,
B bop,
U uop)
updates a capture task to a transform-exclusive scan task
This method is similar to cudaFlowCapturer::transform_exclusive_scan but operates on an existing task.
template<typename A, typename B, typename C, typename Comp>
cudaTask tf:: cudaFlowCapturer:: merge(A a_first,
A a_last,
B b_first,
B b_last,
C c_first,
Comp comp)
captures kernels that perform parallel merge on two sorted arrays
Template parameters | |
---|---|
A | iterator type of the first input array |
B | iterator type of the second input array |
C | iterator type of the output array |
Comp | comparator type |
Parameters | |
a_first | iterator to the beginning of the first input array |
a_last | iterator to the end of the first input array |
b_first | iterator to the beginning of the second input array |
b_last | iterator to the end of the second input array |
c_first | iterator to the beginning of the output array |
comp | binary comparator |
Returns | a tf::cudaTask handle |
Merges two sorted ranges [a_first, a_last)
and [b_first, b_last)
into one sorted range beginning at c_first
.
A sequence is said to be sorted with respect to a comparator comp
if for any iterator it pointing to the sequence and any non-negative integer n
such that it + n
is a valid iterator pointing to an element of the sequence, comp(*(it + n), *it)
evaluates to false
.
template<typename A, typename B, typename C, typename Comp>
void tf:: cudaFlowCapturer:: merge(cudaTask task,
A a_first,
A a_last,
B b_first,
B b_last,
C c_first,
Comp comp)
updates a capture task to a merge task
This method is similar to cudaFlowCapturer::merge but operates on an existing task.
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
cudaTask tf:: cudaFlowCapturer:: merge_by_key(a_keys_it a_keys_first,
a_keys_it a_keys_last,
a_vals_it a_vals_first,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
b_vals_it b_vals_first,
c_keys_it c_keys_first,
c_vals_it c_vals_first,
C comp)
captures kernels that perform parallel key-value merge
Template parameters | |
---|---|
a_keys_it | first key iterator type |
a_vals_it | first value iterator type |
b_keys_it | second key iterator type |
b_vals_it | second value iterator type |
c_keys_it | output key iterator type |
c_vals_it | output value iterator type |
C | comparator type |
Parameters | |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
a_vals_first | iterator to the beginning of the first value range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
b_vals_first | iterator to the beginning of the second value range |
c_keys_first | iterator to the beginning of the output key range |
c_vals_first | iterator to the beginning of the output value range |
comp | comparator |
Performs a key-value merge that copies elements from [a_keys_first, a_keys_last)
and [b_keys_first, b_keys_last)
into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending key order.
At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first))
and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first))
into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending order implied by each input element's associated key.
For example, assume:
a_keys
={1, 8}
a_vals
={2, 1}
b_keys
={3, 7}
b_vals
={3, 4}
After the merge, we have:
c_keys
={1, 3, 7, 8}
c_vals
={2, 3, 4, 1}
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void tf:: cudaFlowCapturer:: merge_by_key(cudaTask task,
a_keys_it a_keys_first,
a_keys_it a_keys_last,
a_vals_it a_vals_first,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
b_vals_it b_vals_first,
c_keys_it c_keys_first,
c_vals_it c_vals_first,
C comp)
updates a capture task to a key-value merge task
This method is similar to tf::cudaFlowCapturer::merge_by_key but operates on an existing task.
template<typename I, typename C>
cudaTask tf:: cudaFlowCapturer:: sort(I first,
I last,
C comp)
captures kernels that sort the given array
Template parameters | |
---|---|
I | iterator type of the input array |
C | comparator type |
Parameters | |
first | iterator to the beginning of the input array |
last | iterator to the end of the input array |
comp | binary comparator |
Returns | a tf::cudaTask handle |
Sorts elements in the range [first, last)
with the given comparator.
template<typename I, typename C>
void tf:: cudaFlowCapturer:: sort(cudaTask task,
I first,
I last,
C comp)
updates a capture task to a sort task
This method is similar to cudaFlowCapturer::sort but operates on an existing task.
template<typename K_it, typename V_it, typename C>
cudaTask tf:: cudaFlowCapturer:: sort_by_key(K_it k_first,
K_it k_last,
V_it v_first,
C comp)
captures kernels that sort the given key and value arrays by key
Template parameters | |
---|---|
K_it | iterator type of the key |
V_it | iterator type of the value |
C | comparator type |
Parameters | |
k_first | iterator to the beginning of the key array |
k_last | iterator to the end of the key array |
v_first | iterator to the beginning of the value array |
comp | binary comparator |
Returns | a tf::cudaTask handle |
Sorts key-value elements in [k_first, k_last)
and [v_first, v_first + (k_last - k_first))
into ascending key order using the given comparator comp
. If i
and j
are any two valid iterators in [k_first, k_last)
such that i
precedes j
, and p
and q
are iterators in [v_first, v_first + (k_last - k_first))
corresponding to i
and j
respectively, then comp(*j, *i)
evaluates to false
.
For example, assume:
keys
are{1, 4, 2, 8, 5, 7}
values
are{'a', 'b', 'c', 'd', 'e', 'f'}
After sort:
keys
are{1, 2, 4, 5, 7, 8}
values
are{'a', 'c', 'b', 'e', 'f', 'd'}
template<typename K_it, typename V_it, typename C>
void tf:: cudaFlowCapturer:: sort_by_key(cudaTask task,
K_it k_first,
K_it k_last,
V_it v_first,
C comp)
updates a capture task to a key-value sort task
This method is similar to tf::cudaFlowCapturer::sort_by_key but operates on an existing task.
template<typename I, typename U>
cudaTask tf:: cudaFlowCapturer:: find_if(I first,
I last,
unsigned* idx,
U op)
creates a task to find the index of the first element in a range
Template parameters | |
---|---|
I | input iterator type |
U | unary operator type |
Parameters | |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | pointer to the index of the found element |
op | unary operator which returns true for the required element |
Finds the index idx
of the first element in the range [first, last)
such that op(*(first+idx))
is true. This is equivalent to the parallel execution of the following loop:
unsigned idx = 0; for(; first != last; ++first, ++idx) { if (op(*first)) { return idx; } } return idx;
template<typename I, typename U>
void tf:: cudaFlowCapturer:: find_if(cudaTask task,
I first,
I last,
unsigned* idx,
U op)
updates the parameters of a find-if task
This method is similar to tf::cudaFlowCapturer::find_if but operates on an existing task.
template<typename I, typename O>
cudaTask tf:: cudaFlowCapturer:: min_element(I first,
I last,
unsigned* idx,
O op)
finds the index of the minimum element in a range
Template parameters | |
---|---|
I | input iterator type |
O | comparator type |
Parameters | |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the minimum element |
op | comparison function object |
The function launches kernels asynchronously to find the smallest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
if(first == last) { return 0; } auto smallest = first; for (auto itr = std::next(first); itr != last; ++itr) { if (op(*itr, *smallest)) { smallest = itr; } } return std::distance(first, smallest);
template<typename I, typename O>
void tf:: cudaFlowCapturer:: min_element(cudaTask task,
I first,
I last,
unsigned* idx,
O op)
updates the parameters of a min-element task
This method is similar to cudaFlowCapturer::min_element but operates on an existing task.
template<typename I, typename O>
cudaTask tf:: cudaFlowCapturer:: max_element(I first,
I last,
unsigned* idx,
O op)
finds the index of the maximum element in a range
Template parameters | |
---|---|
I | input iterator type |
O | comparator type |
Parameters | |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the maximum element |
op | comparison function object |
The function launches kernels asynchronously to find the largest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
if(first == last) { return 0; } auto largest = first; for (auto itr = std::next(first); itr != last; ++itr) { if (op(*largest, *itr)) { largest = itr; } } return std::distance(first, largest);
template<typename I, typename O>
void tf:: cudaFlowCapturer:: max_element(cudaTask task,
I first,
I last,
unsigned* idx,
O op)
updates the parameters of a max-element task
This method is similar to cudaFlowCapturer::max_element but operates on an existing task.
template<typename P>
void tf:: cudaFlowCapturer:: offload_until(P&& predicate)
offloads the captured cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Template parameters | |
---|---|
P | predicate type (a binary callable) |
Parameters | |
predicate | a binary predicate (returns true for stop) |
Immediately offloads the cudaFlow captured so far onto a GPU and repeatedly runs it until the predicate returns true
.
By default, if users do not offload the cudaFlow capturer, the executor will offload it once.
void tf:: cudaFlowCapturer:: offload_n(size_t n)
offloads the captured cudaFlow and executes it the given number of times
Parameters | |
---|---|
n | number of executions |