class
cudaFlow: class to create a cudaFlow task dependency graph
Contents
A cudaFlow is a high-level interface over CUDA Graph to perform GPU operations using the task dependency graph model. The class provides a set of methods for creating and launching different tasks on one or multiple CUDA devices, for instance, kernel tasks, data transfer tasks, and memory operation tasks. The following example creates a cudaFlow of two kernel tasks, task1
and task2
, where task1
runs before task2
.
tf::Taskflow taskflow;
tf::Executor executor;
taskflow.emplace([&](tf::cudaFlow& cf){
  // create two kernel tasks
  tf::cudaTask task1 = cf.kernel(grid1, block1, shm_size1, kernel1, args1);
  tf::cudaTask task2 = cf.kernel(grid2, block2, shm_size2, kernel2, args2);
  // kernel1 runs before kernel2
  task1.precede(task2);
});
executor.run(taskflow).wait();
A cudaFlow is a task (tf::Task) created from tf::Taskflow and will be run by one worker thread in the executor.
Please refer to GPU Tasking (cudaFlow) for details.
Constructors, destructors, conversion operators
Public functions
- auto empty() const -> bool
- queries the emptiness of the graph
- auto num_tasks() const -> size_t
- queries the number of tasks
- void clear()
- clears the cudaFlow object
-
void dump(std::
ostream& os) const - dumps the cudaFlow graph into a DOT format through an output stream
-
void dump_native_graph(std::
ostream& os) const - dumps the native CUDA graph into a DOT format through an output stream
- auto noop() -> cudaTask
- creates a no-operation task
-
template<typename C>auto host(C&& callable) -> cudaTask
- creates a host task that runs a callable on the host
-
template<typename C>void host(cudaTask task, C&& callable)
- updates parameters of a host task
-
template<typename F, typename... ArgsT>auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT && ... args) -> cudaTask
- creates a kernel task
-
template<typename F, typename... ArgsT>void kernel(cudaTask task, dim3 g, dim3 b, size_t shm, F f, ArgsT && ... args)
- updates parameters of a kernel task
- auto memset(void* dst, int v, size_t count) -> cudaTask
- creates a memset task that fills untyped data with a byte value
- void memset(cudaTask task, void* dst, int ch, size_t count)
- updates parameters of a memset task
- auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask
- creates a memcpy task that copies untyped data in bytes
- void memcpy(cudaTask task, void* tgt, const void* src, size_t bytes)
- updates parameters of a memcpy task
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto zero(T* dst, size_t count) -> cudaTask
- creates a memset task that sets a typed memory block to zero
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>void zero(cudaTask task, T* dst, size_t count)
- updates parameters of a memset task to a zero task
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto fill(T* dst, T value, size_t count) -> cudaTask
- creates a memset task that fills a typed memory block with a value
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>void fill(cudaTask task, T* dst, T value, size_t count)
- updates parameters of a memset task to a fill task
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto copy(T* tgt, const T* src, size_t num) -> cudaTask
- creates a memcpy task that copies typed data
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>void copy(cudaTask task, T* tgt, const T* src, size_t num)
- updates parameters of a memcpy task to a copy task
-
template<typename P>void offload_until(P&& predicate)
- offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
- void offload_n(size_t N)
- offloads the cudaFlow and executes it the given number of times
- void offload()
- offloads the cudaFlow and executes it once
-
template<typename C>auto single_task(C c) -> cudaTask
- runs a callable with only a single kernel thread
-
template<typename C>void single_task(cudaTask task, C c)
- updates a single-threaded kernel task
-
template<typename I, typename C>auto for_each(I first, I last, C callable) -> cudaTask
- applies a callable to each dereferenced element of the data array
-
template<typename I, typename C>void for_each(cudaTask task, I first, I last, C callable)
- updates parameters of a kernel task created from tf::
cudaFlow:: for_each -
template<typename I, typename C>auto for_each_index(I first, I last, I step, C callable) -> cudaTask
- applies a callable to each index in the range with the step size
-
template<typename I, typename C>void for_each_index(cudaTask task, I first, I last, I step, C callable)
- updates parameters of a kernel task created from tf::
cudaFlow:: for_each_index -
template<typename I, typename O, typename C>auto transform(I first, I last, O output, C op) -> cudaTask
- applies a callable to a source range and stores the result in a target range
-
template<typename I, typename O, typename C>void transform(cudaTask task, I first, I last, O output, C c)
- updates parameters of a kernel task created from tf::
cudaFlow:: transform -
template<typename I1, typename I2, typename O, typename C>auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
- creates a task to perform parallel transforms over two ranges of items
-
template<typename I1, typename I2, typename O, typename C>void transform(cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c)
- updates parameters of a kernel task created from tf::
cudaFlow:: transform -
template<typename I, typename T, typename B>auto reduce(I first, I last, T* result, B bop) -> cudaTask
- performs parallel reduction over a range of items
-
template<typename I, typename T, typename C>void reduce(cudaTask task, I first, I last, T* result, C op)
- updates parameters of a kernel task created from tf::
cudaFlow:: reduce -
template<typename I, typename T, typename B>auto uninitialized_reduce(I first, I last, T* result, B bop) -> cudaTask
- similar to tf::
cudaFlow:: reduce but does not assume any initial value to reduce -
template<typename I, typename T, typename C>void uninitialized_reduce(cudaTask task, I first, I last, T* result, C op)
- updates parameters of a kernel task created from tf::
cudaFlow:: uninitialized_reduce -
template<typename I, typename T, typename B, typename U>auto transform_reduce(I first, I last, T* result, B bop, U uop) -> cudaTask
- performs parallel reduction over a range of transformed items
-
template<typename I, typename T, typename B, typename U>void transform_reduce(cudaTask, I first, I last, T* result, B bop, U uop)
- updates parameters of a kernel task created from tf::
cudaFlow:: transform_reduce -
template<typename I, typename T, typename B, typename U>auto transform_uninitialized_reduce(I first, I last, T* result, B bop, U uop) -> cudaTask
- similar to tf::
cudaFlow:: transform_reduce but does not assume any initial value to reduce -
template<typename I, typename T, typename B, typename U>void transform_uninitialized_reduce(cudaTask task, I first, I last, T* result, B bop, U uop)
- updates parameters of a kernel task created from tf::
cudaFlow:: transform_uninitialized_reduce -
template<typename I, typename O, typename C>auto inclusive_scan(I first, I last, O output, C op) -> cudaTask
- creates a task to perform parallel inclusive scan over a range of items
-
template<typename I, typename O, typename C>void inclusive_scan(cudaTask task, I first, I last, O output, C op)
- updates the parameters of a task created from tf::
cudaFlow:: inclusive_scan -
template<typename I, typename O, typename C>auto exclusive_scan(I first, I last, O output, C op) -> cudaTask
- similar to cudaFlow::
inclusive_scan but excludes the first value -
template<typename I, typename O, typename C>void exclusive_scan(cudaTask task, I first, I last, O output, C op)
- updates the parameters of a task created from tf::
cudaFlow:: exclusive_scan -
template<typename I, typename O, typename B, typename U>auto transform_inclusive_scan(I first, I last, O output, B bop, U uop) -> cudaTask
- creates a task to perform parallel inclusive scan over a range of transformed items
-
template<typename I, typename O, typename B, typename U>void transform_inclusive_scan(cudaTask task, I first, I last, O output, B bop, U uop)
- updates the parameters of a task created from tf::
cudaFlow:: transform_inclusive_scan -
template<typename I, typename O, typename B, typename U>auto transform_exclusive_scan(I first, I last, O output, B bop, U uop) -> cudaTask
- similar to cudaFlow::
transform_inclusive_scan but excludes the first value -
template<typename I, typename O, typename B, typename U>void transform_exclusive_scan(cudaTask task, I first, I last, O output, B bop, U uop)
- updates the parameters of a task created from tf::
cudaFlow:: transform_exclusive_scan -
template<typename A, typename B, typename C, typename Comp>auto merge(A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp) -> cudaTask
- creates a task to perform parallel merge on two sorted arrays
-
template<typename A, typename B, typename C, typename Comp>void merge(cudaTask task, A a_first, A a_last, B b_first, B b_last, C c_first, Comp comp)
- updates the parameters of a task created from tf::
cudaFlow:: merge -
template<typename I, typename C>auto sort(I first, I last, C comp) -> cudaTask
- creates a task to perform parallel sort on an array
-
template<typename I, typename C>void sort(cudaTask task, I first, I last, C comp)
- updates the parameters of the task created from tf::
cudaFlow:: sort -
template<typename K_it, typename V_it, typename C>auto sort_by_key(K_it k_first, K_it k_last, V_it v_first, C comp) -> cudaTask
- creates kernels that sort the given array
-
template<typename K_it, typename V_it, typename C>void sort_by_key(cudaTask task, K_it k_first, K_it k_last, V_it v_first, C comp)
- updates the parameters of a task created from tf::
cudaFlow:: sort_by_key -
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>auto merge_by_key(a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp) -> cudaTask
- creates a task to perform parallel key-value merge
-
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>void merge_by_key(cudaTask task, a_keys_it a_keys_first, a_keys_it a_keys_last, a_vals_it a_vals_first, b_keys_it b_keys_first, b_keys_it b_keys_last, b_vals_it b_vals_first, c_keys_it c_keys_first, c_vals_it c_vals_first, C comp)
- updates the parameters of a task created from tf::
cudaFlow:: merge_by_key -
template<typename I, typename U>auto find_if(I first, I last, unsigned* idx, U op) -> cudaTask
- creates a task to find the index of the first element in a range
-
template<typename I, typename U>void find_if(cudaTask task, I first, I last, unsigned* idx, U op)
- updates the parameters of the task created from tf::
cudaFlow:: find_if -
template<typename I, typename O>auto min_element(I first, I last, unsigned* idx, O op) -> cudaTask
- finds the index of the minimum element in a range
-
template<typename I, typename O>void min_element(cudaTask task, I first, I last, unsigned* idx, O op)
- updates the parameters of the task created from tf::
cudaFlow:: min_element -
template<typename I, typename O>auto max_element(I first, I last, unsigned* idx, O op) -> cudaTask
- finds the index of the maximum element in a range
-
template<typename I, typename O>void max_element(cudaTask task, I first, I last, unsigned* idx, O op)
- updates the parameters of the task created from tf::
cudaFlow:: max_element -
template<typename C>auto capture(C&& callable) -> cudaTask
- constructs a subflow graph through tf::
cudaFlowCapturer -
template<typename C>void capture(cudaTask task, C callable)
- updates the captured child graph
Function documentation
tf:: cudaFlow:: cudaFlow()
constructs a standalone cudaFlow
A standalone cudaFlow does not go through any taskflow and can be run by the caller thread using explicit offload methods (e.g., tf::cudaFlow::offload).
void tf:: cudaFlow:: dump_native_graph(std:: ostream& os) const
dumps the native CUDA graph into a DOT format through an output stream
The native CUDA graph may be different from the upper-level cudaFlow graph when flow capture is involved.
cudaTask tf:: cudaFlow:: noop()
creates a no-operation task
Returns | a tf::cudaTask handle |
---|
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n
nodes with a barrier between them can be represented using an empty node and 2*n
dependency edges, rather than no empty node and n^2
dependency edges.
template<typename C>
cudaTask tf:: cudaFlow:: host(C&& callable)
creates a host task that runs a callable on the host
Template parameters | |
---|---|
C | callable type |
Parameters | |
callable | a callable object with neither arguments nor return (i.e., constructible from std::function<void()> ) |
Returns | a tf::cudaTask handle |
A host task can only execute CPU-specific functions and cannot do any CUDA calls (e.g., cudaMalloc
).
template<typename C>
void tf:: cudaFlow:: host(cudaTask task,
C&& callable)
updates parameters of a host task
The method is similar to tf::cudaFlow::host but operates on an existing host task.
template<typename F, typename... ArgsT>
cudaTask tf:: cudaFlow:: kernel(dim3 g,
dim3 b,
size_t s,
F f,
ArgsT && ... args)
creates a kernel task
Template parameters | |
---|---|
F | kernel function type |
ArgsT | kernel function parameters type |
Parameters | |
g | configured grid |
b | configured block |
s | configured shared memory size in bytes |
f | kernel function |
args | arguments to forward to the kernel function by copy |
Returns | a tf::cudaTask handle |
template<typename F, typename... ArgsT>
void tf:: cudaFlow:: kernel(cudaTask task,
dim3 g,
dim3 b,
size_t shm,
F f,
ArgsT && ... args)
updates parameters of a kernel task
The method is similar to tf::cudaFlow::kernel but operates on an existing kernel task.
cudaTask tf:: cudaFlow:: memset(void* dst,
int v,
size_t count)
creates a memset task that fills untyped data with a byte value
Parameters | |
---|---|
dst | pointer to the destination device memory area |
v | value to set for each byte of specified memory |
count | size in bytes to set |
Returns | a tf::cudaTask handle |
A memset task fills the first count
bytes of device memory area pointed by dst
with the byte value v
.
void tf:: cudaFlow:: memset(cudaTask task,
void* dst,
int ch,
size_t count)
updates parameters of a memset task
The method is similar to tf::cudaFlow::memset but operates on an existing memset task.
cudaTask tf:: cudaFlow:: memcpy(void* tgt,
const void* src,
size_t bytes)
creates a memcpy task that copies untyped data in bytes
Parameters | |
---|---|
tgt | pointer to the target memory block |
src | pointer to the source memory block |
bytes | bytes to copy |
Returns | a tf::cudaTask handle |
A memcpy task transfers bytes
of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
void tf:: cudaFlow:: memcpy(cudaTask task,
void* tgt,
const void* src,
size_t bytes)
updates parameters of a memcpy task
The method is similar to tf::cudaFlow::memcpy but operates on an existing memcpy task.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaFlow:: zero(T* dst,
size_t count)
creates a memset task that sets a typed memory block to zero
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
count | number of elements |
Returns | a tf::cudaTask handle |
A zero task zeroes the first count
elements of type T
in a device memory area pointed by dst
.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf:: cudaFlow:: zero(cudaTask task,
T* dst,
size_t count)
updates parameters of a memset task to a zero task
The method is similar to tf::cudaFlow::zero but operates on an existing memset task.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
cudaTask tf:: cudaFlow:: fill(T* dst,
T value,
size_t count)
creates a memset task that fills a typed memory block with a value
Template parameters | |
---|---|
T | element type (size of T must be either 1, 2, or 4) |
Parameters | |
dst | pointer to the destination device memory area |
value | value to fill for each element of type T |
count | number of elements |
Returns | a tf::cudaTask handle |
A fill task fills the first count
elements of type T
with value
in a device memory area pointed by dst
. The value to fill is interpreted in type T
rather than byte.
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
void tf:: cudaFlow:: fill(cudaTask task,
T* dst,
T value,
size_t count)
updates parameters of a memset task to a fill task
The method is similar to tf::cudaFlow::fill but operates on an existing memset task.
The source/destination memory may have different address values but must be allocated from the same contexts as the original source/destination memory.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cudaFlow:: copy(T* tgt,
const T* src,
size_t num)
creates a memcpy task that copies typed data
Template parameters | |
---|---|
T | element type (non-void) |
Parameters | |
tgt | pointer to the target memory block |
src | pointer to the source memory block |
num | number of elements to copy |
Returns | a tf::cudaTask handle |
A copy task transfers num*sizeof(T)
bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
void tf:: cudaFlow:: copy(cudaTask task,
T* tgt,
const T* src,
size_t num)
updates parameters of a memcpy task to a copy task
The method is similar to tf::cudaFlow::copy but operates on an existing memcpy task.
template<typename P>
void tf:: cudaFlow:: offload_until(P&& predicate)
offloads the cudaFlow onto a GPU and repeatedly runs it until the predicate becomes true
Template parameters | |
---|---|
P | predicate type (a callable that returns a boolean) |
Parameters | |
predicate | a predicate that returns a boolean (returns true for stop) |
Immediately offloads the present cudaFlow onto a GPU and repeatedly runs it until the predicate returns true
.
An offloaded cudaFlow forces the underlying graph to be instantiated. After the instantiation, you should not modify the graph topology; you may only update node parameters.
By default, if users do not offload the cudaFlow, the executor will offload it once.
void tf:: cudaFlow:: offload_n(size_t N)
offloads the cudaFlow and executes it the given number of times
Parameters | |
---|---|
N | number of executions |
template<typename C>
cudaTask tf:: cudaFlow:: single_task(C c)
runs a callable with only a single kernel thread
Template parameters | |
---|---|
C | callable type |
Parameters | |
c | callable to run by a single kernel thread |
Returns | a tf::cudaTask handle |
template<typename C>
void tf:: cudaFlow:: single_task(cudaTask task,
C c)
updates a single-threaded kernel task
This method is similar to cudaFlow::single_task but operates on an existing task.
template<typename I, typename C>
cudaTask tf:: cudaFlow:: for_each(I first,
I last,
C callable)
applies a callable to each dereferenced element of the data array
Template parameters | |
---|---|
I | iterator type |
C | callable type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
callable | a callable object to apply to the dereferenced iterator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(auto itr = first; itr != last; itr++) { callable(*itr); }
template<typename I, typename C>
void tf:: cudaFlow:: for_each(cudaTask task,
I first,
I last,
C callable)
updates parameters of a kernel task created from tf::cudaFlow::for_each
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each.
template<typename I, typename C>
cudaTask tf:: cudaFlow:: for_each_index(I first,
I last,
I step,
C callable)
applies a callable to each index in the range with the step size
Template parameters | |
---|---|
I | index type |
C | callable type |
Parameters | |
first | beginning index |
last | last index |
step | step size |
callable | the callable to apply to each index in the range |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
// step is positive [first, last)
for(auto i=first; i<last; i+=step) {
  callable(i);
}
// step is negative [first, last)
for(auto i=first; i>last; i+=step) {
  callable(i);
}
template<typename I, typename C>
void tf:: cudaFlow:: for_each_index(cudaTask task,
I first,
I last,
I step,
C callable)
updates parameters of a kernel task created from tf::cudaFlow::for_each_index
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::for_each_index.
template<typename I, typename O, typename C>
cudaTask tf:: cudaFlow:: transform(I first,
I last,
O output,
C op)
applies a callable to a source range and stores the result in a target range
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
C | unary operator type |
Parameters | |
first | iterator to the beginning of the input range |
last | iterator to the end of the input range |
output | iterator to the beginning of the output range |
op | the operator to apply to transform each element in the range |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *output++ = op(*first++); }
template<typename I, typename O, typename C>
void tf:: cudaFlow:: transform(cudaTask task,
I first,
I last,
O output,
C c)
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
template<typename I1, typename I2, typename O, typename C>
cudaTask tf:: cudaFlow:: transform(I1 first1,
I1 last1,
I2 first2,
O output,
C op)
creates a task to perform parallel transforms over two ranges of items
Template parameters | |
---|---|
I1 | first input iterator type |
I2 | second input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
first1 | iterator to the beginning of the input range |
last1 | iterator to the end of the input range |
first2 | iterator to the beginning of the second input range |
output | iterator to the beginning of the output range |
op | binary operator to apply to transform each pair of items in the two input ranges |
Returns | cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first1 != last1) { *output++ = op(*first1++, *first2++); }
template<typename I1, typename I2, typename O, typename C>
void tf:: cudaFlow:: transform(cudaTask task,
I1 first1,
I1 last1,
I2 first2,
O output,
C c)
updates parameters of a kernel task created from tf::cudaFlow::transform
The type of the iterators and the callable must be the same as the task created from tf::cudaFlow::transform.
template<typename I, typename T, typename B>
cudaTask tf:: cudaFlow:: reduce(I first,
I last,
T* result,
B bop)
performs parallel reduction over a range of items
Template parameters | |
---|---|
I | input iterator type |
T | value type |
B | binary operator type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
result | pointer to the result with an initialized value |
bop | binary operator to apply to reduce items |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = bop(*result, *first++); }
template<typename I, typename T, typename C>
void tf:: cudaFlow:: reduce(cudaTask task,
I first,
I last,
T* result,
C op)
updates parameters of a kernel task created from tf::cudaFlow::reduce
The type of the iterators, result, and callable must be the same as the task created from tf::cudaFlow::reduce.
template<typename I, typename T, typename B>
cudaTask tf:: cudaFlow:: uninitialized_reduce(I first,
I last,
T* result,
B bop)
similar to tf::cudaFlow::reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = *first++;  // no initial values participate in the loop
while (first != last) {
  *result = bop(*result, *first++);
}
template<typename I, typename T, typename C>
void tf:: cudaFlow:: uninitialized_reduce(cudaTask task,
I first,
I last,
T* result,
C op)
updates parameters of a kernel task created from tf::cudaFlow::uninitialized_reduce
The type of the iterators, result, and callable must be the same as the task created from tf::cudaFlow::uninitialized_reduce.
template<typename I, typename T, typename B, typename U>
cudaTask tf:: cudaFlow:: transform_reduce(I first,
I last,
T* result,
B bop,
U uop)
performs parallel reduction over a range of transformed items
Template parameters | |
---|---|
I | input iterator type |
T | value type |
B | binary operator type |
U | unary operator type |
Parameters | |
first | iterator to the beginning (inclusive) |
last | iterator to the end (exclusive) |
result | pointer to the result with an initialized value |
bop | binary operator to apply to reduce items |
uop | unary operator to transform each item before reduction |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
while (first != last) { *result = bop(*result, uop(*first++)); }
template<typename I, typename T, typename B, typename U>
cudaTask tf:: cudaFlow:: transform_uninitialized_reduce(I first,
I last,
T* result,
B bop,
U uop)
similar to tf::cudaFlow::transform_reduce but does not assume any initial value to reduce
This method is equivalent to the parallel execution of the following loop on a GPU:
*result = uop(*first++);  // no initial values participate in the loop
while (first != last) {
  *result = bop(*result, uop(*first++));
}
template<typename I, typename O, typename C>
cudaTask tf:: cudaFlow:: inclusive_scan(I first,
I last,
O output,
C op)
creates a task to perform parallel inclusive scan over a range of items
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
C | binary operator type |
Parameters | |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
op | binary operator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(size_t i=0; i<std::distance(first, last); i++) { *(output + i) = i ? op(*(first+i), *(output+i-1)) : *(first+i); }
template<typename I, typename O, typename C>
void tf:: cudaFlow:: inclusive_scan(cudaTask task,
I first,
I last,
O output,
C op)
updates the parameters of a task created from tf::cudaFlow::inclusive_scan
This method is similar to tf::cudaFlow::inclusive_scan but operates on an existing task.
template<typename I, typename O, typename C>
void tf:: cudaFlow:: exclusive_scan(cudaTask task,
I first,
I last,
O output,
C op)
updates the parameters of a task created from tf::cudaFlow::exclusive_scan
This method is similar to tf::cudaFlow::exclusive_scan but operates on an existing task.
template<typename I, typename O, typename B, typename U>
cudaTask tf:: cudaFlow:: transform_inclusive_scan(I first,
I last,
O output,
B bop,
U uop)
creates a task to perform parallel inclusive scan over a range of transformed items
Template parameters | |
---|---|
I | input iterator type |
O | output iterator type |
B | binary operator type |
U | unary operator type |
Parameters | |
first | iterator to the beginning |
last | iterator to the end |
output | iterator to the beginning of the output |
bop | binary operator |
uop | unary operator |
Returns | a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
for(size_t i=0; i<std::distance(first, last); i++) { *(output + i) = i ? bop(uop(*(first+i)), *(output+i-1)) : uop(*(first+i)); }
template<typename I, typename O, typename B, typename U>
void tf:: cudaFlow:: transform_inclusive_scan(cudaTask task,
I first,
I last,
O output,
B bop,
U uop)
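updates the parameters of a task created from tf::cudaFlow::transform_inclusive_scan
This method is similar to tf::cudaFlow::transform_inclusive_scan but operates on an existing task.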
template<typename I, typename O, typename B, typename U>
void tf:: cudaFlow:: transform_exclusive_scan(cudaTask task,
I first,
I last,
O output,
B bop,
U uop)
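updates the parameters of a task created from tf::cudaFlow::transform_exclusive_scan
This method is similar to tf::cudaFlow::transform_exclusive_scan but operates on an existing task.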
template<typename A, typename B, typename C, typename Comp>
cudaTask tf:: cudaFlow:: merge(A a_first,
A a_last,
B b_first,
B b_last,
C c_first,
Comp comp)
creates a task to perform parallel merge on two sorted arrays
Template parameters | |
---|---|
A | iterator type of the first input array |
B | iterator type of the second input array |
C | iterator type of the output array |
Comp | comparator type |
Parameters | |
a_first | iterator to the beginning of the first input array |
a_last | iterator to the end of the first input array |
b_first | iterator to the beginning of the second input array |
b_last | iterator to the end of the second input array |
c_first | iterator to the beginning of the output array |
comp | binary comparator |
Returns | a tf::cudaTask handle |
Merges two sorted ranges [a_first, a_last)
and [b_first, b_last)
into one sorted range beginning at c_first
.
A sequence is said to be sorted with respect to a comparator comp
if for any iterator it pointing to the sequence and any non-negative integer n
such that it + n
is a valid iterator pointing to an element of the sequence, comp(*(it + n), *it)
evaluates to false.
template<typename A, typename B, typename C, typename Comp>
void tf:: cudaFlow:: merge(cudaTask task,
A a_first,
A a_last,
B b_first,
B b_last,
C c_first,
Comp comp)
updates the parameters of a task created from tf::cudaFlow::merge
This method is similar to tf::cudaFlow::merge but operates on an existing task.
template<typename I, typename C>
cudaTask tf:: cudaFlow:: sort(I first,
I last,
C comp)
creates a task to perform parallel sort on an array
Template parameters | |
---|---|
I | iterator type of the input array |
C | comparator type |
Parameters | |
first | iterator to the beginning of the input array |
last | iterator to the end of the input array |
comp | binary comparator |
Returns | a tf::cudaTask handle |
Sorts elements in the range [first, last)
with the given comparator comp
.
template<typename I, typename C>
void tf:: cudaFlow:: sort(cudaTask task,
I first,
I last,
C comp)
updates the parameters of the task created from tf::cudaFlow::sort
This method is similar to tf::cudaFlow::sort but operates on an existing task.
template<typename K_it, typename V_it, typename C>
cudaTask tf:: cudaFlow:: sort_by_key(K_it k_first,
K_it k_last,
V_it v_first,
C comp)
creates kernels that sort the given array
Template parameters | |
---|---|
K_it | iterator type of the key |
V_it | iterator type of the value |
C | comparator type |
Parameters | |
k_first | iterator to the beginning of the key array |
k_last | iterator to the end of the key array |
v_first | iterator to the beginning of the value array |
comp | binary comparator |
Returns | a tf::cudaTask handle |
Sorts key-value elements in [k_first, k_last)
and [v_first, v_first + (k_last - k_first))
into ascending key order using the given comparator comp
. If i
and j
are any two valid iterators in [k_first, k_last)
such that i
precedes j
, and p
and q
are iterators in [v_first, v_first + (k_last - k_first))
corresponding to i
and j
respectively, then comp(*j, *i)
evaluates to false
.
For example, assume:
keys are {1, 4, 2, 8, 5, 7}
values are {'a', 'b', 'c', 'd', 'e', 'f'}
After sort:
keys are {1, 2, 4, 5, 7, 8}
values are {'a', 'c', 'b', 'e', 'f', 'd'}
template<typename K_it, typename V_it, typename C>
void tf:: cudaFlow:: sort_by_key(cudaTask task,
K_it k_first,
K_it k_last,
V_it v_first,
C comp)
updates the parameters of a task created from tf::cudaFlow::sort_by_key
This method is similar to tf::cudaFlow::sort_by_key but operates on an existing task.
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
cudaTask tf:: cudaFlow:: merge_by_key(a_keys_it a_keys_first,
a_keys_it a_keys_last,
a_vals_it a_vals_first,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
b_vals_it b_vals_first,
c_keys_it c_keys_first,
c_vals_it c_vals_first,
C comp)
creates a task to perform parallel key-value merge
Template parameters | |
---|---|
a_keys_it | first key iterator type |
a_vals_it | first value iterator type |
b_keys_it | second key iterator type |
b_vals_it | second value iterator type |
c_keys_it | output key iterator type |
c_vals_it | output value iterator type |
C | comparator type |
Parameters | |
a_keys_first | iterator to the beginning of the first key range |
a_keys_last | iterator to the end of the first key range |
a_vals_first | iterator to the beginning of the first value range |
b_keys_first | iterator to the beginning of the second key range |
b_keys_last | iterator to the end of the second key range |
b_vals_first | iterator to the beginning of the second value range |
c_keys_first | iterator to the beginning of the output key range |
c_vals_first | iterator to the beginning of the output value range |
comp | comparator |
Performs a key-value merge that copies elements from [a_keys_first, a_keys_last)
and [b_keys_first, b_keys_last)
into a single range, [c_keys_first, c_keys_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending key order.
At the same time, the merge copies elements from the two associated ranges [a_vals_first, a_vals_first + (a_keys_last - a_keys_first))
and [b_vals_first, b_vals_first + (b_keys_last - b_keys_first))
into a single range, [c_vals_first, c_vals_first + (a_keys_last - a_keys_first) + (b_keys_last - b_keys_first))
such that the resulting range is in ascending order implied by each input element's associated key.
For example, assume:
a_keys
={1, 8}
a_vals
={2, 1}
b_keys
={3, 7}
b_vals
={3, 4}
After the merge, we have:
c_keys
={1, 3, 7, 8}
c_vals
={2, 3, 4, 1}
template<typename a_keys_it, typename a_vals_it, typename b_keys_it, typename b_vals_it, typename c_keys_it, typename c_vals_it, typename C>
void tf:: cudaFlow:: merge_by_key(cudaTask task,
a_keys_it a_keys_first,
a_keys_it a_keys_last,
a_vals_it a_vals_first,
b_keys_it b_keys_first,
b_keys_it b_keys_last,
b_vals_it b_vals_first,
c_keys_it c_keys_first,
c_vals_it c_vals_first,
C comp)
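updates the parameters of a task created from tf::cudaFlow::merge_by_key
This method is similar to tf::cudaFlow::merge_by_key, but it operates on an existing task instead of creating a new one.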
template<typename I, typename U>
cudaTask tf:: cudaFlow:: find_if(I first,
I last,
unsigned* idx,
U op)
creates a task to find the index of the first element in a range
Template parameters | |
---|---|
I | input iterator type |
U | unary operator type |
Parameters | |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | pointer to the index of the found element |
op | unary operator which returns true for the required element |
Finds the index idx
of the first element in the range [first, last)
such that op(*(first+idx))
is true. This is equivalent to the parallel execution of the following loop:
unsigned idx = 0; for(; first != last; ++first, ++idx) { if (op(*first)) { return idx; } } return idx;
template<typename I, typename O>
cudaTask tf:: cudaFlow:: min_element(I first,
I last,
unsigned* idx,
O op)
finds the index of the minimum element in a range
Template parameters | |
---|---|
I | input iterator type |
O | comparator type |
Parameters | |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the minimum element |
op | comparison function object |
The function launches kernels asynchronously to find the smallest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
if(first == last) { return 0; } auto smallest = first; for (auto itr = std::next(first); itr != last; ++itr) { if (op(*itr, *smallest)) { smallest = itr; } } return std::distance(first, smallest);
template<typename I, typename O>
cudaTask tf:: cudaFlow:: max_element(I first,
I last,
unsigned* idx,
O op)
finds the index of the maximum element in a range
Template parameters | |
---|---|
I | input iterator type |
O | comparator type |
Parameters | |
first | iterator to the beginning of the range |
last | iterator to the end of the range |
idx | solution index of the maximum element |
op | comparison function object |
The function launches kernels asynchronously to find the largest element in the range [first, last)
using the given comparator op
. The function is equivalent to a parallel execution of the following loop:
if(first == last) { return 0; } auto largest = first; for (auto itr = std::next(first); itr != last; ++itr) { if (op(*largest, *itr)) { largest = itr; } } return std::distance(first, largest);
template<typename C>
cudaTask tf:: cudaFlow:: capture(C&& callable)
constructs a subflow graph through tf::cudaFlowCapturer
Template parameters | |
---|---|
C | callable type constructible from std::function<void(tf::cudaFlowCapturer&)> |
Parameters | |
callable | the callable to construct a capture flow |
Returns | a tf::cudaTask handle |
A captured subflow forms a sub-graph to the cudaFlow and can be used to capture custom (or third-party) kernels that cannot be directly constructed from the cudaFlow.
Example usage:
taskflow.emplace([&](tf::cudaFlow& cf){ tf::cudaTask my_kernel = cf.kernel(my_arguments); // create a flow capturer to capture custom kernels tf::cudaTask my_subflow = cf.capture([&](tf::cudaFlowCapturer& capturer){ capturer.on([&](cudaStream_t stream){ invoke_custom_kernel_with_stream(stream, custom_arguments); }); }); my_kernel.precede(my_subflow); });
template<typename C>
void tf:: cudaFlow:: capture(cudaTask task,
C callable)
updates the captured child graph
The method is similar to tf::