Parallel Merge
Contents
cudaFlow provides template methods to create parallel merge tasks on a CUDA GPU.
Include the Header
You need to include the header file, taskflow/cuda/algorithm/merge.hpp, to create a parallel-merge task.
Merge two Sorted Ranges of Items
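tf::cudaFlow::merge merges two sorted ranges of items into a single sorted range. The following code merges two sorted arrays, input_1 and input_2, each of 1000 items, into a sorted array output of 2000 items.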
const size_t N = 1000; int* input_1 = tf::cuda_malloc_shared<int>(N); // input vector 1 int* input_2 = tf::cuda_malloc_shared<int>(N); // input vector 2 int* output = tf::cuda_malloc_shared<int>(2*N); // output vector // initializes the data for(size_t i=0; i<N; i++) { input_1[i] = rand()%100; input_2[i] = rand()%100; } std::sort(input_1, input1 + N); std::sort(input_2, input2 + N); // merge input_1 and input_2 to output tf::cudaFlow cf; tf::cudaTask task = cf.merge( input_1, input_1 + N, input_2, input_2 + N, output, []__device__ (int a, int b) { return a < b; } // comparator ); cf.offload();
Merge two Sorted Ranges of Key-Value Items
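tf::cudaFlow::merge_by_key performs a key-value merge over two sorted ranges: the keys are merged using the given comparator, and each value is written to the output alongside its key. The following code performs a key-value merge over a and b: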
const size_t N = 2; int* a_keys = tf::cuda_malloc_shared<int>(N); int* a_vals = tf::cuda_malloc_shared<int>(N); int* b_keys = tf::cuda_malloc_shared<int>(N); int* b_vals = tf::cuda_malloc_shared<int>(N); int* c_keys = tf::cuda_malloc_shared<int>(2*N); int* c_vals = tf::cuda_malloc_shared<int>(2*N); // initializes the data a_keys[0] = 8, a_keys[1] = 1; a_vals[0] = 1, a_vals[1] = 2; b_keys[0] = 3, b_keys[1] = 7; b_vals[0] = 3, b_vals[1] = 4; // performs key-value merge tf::cudaFlow cf; cf.merge_by_key( a_keys, a_keys+N, a_vals, b_keys, b_keys+N, b_vals, c_keys, c_vals, [] __device__ (int a, int b) { return a < b; }, ); cf.offload(); // now, c_keys = {1, 3, 7, 8} // now, c_vals = {2, 3, 4, 1} // delete the device memory tf::cuda_free(buffer); tf::cuda_free(a_keys); tf::cuda_free(b_keys); tf::cuda_free(c_keys); tf::cuda_free(a_vals); tf::cuda_free(b_vals); tf::cuda_free(c_vals);
Miscellaneous Items
Parallel merge algorithms are also available in tf::cudaFlowCapturer with the same API.