Skip to content

Commit

Permalink
hide thrust::(host|device)_vector
Browse files Browse the repository at this point in the history
  • Loading branch information
koide3 committed Jul 22, 2024
1 parent 411cb14 commit 079a637
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 29 deletions.
40 changes: 26 additions & 14 deletions include/gtsam_points/cuda/nonlinear_factor_set_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#include <memory>
#include <vector>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include <boost/utility/in_place_factory.hpp>
#include <boost/utility/typed_in_place_factory.hpp>

#include <gtsam/nonlinear/NonlinearFactorGraph.h>
#include <gtsam_points/factors/nonlinear_factor_gpu.hpp>
#include <gtsam_points/optimizers/linearization_hook.hpp>
Expand Down Expand Up @@ -88,6 +82,24 @@ class NonlinearFactorSetGPU : public NonlinearFactorSet {
*/
std::vector<gtsam::GaussianFactor::shared_ptr> calc_linear_factors(const gtsam::Values& linearization_point) override;

private:
/// @brief Simple buffer class for device memory
struct DeviceBuffer {
DeviceBuffer();
~DeviceBuffer();

DeviceBuffer(const DeviceBuffer&) = delete;
DeviceBuffer& operator=(const DeviceBuffer&) = delete;

void resize(size_t size, CUstream_st* stream);
unsigned char* data() { return buffer; }
const unsigned char* data() const { return buffer; }

size_t size;
unsigned char* buffer;
};


private:
CUstream_st* stream;

Expand All @@ -96,15 +108,15 @@ class NonlinearFactorSetGPU : public NonlinearFactorSet {

std::vector<boost::shared_ptr<NonlinearFactorGPU>> factors;

thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_input_buffer_cpu;
thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_output_buffer_cpu;
thrust::device_vector<unsigned char> linearization_input_buffer_gpu;
thrust::device_vector<unsigned char> linearization_output_buffer_gpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_input_buffer_cpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> linearization_output_buffer_cpu;
std::unique_ptr<DeviceBuffer> linearization_input_buffer_gpu;
std::unique_ptr<DeviceBuffer> linearization_output_buffer_gpu;

thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_input_buffer_cpu;
thrust::host_vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_output_buffer_cpu;
thrust::device_vector<unsigned char> evaluation_input_buffer_gpu;
thrust::device_vector<unsigned char> evaluation_output_buffer_gpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_input_buffer_cpu;
std::vector<unsigned char, Eigen::aligned_allocator<unsigned char>> evaluation_output_buffer_cpu;
std::unique_ptr<DeviceBuffer> evaluation_input_buffer_gpu;
std::unique_ptr<DeviceBuffer> evaluation_output_buffer_gpu;
};

} // namespace gtsam_points
53 changes: 38 additions & 15 deletions src/gtsam_points/cuda/nonlinear_factor_set_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,31 @@

namespace gtsam_points {

NonlinearFactorSetGPU::DeviceBuffer::DeviceBuffer() : size(0), buffer(nullptr) {}

NonlinearFactorSetGPU::DeviceBuffer::~DeviceBuffer() {
if(buffer) {
check_error << cudaFreeAsync(buffer, 0);
}
}

void NonlinearFactorSetGPU::DeviceBuffer::resize(size_t size, CUstream_st* stream) {
if(this->size < size) {
if(buffer) {
check_error << cudaFreeAsync(buffer, stream);
}
check_error << cudaMallocAsync(&buffer, size, stream);
this->size = size;
}
}

NonlinearFactorSetGPU::NonlinearFactorSetGPU() {
check_error << cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

linearization_input_buffer_gpu.reset(new DeviceBuffer);
linearization_output_buffer_gpu.reset(new DeviceBuffer);
evaluation_input_buffer_gpu.reset(new DeviceBuffer);
evaluation_output_buffer_gpu.reset(new DeviceBuffer);
}

NonlinearFactorSetGPU::~NonlinearFactorSetGPU() {
Expand Down Expand Up @@ -52,9 +75,9 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)
}

linearization_input_buffer_cpu.resize(input_buffer_size);
linearization_input_buffer_gpu.resize(input_buffer_size);
linearization_input_buffer_gpu->resize(input_buffer_size, stream);
linearization_output_buffer_cpu.resize(output_buffer_size);
linearization_output_buffer_gpu.resize(output_buffer_size);
linearization_output_buffer_gpu->resize(output_buffer_size, stream);

// set linearization point
size_t input_cursor = 0;
Expand All @@ -67,7 +90,7 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)

// copy input buffer from cpu to gpu
check_error << cudaMemcpyAsync(
thrust::raw_pointer_cast(linearization_input_buffer_gpu.data()),
linearization_input_buffer_gpu->data(),
linearization_input_buffer_cpu.data(),
input_buffer_size,
cudaMemcpyHostToDevice,
Expand All @@ -79,9 +102,9 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)
output_cursor = 0;
for (auto& factor : factors) {
auto input_cpu = linearization_input_buffer_cpu.data() + input_cursor;
auto input_gpu = linearization_input_buffer_gpu.data() + input_cursor;
auto output_gpu = linearization_output_buffer_gpu.data() + output_cursor;
factor->issue_linearize(input_cpu, input_gpu.get(), output_gpu.get());
auto input_gpu = linearization_input_buffer_gpu->data() + input_cursor;
auto output_gpu = linearization_output_buffer_gpu->data() + output_cursor;
factor->issue_linearize(input_cpu, input_gpu, output_gpu);
input_cursor += factor->linearization_input_size();
output_cursor += factor->linearization_output_size();
}
Expand All @@ -94,7 +117,7 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)
// copy output buffer from gpu to cpu
check_error << cudaMemcpyAsync(
linearization_output_buffer_cpu.data(),
thrust::raw_pointer_cast(linearization_output_buffer_gpu.data()),
linearization_output_buffer_gpu->data(),
output_buffer_size,
cudaMemcpyDeviceToHost,
stream);
Expand Down Expand Up @@ -124,9 +147,9 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {
output_buffer_size += factor->evaluation_output_size();
}
evaluation_input_buffer_cpu.resize(input_buffer_size);
evaluation_input_buffer_gpu.resize(input_buffer_size);
evaluation_input_buffer_gpu->resize(input_buffer_size, stream);
evaluation_output_buffer_cpu.resize(output_buffer_size);
evaluation_output_buffer_gpu.resize(output_buffer_size);
evaluation_output_buffer_gpu->resize(output_buffer_size, stream);

// set evaluation point
size_t lin_input_cursor = 0;
Expand All @@ -141,7 +164,7 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {

// copy input buffer from cpu to gpu
check_error << cudaMemcpyAsync(
thrust::raw_pointer_cast(evaluation_input_buffer_gpu.data()),
evaluation_input_buffer_gpu->data(),
evaluation_input_buffer_cpu.data(),
input_buffer_size,
cudaMemcpyHostToDevice,
Expand All @@ -154,12 +177,12 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {
eval_output_cursor = 0;
for (auto& factor : factors) {
auto lin_input_cpu = linearization_input_buffer_cpu.data() + lin_input_cursor;
auto lin_input_gpu = linearization_input_buffer_gpu.data() + lin_input_cursor;
auto lin_input_gpu = linearization_input_buffer_gpu->data() + lin_input_cursor;
auto eval_input_cpu = evaluation_input_buffer_cpu.data() + eval_input_cursor;
auto eval_input_gpu = evaluation_input_buffer_gpu.data() + eval_input_cursor;
auto eval_output_gpu = evaluation_output_buffer_gpu.data() + eval_output_cursor;
auto eval_input_gpu = evaluation_input_buffer_gpu->data() + eval_input_cursor;
auto eval_output_gpu = evaluation_output_buffer_gpu->data() + eval_output_cursor;

factor->issue_compute_error(lin_input_cpu, eval_input_cpu, lin_input_gpu.get(), eval_input_gpu.get(), eval_output_gpu.get());
factor->issue_compute_error(lin_input_cpu, eval_input_cpu, lin_input_gpu, eval_input_gpu, eval_output_gpu);

lin_input_cursor += factor->linearization_input_size();
eval_input_cursor += factor->evaluation_input_size();
Expand All @@ -174,7 +197,7 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {
// copy output buffer from gpu to cpu
check_error << cudaMemcpyAsync(
evaluation_output_buffer_cpu.data(),
thrust::raw_pointer_cast(evaluation_output_buffer_gpu.data()),
evaluation_output_buffer_gpu->data(),
output_buffer_size,
cudaMemcpyDeviceToHost,
stream);
Expand Down

0 comments on commit 079a637

Please sign in to comment.