Skip to content

Commit

Permalink
cuda 12.5
Browse files Browse the repository at this point in the history
  • Loading branch information
koide3 committed Jul 22, 2024
1 parent 17f78e8 commit 411cb14
Show file tree
Hide file tree
Showing 11 changed files with 63 additions and 56 deletions.
3 changes: 2 additions & 1 deletion include/gtsam_points/cuda/kernels/lookup_voxels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <Eigen/Core>
#include <Eigen/Geometry>

#include <thrust/pair.h>
#include <thrust/count.h>
#include <thrust/device_new.h>
#include <thrust/device_vector.h>
Expand Down Expand Up @@ -86,7 +87,7 @@ struct lookup_voxels_kernel {
thrust::device_ptr<const Eigen::Isometry3f> x_ptr;

thrust::device_ptr<const VoxelMapInfo> voxelmap_info_ptr;
thrust::device_ptr<const thrust::pair<Eigen::Vector3i, int>> buckets_ptr;
thrust::device_ptr<const VoxelBucket> buckets_ptr;

thrust::device_ptr<const Eigen::Vector3f> points_ptr;
thrust::device_ptr<const Eigen::Vector3f> normals_ptr;
Expand Down
2 changes: 1 addition & 1 deletion include/gtsam_points/cuda/kernels/vector3_hash.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ inline __host__ __device__ Eigen::Vector3i calc_voxel_coord(const Eigen::Vector3
inline __host__ __device__ int lookup_voxel(
const int max_bucket_scan_count,
const int num_buckets,
const thrust::device_ptr<const thrust::pair<Eigen::Vector3i, int>>& buckets_ptr,
const thrust::device_ptr<const VoxelBucket>& buckets_ptr,
const float resolution,
const Eigen::Vector3f& x) {
Eigen::Vector3i coord = calc_voxel_coord(x, resolution);
Expand Down
24 changes: 12 additions & 12 deletions include/gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,6 @@

#include <gtsam_points/cuda/stream_roundrobin.hpp>

// forward declaration
namespace thrust {

template <typename T>
class device_allocator;

template <typename T, typename Alloc>
class device_vector;

} // namespace thrust

namespace gtsam_points {

/**
Expand All @@ -29,6 +18,17 @@ namespace gtsam_points {
*/
class TempBufferManager {
public:
struct Buffer {
Buffer(size_t size);
~Buffer();

Buffer(const Buffer&) = delete;
Buffer& operator=(const Buffer&) = delete;

size_t size;
char* buffer;
};

using Ptr = std::shared_ptr<TempBufferManager>;

TempBufferManager(size_t init_buffer_size = 0);
Expand All @@ -40,7 +40,7 @@ class TempBufferManager {
void clear_all();

private:
std::vector<std::shared_ptr<thrust::device_vector<char, thrust::device_allocator<char>>>> buffers;
std::vector<std::shared_ptr<Buffer>> buffers;
};

/**
Expand Down
10 changes: 5 additions & 5 deletions include/gtsam_points/factors/integrated_vgicp_factor_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,17 @@ class IntegratedVGICPFactorGPU : public gtsam_points::NonlinearFactorGPU {
virtual void set_linearization_point(const gtsam::Values& values, void* lin_input_cpu) override;
virtual void issue_linearize(
const void* lin_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<void>& lin_output_gpu) override;
const void* lin_input_gpu,
void* lin_output_gpu) override;
virtual void store_linearized(const void* lin_output_cpu) override;

virtual void set_evaluation_point(const gtsam::Values& values, void* eval_input_cpu) override;
virtual void issue_compute_error(
const void* lin_input_cpu,
const void* eval_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<const void>& eval_input_gpu,
const thrust::device_ptr<void>& eval_output_gpu) override;
const void* lin_input_gpu,
const void* eval_input_gpu,
void* eval_output_gpu) override;
virtual void store_computed_error(const void* eval_output_cpu) override;

virtual void sync() override;
Expand Down
14 changes: 4 additions & 10 deletions include/gtsam_points/factors/nonlinear_factor_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,6 @@
#include <boost/utility/typed_in_place_factory.hpp>
#include <gtsam/nonlinear/NonlinearFactor.h>

namespace thrust {

template <typename T>
class device_ptr;
}

namespace gtsam_points {

/**
Expand Down Expand Up @@ -83,7 +77,7 @@ class NonlinearFactorGPU : public gtsam::NonlinearFactor {
* @param lin_output_gpu Output data destination on the GPU memory (size == linearization_output_size)
*/
virtual void
issue_linearize(const void* lin_input_cpu, const thrust::device_ptr<const void>& lin_input_gpu, const thrust::device_ptr<void>& lin_output_gpu) = 0;
issue_linearize(const void* lin_input_cpu, const void* lin_input_gpu, void* lin_output_gpu) = 0;

/**
* @brief Read linearization output data from the download buffer
Expand All @@ -109,9 +103,9 @@ class NonlinearFactorGPU : public gtsam::NonlinearFactor {
virtual void issue_compute_error(
const void* lin_input_cpu,
const void* eval_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<const void>& eval_input_gpu,
const thrust::device_ptr<void>& eval_output_gpu) = 0;
const void* lin_input_gpu,
const void* eval_input_gpu,
void* eval_output_gpu) = 0;

/**
* @brief Read cost evaluation output data from the download buffer
Expand Down
16 changes: 9 additions & 7 deletions include/gtsam_points/types/gaussian_voxelmap_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@
// forward declaration
struct CUstream_st;

namespace thrust {
template <typename T1, typename T2>
class pair;

} // namespace thrust

namespace gtsam_points {

/**
Expand All @@ -29,6 +23,14 @@ struct VoxelMapInfo {
float voxel_resolution; ///< Voxel resolution
};

/**
* @brief Voxel bucket (avoid using thrust::pair for CUDA compatibility)
*/
struct VoxelBucket {
Eigen::Vector3i first;
int second;
};

/**
* @brief Gaussian distribution voxelmap on GPU
*/
Expand Down Expand Up @@ -71,7 +73,7 @@ class GaussianVoxelMapGPU : public GaussianVoxelMap {
VoxelMapInfo voxelmap_info; ///< Voxelmap information
VoxelMapInfo* voxelmap_info_ptr; ///< Voxelmap information on GPU memory

thrust::pair<Eigen::Vector3i, int>* buckets; ///< Voxel buckets for hashing
VoxelBucket* buckets; ///< Voxel buckets for hashing

// voxel data
int* num_points; ///< Number of points in eac voxel
Expand Down
5 changes: 3 additions & 2 deletions src/gtsam_points/cuda/nonlinear_factor_set_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void NonlinearFactorSetGPU::linearize(const gtsam::Values& linearization_point)
auto input_cpu = linearization_input_buffer_cpu.data() + input_cursor;
auto input_gpu = linearization_input_buffer_gpu.data() + input_cursor;
auto output_gpu = linearization_output_buffer_gpu.data() + output_cursor;
factor->issue_linearize(input_cpu, input_gpu, output_gpu);
factor->issue_linearize(input_cpu, input_gpu.get(), output_gpu.get());
input_cursor += factor->linearization_input_size();
output_cursor += factor->linearization_output_size();
}
Expand Down Expand Up @@ -159,7 +159,7 @@ void NonlinearFactorSetGPU::error(const gtsam::Values& values) {
auto eval_input_gpu = evaluation_input_buffer_gpu.data() + eval_input_cursor;
auto eval_output_gpu = evaluation_output_buffer_gpu.data() + eval_output_cursor;

factor->issue_compute_error(lin_input_cpu, eval_input_cpu, lin_input_gpu, eval_input_gpu, eval_output_gpu);
factor->issue_compute_error(lin_input_cpu, eval_input_cpu, lin_input_gpu.get(), eval_input_gpu.get(), eval_output_gpu.get());

lin_input_cursor += factor->linearization_input_size();
eval_input_cursor += factor->evaluation_input_size();
Expand Down Expand Up @@ -201,4 +201,5 @@ std::vector<gtsam::GaussianFactor::shared_ptr> NonlinearFactorSetGPU::calc_linea

return linear_factors;
}

} // namespace gtsam_points
17 changes: 13 additions & 4 deletions src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,33 @@

#include <gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp>

#include <gtsam_points/cuda/check_error.cuh>
#include <thrust/device_vector.h>

namespace gtsam_points {

TempBufferManager::Buffer::Buffer(size_t buffer_size) : size(buffer_size) {
check_error << cudaMallocAsync(&buffer, buffer_size, 0);
}

TempBufferManager::Buffer::~Buffer() {
check_error << cudaFreeAsync(buffer, 0);
}

TempBufferManager::TempBufferManager(size_t init_buffer_size) {
if (init_buffer_size) {
buffers.push_back(std::make_shared<thrust::device_vector<char>>(init_buffer_size));
buffers.emplace_back(new Buffer(init_buffer_size));
}
}

TempBufferManager::~TempBufferManager() {}

char* TempBufferManager::get_buffer(size_t buffer_size) {
if (buffers.empty() || buffers.back()->size() < buffer_size) {
buffers.push_back(std::make_shared<thrust::device_vector<char>>(buffer_size * 1.2));
if (buffers.empty() || buffers.back()->size < buffer_size) {
buffers.emplace_back(new Buffer(buffer_size * 1.2));
}

return thrust::raw_pointer_cast(buffers.back()->data());
return buffers.back()->buffer;
}

void TempBufferManager::clear() {
Expand Down
10 changes: 5 additions & 5 deletions src/gtsam_points/factors/integrated_vgicp_factor_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ void IntegratedVGICPFactorGPU::set_evaluation_point(const gtsam::Values& values,

void IntegratedVGICPFactorGPU::issue_linearize(
const void* lin_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<void>& lin_output_gpu) {
const void* lin_input_gpu,
void* lin_output_gpu) {
auto linearization_point = reinterpret_cast<const Eigen::Isometry3f*>(lin_input_cpu);
auto linearization_point_gpu = thrust::reinterpret_pointer_cast<thrust::device_ptr<const Eigen::Isometry3f>>(lin_input_gpu);
auto linearized_gpu = thrust::reinterpret_pointer_cast<thrust::device_ptr<LinearizedSystem6>>(lin_output_gpu);
Expand All @@ -222,9 +222,9 @@ void IntegratedVGICPFactorGPU::store_linearized(const void* lin_output_cpu) {
void IntegratedVGICPFactorGPU::issue_compute_error(
const void* lin_input_cpu,
const void* eval_input_cpu,
const thrust::device_ptr<const void>& lin_input_gpu,
const thrust::device_ptr<const void>& eval_input_gpu,
const thrust::device_ptr<void>& eval_output_gpu) {
const void* lin_input_gpu,
const void* eval_input_gpu,
void* eval_output_gpu) {
//
auto linearization_point = reinterpret_cast<const Eigen::Isometry3f*>(lin_input_cpu);
auto evaluation_point = reinterpret_cast<const Eigen::Isometry3f*>(eval_input_cpu);
Expand Down
16 changes: 8 additions & 8 deletions src/gtsam_points/types/gaussian_voxelmap_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ struct voxel_bucket_assignment_kernel {
struct voxel_coord_select_kernel {
voxel_coord_select_kernel(const Eigen::Vector3i* point_coords) : point_coords_ptr(point_coords) {}

__device__ thrust::pair<Eigen::Vector3i, int> operator()(const thrust::pair<int, int>& index_bucket) const {
__device__ VoxelBucket operator()(const thrust::pair<int, int>& index_bucket) const {
if (index_bucket.first < 0) {
return thrust::make_pair(Eigen::Vector3i(0, 0, 0), -1);
return {Eigen::Vector3i(0, 0, 0), -1};
}

return thrust::make_pair(thrust::raw_pointer_cast(point_coords_ptr)[index_bucket.first], index_bucket.second);
return {thrust::raw_pointer_cast(point_coords_ptr)[index_bucket.first], index_bucket.second};
}

thrust::device_ptr<const Eigen::Vector3i> point_coords_ptr;
Expand All @@ -87,7 +87,7 @@ struct voxel_coord_select_kernel {
struct accumulate_points_kernel {
accumulate_points_kernel(
const VoxelMapInfo* voxelmap_info_ptr,
const thrust::pair<Eigen::Vector3i, int>* buckets,
const VoxelBucket* buckets,
int* num_points,
Eigen::Vector3f* voxel_means,
Eigen::Matrix3f* voxel_covs)
Expand All @@ -108,7 +108,7 @@ struct accumulate_points_kernel {

for (int i = 0; i < info.max_bucket_scan_count; i++) {
uint64_t bucket_index = (hash + i) % info.num_buckets;
const thrust::pair<Eigen::Vector3i, int>& bucket = thrust::raw_pointer_cast(buckets_ptr)[bucket_index];
const VoxelBucket& bucket = thrust::raw_pointer_cast(buckets_ptr)[bucket_index];

if (bucket.second < 0) {
break;
Expand All @@ -132,7 +132,7 @@ struct accumulate_points_kernel {
}

thrust::device_ptr<const VoxelMapInfo> voxelmap_info_ptr;
thrust::device_ptr<const thrust::pair<Eigen::Vector3i, int>> buckets_ptr;
thrust::device_ptr<const VoxelBucket> buckets_ptr;

thrust::device_ptr<int> num_points_ptr;
thrust::device_ptr<Eigen::Vector3f> voxel_means_ptr;
Expand Down Expand Up @@ -268,12 +268,12 @@ void GaussianVoxelMapGPU::create_bucket_table(cudaStream_t stream, const PointCl
}

check_error << cudaFreeAsync(buckets, stream);
check_error << cudaMallocAsync(&buckets, sizeof(thrust::pair<Eigen::Vector3i, int>) * voxelmap_info.num_buckets, stream);
check_error << cudaMallocAsync(&buckets, sizeof(VoxelBucket) * voxelmap_info.num_buckets, stream);
thrust::transform(
thrust::cuda::par_nosync.on(stream),
thrust::device_ptr<thrust::pair<int, int>>(index_buckets),
thrust::device_ptr<thrust::pair<int, int>>(index_buckets) + voxelmap_info.num_buckets,
thrust::device_ptr<thrust::pair<Eigen::Vector3i, int>>(buckets),
thrust::device_ptr<VoxelBucket>(buckets),
voxel_coord_select_kernel(coords));

check_error << cudaFreeAsync(coords, stream);
Expand Down
2 changes: 1 addition & 1 deletion src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ public:
}

thrust::device_ptr<const VoxelMapInfo> voxelmap_info_ptr;
thrust::device_ptr<const thrust::pair<Eigen::Vector3i, int>> buckets_ptr;
thrust::device_ptr<const VoxelBucket> buckets_ptr;

thrust::device_ptr<const Eigen::Isometry3f> delta_ptr;
};
Expand Down

0 comments on commit 411cb14

Please sign in to comment.