Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removed unnecessary memory allocation and updates for image nodes #87

Merged
merged 1 commit into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 55 additions & 31 deletions src/nestgpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ NESTGPU::NESTGPU()
sim_time_ = 1000.0; //Simulation time in ms
//n_poiss_nodes_ = 0;
n_remote_nodes_.assign(1, 0);
n_ext_nodes_ = 0;
n_image_nodes_ = 0;
SetTimeResolution(0.1); // time resolution in ms
max_spike_num_fact_ = 1.0;
max_spike_per_host_fact_ = 1.0;
Expand Down Expand Up @@ -390,31 +390,45 @@ int NESTGPU::GetMaxSpikeBufferSize()
return max_spike_buffer_size_;
}

uint NESTGPU::GetNNode()
uint NESTGPU::GetNLocalNodes()
{
return node_group_map_.size();
}

int NESTGPU::CreateNodeGroup(int n_node, int n_port)
int NESTGPU::CheckImageNodes(int n_nodes)
{
int i_node_0 = GetNNode();
int max_n_neurons = IntPow(2,h_MaxNodeNBits);
int i_node_0 = GetNLocalNodes();
int max_n_nodes = IntPow(2,h_MaxNodeNBits);

if ((i_node_0 + n_nodes) > max_n_nodes) {
throw ngpu_exception(std::string("Local plus Image nodes exceed maximum"
" number of nodes ")
+ std::to_string(max_n_nodes));
}

return i_node_0;
}

int NESTGPU::CreateNodeGroup(int n_nodes, int n_ports)
{
int i_node_0 = GetNLocalNodes();
int max_n_nodes = IntPow(2,h_MaxNodeNBits);
int max_n_ports = IntPow(2,h_MaxPortNBits);

if ((i_node_0 + n_node) > max_n_neurons) {
throw ngpu_exception(std::string("Maximum number of neurons ")
+ std::to_string(max_n_neurons) + " exceeded");
if ((i_node_0 + n_nodes) > max_n_nodes) {
throw ngpu_exception(std::string("Maximum number of nodes ")
+ std::to_string(max_n_nodes) + " exceeded");
}
if (n_port > max_n_ports) {
if (n_ports > max_n_ports) {
throw ngpu_exception(std::string("Maximum number of ports ")
+ std::to_string(max_n_ports) + " exceeded");
}
int i_group = node_vect_.size() - 1;
node_group_map_.insert(node_group_map_.end(), n_node, i_group);
node_group_map_.insert(node_group_map_.end(), n_nodes, i_group);

node_vect_[i_group]->random_generator_ = random_generator_;
node_vect_[i_group]->Init(i_node_0, n_node, n_port, i_group);
node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_node, n_port);
node_vect_[i_group]->Init(i_node_0, n_nodes, n_ports, i_group);
node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_nodes, n_ports);

return i_node_0;
}
Expand All @@ -439,17 +453,20 @@ int NESTGPU::Calibrate()
gpuErrchk(cudaMemcpyToSymbol(NESTGPUTimeResolution, &time_resolution_,
sizeof(float)));
///////////////////////////////////
i_ext_node_0_ = GetNNode();
// std::cout << "i_ext_node_0_: " << i_ext_node_0_ << " n_ext_nodes_: "
// << n_ext_nodes_ << "\n";
if (n_ext_nodes_ > 0) {
_Create("ext_neuron", n_ext_nodes_, 1);
int n_nodes = GetNLocalNodes();
gpuErrchk(cudaMemcpyToSymbol(n_local_nodes, &n_nodes,
sizeof(int)));

// std::cout << "n_local_nodes: " << n_nodes << " n_image_nodes_: "
// << n_image_nodes_ << "\n";
if (n_image_nodes_ > 0) {
CheckImageNodes(n_image_nodes_);
addOffsetToExternalNodeIds();
}

calibrate_flag_ = true;

organizeConnections(time_resolution_, GetNNode(),
organizeConnections(time_resolution_, GetTotalNNodes(),
NConn, h_ConnBlockSize,
KeySubarray, ConnectionSubarray);

Expand All @@ -459,7 +476,7 @@ int NESTGPU::Calibrate()

int max_delay_num = h_MaxDelayNum;

unsigned int n_spike_buffers = GetNNode();
unsigned int n_spike_buffers = GetTotalNNodes();
NestedLoop::Init(n_spike_buffers);

// temporary
Expand All @@ -472,17 +489,17 @@ int NESTGPU::Calibrate()
NodeGroupArrayInit();

max_spike_num_ = (int)round(max_spike_num_fact_
* GetNNode()
* GetTotalNNodes()
* max_delay_num);
max_spike_num_ = (max_spike_num_>1) ? max_spike_num_ : 1;

max_spike_per_host_ = (int)round(max_spike_per_host_fact_
* GetNNode()
* GetNLocalNodes()
* max_delay_num);
max_spike_per_host_ = (max_spike_per_host_>1) ? max_spike_per_host_ : 1;

SpikeInit(max_spike_num_);
SpikeBufferInit(GetNNode(), max_spike_buffer_size_);
SpikeBufferInit(GetTotalNNodes(), max_spike_buffer_size_);

//#ifndef CHECKRC
if (n_hosts_ > 1) {
Expand Down Expand Up @@ -577,7 +594,7 @@ int NESTGPU::Calibrate()
//#endif

if (rev_conn_flag_) {
RevSpikeInit(GetNNode());
RevSpikeInit(GetNLocalNodes());
}

multimeter_->OpenFiles();
Expand Down Expand Up @@ -706,7 +723,7 @@ int NESTGPU::SimulationStep()
double time_mark;

time_mark = getRealTime();
SpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>();
SpikeBufferUpdate<<<(GetTotalNNodes()+1023)/1024, 1024>>>();
gpuErrchk( cudaPeekAtLastError() );
SpikeBufferUpdate_time_ += (getRealTime() - time_mark);
time_mark = getRealTime();
Expand Down Expand Up @@ -744,23 +761,27 @@ int NESTGPU::SimulationStep()
if (n_ext_spike != 0) {
time_mark = getRealTime();
SendExternalSpike<<<(n_ext_spike+1023)/1024, 1024>>>();
gpuErrchk( cudaPeekAtLastError() );
//gpuErrchk( cudaPeekAtLastError() );
CUDASYNC;
SendExternalSpike_time_ += (getRealTime() - time_mark);
}
//for (int ih=0; ih<connect_mpi_->mpi_np_; ih++) {
//if (ih == connect_mpi_->mpi_id_) {

time_mark = getRealTime();
SendSpikeToRemote(n_hosts_, max_spike_per_host_);
CUDASYNC;
SendSpikeToRemote_time_ += (getRealTime() - time_mark);
time_mark = getRealTime();
RecvSpikeFromRemote(n_hosts_, max_spike_per_host_);
CUDASYNC;
RecvSpikeFromRemote_time_ += (getRealTime() - time_mark);
CopySpikeFromRemote(n_hosts_, max_spike_per_host_);
CUDASYNC;
MPI_Barrier(MPI_COMM_WORLD);

}
CUDASYNC;

int n_spikes;
time_mark = getRealTime();
Expand Down Expand Up @@ -824,7 +845,8 @@ int NESTGPU::SimulationStep()
//time_mark = getRealTime();
RevSpikeReset<<<1, 1>>>();
gpuErrchk( cudaPeekAtLastError() );
RevSpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>(GetNNode());
RevSpikeBufferUpdate<<<(GetNLocalNodes()+1023)/1024, 1024>>>
(GetNLocalNodes());
gpuErrchk( cudaPeekAtLastError() );
unsigned int n_rev_spikes;
gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int),
Expand Down Expand Up @@ -1624,11 +1646,12 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
{
if (n_source<=0) {
i_source = 0;
n_source = GetNNode();
// gets also connections from image neurons
n_source = GetTotalNNodes();
}
if (n_target<=0) {
i_target = 0;
n_target = GetNNode();
n_target = GetNLocalNodes();
}
int *i_source_pt = new int[n_source];
for (int i=0; i<n_source; i++) {
Expand All @@ -1654,7 +1677,7 @@ int64_t *NESTGPU::GetConnections(int *i_source_pt, int n_source,
{
if (n_target<=0) {
i_target = 0;
n_target = GetNNode();
n_target = GetNLocalNodes();
}
int *i_target_pt = new int[n_target];
for (int i=0; i<n_target; i++) {
Expand All @@ -1676,7 +1699,8 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
{
if (n_source<=0) {
i_source = 0;
n_source = GetNNode();
// gets also connections from image neurons
n_source = GetTotalNNodes();
}
int *i_source_pt = new int[n_source];
for (int i=0; i<n_source; i++) {
Expand Down
14 changes: 9 additions & 5 deletions src/nestgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,10 @@ class NESTGPU
long long Nt_; // number of simulation time steps
//int n_poiss_nodes_;
std::vector<int> n_remote_nodes_;
int n_ext_nodes_;
int i_ext_node_0_;
//int n_ext_nodes_;
//int i_ext_node_0_;
//int i_remote_node_0_;
int n_image_nodes_;

double start_real_time_;
double build_real_time_;
Expand All @@ -177,13 +178,14 @@ class NESTGPU
int InitConnRandomGenerator();
int FreeConnRandomGenerator();

int CreateNodeGroup(int n_neuron, int n_port);
int CreateNodeGroup(int n_nodes, int n_ports);
int CheckUncalibrated(std::string message);
double *InitGetSpikeArray(int n_node, int n_port);
double *InitGetSpikeArray(int n_nodes, int n_ports);
int NodeGroupArrayInit();
int ClearGetSpikeArrays();
int FreeGetSpikeArrays();
int FreeNodeGroupMap();
int CheckImageNodes(int n_nodes);

NodeSeq _Create(std::string model_name, int n_nodes, int n_ports);

Expand Down Expand Up @@ -300,7 +302,9 @@ class NESTGPU
int SetMaxSpikeBufferSize(int max_size);
int GetMaxSpikeBufferSize();

uint GetNNode();
uint GetNLocalNodes();

uint GetTotalNNodes() { return GetNLocalNodes() + n_image_nodes_; }

int HostNum() {
return n_hosts_;
Expand Down
21 changes: 13 additions & 8 deletions src/remote_connect.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ int **d_node_map_index; // [i_node]
// - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
bool *use_all_source_nodes; // [n_connection_rules]:

__constant__ int n_local_nodes; // number of local nodes

// Allocate GPU memory for new remote-source-node-map blocks
int allocRemoteSourceNodeMapBlocks(std::vector<int*> &i_remote_src_node_map,
std::vector<int*> &i_local_spike_buf_map,
Expand Down Expand Up @@ -379,7 +381,7 @@ int NESTGPU::RemoteConnectionMapCalibrate(int i_host, int n_hosts)
////////////////////////////////////////
#endif

int n_nodes = GetNNode(); // number of nodes
int n_nodes = GetNLocalNodes(); // number of nodes
// n_target_hosts[i_node] is the number of remote target hosts
// on which each local node
// has outgoing connections
Expand Down Expand Up @@ -845,30 +847,31 @@ __global__ void MapIndexToSpikeBufferKernel(int n_hosts, int *host_offset,

__global__ void addOffsetToExternalNodeIdsKernel
(int64_t n_conn, uint *key_subarray, connection_struct *conn_subarray,
int i_ext_node_0)
int i_image_node_0)
{
int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
if (i_conn>=n_conn) return;
uint target_port = conn_subarray[i_conn].target_port;
if (target_port & (1 << (MaxPortNBits - 1))) {
target_port = target_port ^ (1 << (MaxPortNBits - 1));
conn_subarray[i_conn].target_port = target_port;
key_subarray[i_conn] += (i_ext_node_0 << MaxPortNBits);
key_subarray[i_conn] += (i_image_node_0 << MaxPortNBits);
}
}

int NESTGPU::addOffsetToExternalNodeIds()
{
int64_t block_size = h_ConnBlockSize;
int n_blocks = (NConn - 1) / block_size + 1;

int i_image_node_0 = GetNLocalNodes();

for (int ib=0; ib<n_blocks; ib++) {
uint64_t n_block_conn = block_size; // number of connections in the block
if (ib == n_blocks-1) { // last block
n_block_conn = (NConn - 1) % block_size + 1;
}
addOffsetToExternalNodeIdsKernel<<<(n_block_conn+1023)/1024, 1024>>>
(n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_ext_node_0_);
(n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_image_node_0);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
Expand Down Expand Up @@ -897,24 +900,26 @@ int NESTGPU::addOffsetToExternalNodeIds()
}

__global__ void addOffsetToSpikeBufferMapKernel(int i_host, int n_node_map,
int i_ext_node_0)
int i_image_node_0)
{
int i_node_map = threadIdx.x + blockIdx.x * blockDim.x;
if (i_node_map>=n_node_map) return;

const int i_block = i_node_map / node_map_block_size;
const int i = i_node_map % node_map_block_size;
local_spike_buffer_map[i_host][i_block][i] += i_ext_node_0;
local_spike_buffer_map[i_host][i_block][i] += i_image_node_0;
}

int NESTGPU::addOffsetToSpikeBufferMap()
{
int i_image_node_0 = GetNLocalNodes();

for (int i_host=0; i_host<n_hosts_; i_host++) {
if (i_host != this_host_) {
int n_node_map = h_n_remote_source_node_map[i_host];
if (n_node_map > 0) {
addOffsetToSpikeBufferMapKernel<<<(n_node_map+1023)/1024, 1024>>>
(i_host, n_node_map, i_ext_node_0_);
(i_host, n_node_map, i_image_node_0);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
Expand Down
12 changes: 7 additions & 5 deletions src/remote_connect.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ extern int **d_node_target_host_i_map; // [i_node]
// - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
extern bool *use_all_source_nodes; // [n_connection_rules]:

extern __constant__ int n_local_nodes; // number of local nodes

// device function that checks if an int value is in a sorted 2d-array
// assuming that the entries in the 2d-array are sorted.
// The 2d-array is divided in noncontiguous blocks of size block_size
Expand Down Expand Up @@ -284,7 +286,7 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
// n_nodes will be the first index for new mapping of remote source nodes
// to local spike buffers
//int spike_buffer_map_i0 = GetNNode();
int spike_buffer_map_i0 = n_ext_nodes_;
int spike_buffer_map_i0 = n_image_nodes_;
syn_spec.port_ = syn_spec.port_ | (1 << (h_MaxPortNBits-1));

// check if the flag UseAllSourceNodes[conn_rule] is false
Expand Down Expand Up @@ -774,12 +776,12 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
fixConnectionSourceNodeIndexes(KeySubarray, old_n_conn, NConn,
h_ConnBlockSize, d_local_node_index);

// On target host. Create n_nodes_to_map nodes of type ext_neuron
// On target host. Create n_nodes_to_map nodes of type image_node
//std::cout << "h_n_node_to_map " << h_n_node_to_map <<"\n";
if (h_n_node_to_map > 0) {
//_Create("ext_neuron", h_n_node_to_map);
n_ext_nodes_ += h_n_node_to_map;
//std::cout << "n_ext_nodes_ " << n_ext_nodes_ <<"\n";
//_Create("image_node", h_n_node_to_map);
n_image_nodes_ += h_n_node_to_map;
//std::cout << "n_image_nodes_ " << n_image_nodes_ <<"\n";
}

return 0;
Expand Down
Loading