From e2b88a81630249374a060406449adb6e62bfaa10 Mon Sep 17 00:00:00 2001 From: Bruno Golosio Date: Thu, 19 Oct 2023 00:42:13 +0200 Subject: [PATCH] Removed unnecessary memory allocation and updates for image nodes --- src/nestgpu.cu | 86 +++++++++++++++--------- src/nestgpu.h | 14 ++-- src/remote_connect.cu | 21 +++--- src/remote_connect.h | 12 ++-- src/spike_buffer.cu | 147 +++++++++++++++++++++--------------------- src/spike_mpi.cu | 6 +- 6 files changed, 162 insertions(+), 124 deletions(-) diff --git a/src/nestgpu.cu b/src/nestgpu.cu index 0edb43cf1..8cb77a54a 100644 --- a/src/nestgpu.cu +++ b/src/nestgpu.cu @@ -185,7 +185,7 @@ NESTGPU::NESTGPU() sim_time_ = 1000.0; //Simulation time in ms //n_poiss_nodes_ = 0; n_remote_nodes_.assign(1, 0); - n_ext_nodes_ = 0; + n_image_nodes_ = 0; SetTimeResolution(0.1); // time resolution in ms max_spike_num_fact_ = 1.0; max_spike_per_host_fact_ = 1.0; @@ -390,31 +390,45 @@ int NESTGPU::GetMaxSpikeBufferSize() return max_spike_buffer_size_; } -uint NESTGPU::GetNNode() +uint NESTGPU::GetNLocalNodes() { return node_group_map_.size(); } -int NESTGPU::CreateNodeGroup(int n_node, int n_port) +int NESTGPU::CheckImageNodes(int n_nodes) { - int i_node_0 = GetNNode(); - int max_n_neurons = IntPow(2,h_MaxNodeNBits); + int i_node_0 = GetNLocalNodes(); + int max_n_nodes = IntPow(2,h_MaxNodeNBits); + + if ((i_node_0 + n_nodes) > max_n_nodes) { + throw ngpu_exception(std::string("Local plus Image nodes exceed maximum" + " number of nodes ") + + std::to_string(max_n_nodes)); + } + + return i_node_0; +} + +int NESTGPU::CreateNodeGroup(int n_nodes, int n_ports) +{ + int i_node_0 = GetNLocalNodes(); + int max_n_nodes = IntPow(2,h_MaxNodeNBits); int max_n_ports = IntPow(2,h_MaxPortNBits); - if ((i_node_0 + n_node) > max_n_neurons) { - throw ngpu_exception(std::string("Maximum number of neurons ") - + std::to_string(max_n_neurons) + " exceeded"); + if ((i_node_0 + n_nodes) > max_n_nodes) { + throw ngpu_exception(std::string("Maximum number of nodes ") + + std::to_string(max_n_nodes) + " exceeded"); } - if (n_port > max_n_ports) { + if (n_ports > max_n_ports) { throw ngpu_exception(std::string("Maximum number of ports ") + std::to_string(max_n_ports) + " exceeded"); } int i_group = node_vect_.size() - 1; - node_group_map_.insert(node_group_map_.end(), n_node, i_group); + node_group_map_.insert(node_group_map_.end(), n_nodes, i_group); node_vect_[i_group]->random_generator_ = random_generator_; - node_vect_[i_group]->Init(i_node_0, n_node, n_port, i_group); - node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_node, n_port); + node_vect_[i_group]->Init(i_node_0, n_nodes, n_ports, i_group); + node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_nodes, n_ports); return i_node_0; } @@ -439,17 +453,20 @@ int NESTGPU::Calibrate() gpuErrchk(cudaMemcpyToSymbol(NESTGPUTimeResolution, &time_resolution_, sizeof(float))); /////////////////////////////////// - i_ext_node_0_ = GetNNode(); - // std::cout << "i_ext_node_0_: " << i_ext_node_0_ << " n_ext_nodes_: " - // << n_ext_nodes_ << "\n"; - if (n_ext_nodes_ > 0) { - _Create("ext_neuron", n_ext_nodes_, 1); + int n_nodes = GetNLocalNodes(); + gpuErrchk(cudaMemcpyToSymbol(n_local_nodes, &n_nodes, + sizeof(int))); + + // std::cout << "n_local_nodes: " << n_nodes << " n_image_nodes_: " + // << n_image_nodes_ << "\n"; + if (n_image_nodes_ > 0) { + CheckImageNodes(n_image_nodes_); addOffsetToExternalNodeIds(); } calibrate_flag_ = true; - organizeConnections(time_resolution_, GetNNode(), + organizeConnections(time_resolution_, GetTotalNNodes(), NConn, h_ConnBlockSize, KeySubarray, ConnectionSubarray); @@ -459,7 +476,7 @@ int NESTGPU::Calibrate() int max_delay_num = h_MaxDelayNum; - unsigned int n_spike_buffers = GetNNode(); + unsigned int n_spike_buffers = GetTotalNNodes(); NestedLoop::Init(n_spike_buffers); // temporary @@ -472,17 +489,17 @@ int NESTGPU::Calibrate() NodeGroupArrayInit(); max_spike_num_ = (int)round(max_spike_num_fact_ - * GetNNode() + * GetTotalNNodes() * max_delay_num); max_spike_num_ = (max_spike_num_>1) ? max_spike_num_ : 1; max_spike_per_host_ = (int)round(max_spike_per_host_fact_ - * GetNNode() + * GetNLocalNodes() * max_delay_num); max_spike_per_host_ = (max_spike_per_host_>1) ? max_spike_per_host_ : 1; SpikeInit(max_spike_num_); - SpikeBufferInit(GetNNode(), max_spike_buffer_size_); + SpikeBufferInit(GetTotalNNodes(), max_spike_buffer_size_); //#ifndef CHECKRC if (n_hosts_ > 1) { @@ -577,7 +594,7 @@ int NESTGPU::Calibrate() //#endif if (rev_conn_flag_) { - RevSpikeInit(GetNNode()); + RevSpikeInit(GetNLocalNodes()); } multimeter_->OpenFiles(); @@ -706,7 +723,7 @@ int NESTGPU::SimulationStep() double time_mark; time_mark = getRealTime(); - SpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>(); + SpikeBufferUpdate<<<(GetTotalNNodes()+1023)/1024, 1024>>>(); gpuErrchk( cudaPeekAtLastError() ); SpikeBufferUpdate_time_ += (getRealTime() - time_mark); time_mark = getRealTime(); @@ -744,7 +761,8 @@ int NESTGPU::SimulationStep() if (n_ext_spike != 0) { time_mark = getRealTime(); SendExternalSpike<<<(n_ext_spike+1023)/1024, 1024>>>(); - gpuErrchk( cudaPeekAtLastError() ); + //gpuErrchk( cudaPeekAtLastError() ); + CUDASYNC; SendExternalSpike_time_ += (getRealTime() - time_mark); } //for (int ih=0; ihmpi_np_; ih++) { @@ -752,15 +770,18 @@ int NESTGPU::SimulationStep() time_mark = getRealTime(); SendSpikeToRemote(n_hosts_, max_spike_per_host_); + CUDASYNC; SendSpikeToRemote_time_ += (getRealTime() - time_mark); time_mark = getRealTime(); RecvSpikeFromRemote(n_hosts_, max_spike_per_host_); - + CUDASYNC; RecvSpikeFromRemote_time_ += (getRealTime() - time_mark); CopySpikeFromRemote(n_hosts_, max_spike_per_host_); + CUDASYNC; MPI_Barrier(MPI_COMM_WORLD); } + CUDASYNC; int n_spikes; time_mark = getRealTime(); @@ -824,7 +845,8 @@ int NESTGPU::SimulationStep() //time_mark = getRealTime(); RevSpikeReset<<<1, 1>>>(); gpuErrchk( cudaPeekAtLastError() ); - RevSpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>(GetNNode()); + RevSpikeBufferUpdate<<<(GetNLocalNodes()+1023)/1024, 1024>>> + (GetNLocalNodes()); gpuErrchk( cudaPeekAtLastError() ); unsigned int n_rev_spikes; gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int), @@ -1624,11 +1646,12 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source, { if (n_source<=0) { i_source = 0; - n_source = GetNNode(); + // gets also connections from image neurons + n_source = GetTotalNNodes(); } if (n_target<=0) { i_target = 0; - n_target = GetNNode(); + n_target = GetNLocalNodes(); } int *i_source_pt = new int[n_source]; for (int i=0; i n_remote_nodes_; - int n_ext_nodes_; - int i_ext_node_0_; + //int n_ext_nodes_; + //int i_ext_node_0_; //int i_remote_node_0_; + int n_image_nodes_; double start_real_time_; double build_real_time_; @@ -177,13 +178,14 @@ class NESTGPU int InitConnRandomGenerator(); int FreeConnRandomGenerator(); - int CreateNodeGroup(int n_neuron, int n_port); + int CreateNodeGroup(int n_nodes, int n_ports); int CheckUncalibrated(std::string message); - double *InitGetSpikeArray(int n_node, int n_port); + double *InitGetSpikeArray(int n_nodes, int n_ports); int NodeGroupArrayInit(); int ClearGetSpikeArrays(); int FreeGetSpikeArrays(); int FreeNodeGroupMap(); + int CheckImageNodes(int n_nodes); NodeSeq _Create(std::string model_name, int n_nodes, int n_ports); @@ -300,7 +302,9 @@ class NESTGPU int SetMaxSpikeBufferSize(int max_size); int GetMaxSpikeBufferSize(); - uint GetNNode(); + uint GetNLocalNodes(); + + uint GetTotalNNodes() { return GetNLocalNodes() + n_image_nodes_; } int HostNum() { return n_hosts_; diff --git a/src/remote_connect.cu b/src/remote_connect.cu index 15c1f6338..564f010c5 100644 --- a/src/remote_connect.cu +++ b/src/remote_connect.cu @@ -92,6 +92,8 @@ int **d_node_map_index; // [i_node] // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli) bool *use_all_source_nodes; // [n_connection_rules]: +__constant__ int n_local_nodes; // number of local nodes + // Allocate GPU memory for new remote-source-node-map blocks int allocRemoteSourceNodeMapBlocks(std::vector &i_remote_src_node_map, std::vector &i_local_spike_buf_map, @@ -379,7 +381,7 @@ int NESTGPU::RemoteConnectionMapCalibrate(int i_host, int n_hosts) //////////////////////////////////////// #endif - int n_nodes = GetNNode(); // number of nodes + int n_nodes = GetNLocalNodes(); // number of nodes // n_target_hosts[i_node] is the number of remote target hosts // on which each local node // has outgoing connections @@ -845,7 +847,7 @@ __global__ void MapIndexToSpikeBufferKernel(int n_hosts, int *host_offset, __global__ void addOffsetToExternalNodeIdsKernel (int64_t n_conn, uint *key_subarray, connection_struct *conn_subarray, - int i_ext_node_0) + int i_image_node_0) { int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; if (i_conn>=n_conn) return; @@ -853,7 +855,7 @@ __global__ void addOffsetToExternalNodeIdsKernel if (target_port & (1 << (MaxPortNBits - 1))) { target_port = target_port ^ (1 << (MaxPortNBits - 1)); conn_subarray[i_conn].target_port = target_port; - key_subarray[i_conn] += (i_ext_node_0 << MaxPortNBits); + key_subarray[i_conn] += (i_image_node_0 << MaxPortNBits); } } @@ -861,14 +863,15 @@ int NESTGPU::addOffsetToExternalNodeIds() { int64_t block_size = h_ConnBlockSize; int n_blocks = (NConn - 1) / block_size + 1; - + int i_image_node_0 = GetNLocalNodes(); + for (int ib=0; ib>> - (n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_ext_node_0_); + (n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_image_node_0); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } @@ -897,24 +900,26 @@ int NESTGPU::addOffsetToExternalNodeIds() } __global__ void addOffsetToSpikeBufferMapKernel(int i_host, int n_node_map, - int i_ext_node_0) + int i_image_node_0) { int i_node_map = threadIdx.x + blockIdx.x * blockDim.x; if (i_node_map>=n_node_map) return; const int i_block = i_node_map / node_map_block_size; const int i = i_node_map % node_map_block_size; - local_spike_buffer_map[i_host][i_block][i] += i_ext_node_0; + local_spike_buffer_map[i_host][i_block][i] += i_image_node_0; } int NESTGPU::addOffsetToSpikeBufferMap() { + int i_image_node_0 = GetNLocalNodes(); + for (int i_host=0; i_host 0) { addOffsetToSpikeBufferMapKernel<<<(n_node_map+1023)/1024, 1024>>> - (i_host, n_node_map, i_ext_node_0_); + (i_host, n_node_map, i_image_node_0); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaDeviceSynchronize() ); } diff --git a/src/remote_connect.h b/src/remote_connect.h index 10f45d62f..89e83ba72 100644 --- a/src/remote_connect.h +++ b/src/remote_connect.h @@ -54,6 +54,8 @@ extern int **d_node_target_host_i_map; // [i_node] // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli) extern bool *use_all_source_nodes; // [n_connection_rules]: +extern __constant__ int n_local_nodes; // number of local nodes + // device function that checks if an int value is in a sorted 2d-array // assuming that the entries in the 2d-array are sorted. // The 2d-array is divided in noncontiguous blocks of size block_size @@ -284,7 +286,7 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source, // n_nodes will be the first index for new mapping of remote source nodes // to local spike buffers //int spike_buffer_map_i0 = GetNNode(); - int spike_buffer_map_i0 = n_ext_nodes_; + int spike_buffer_map_i0 = n_image_nodes_; syn_spec.port_ = syn_spec.port_ | (1 << (h_MaxPortNBits-1)); // check if the flag UseAllSourceNodes[conn_rule] is false @@ -774,12 +776,12 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source, fixConnectionSourceNodeIndexes(KeySubarray, old_n_conn, NConn, h_ConnBlockSize, d_local_node_index); - // On target host. Create n_nodes_to_map nodes of type ext_neuron + // On target host. Create n_nodes_to_map nodes of type image_node //std::cout << "h_n_node_to_map " << h_n_node_to_map <<"\n"; if (h_n_node_to_map > 0) { - //_Create("ext_neuron", h_n_node_to_map); - n_ext_nodes_ += h_n_node_to_map; - //std::cout << "n_ext_nodes_ " << n_ext_nodes_ <<"\n"; + //_Create("image_node", h_n_node_to_map); + n_image_nodes_ += h_n_node_to_map; + //std::cout << "n_image_nodes_ " << n_image_nodes_ <<"\n"; } return 0; diff --git a/src/spike_buffer.cu b/src/spike_buffer.cu index f4ecb4f6b..243d06b09 100644 --- a/src/spike_buffer.cu +++ b/src/spike_buffer.cu @@ -63,6 +63,8 @@ __device__ long long *LastRevSpikeTimeIdx; // unsigned short *d_ConnectionSpikeTime; // [NConnection]; __device__ unsigned short *ConnectionSpikeTime; // +extern __constant__ int n_local_nodes; + ////////////////////////////////////////////////////////////////////// int *d_SpikeBufferSize; // [NSpikeBuffer]; @@ -101,60 +103,59 @@ __device__ float *SpikeBufferHeight; // [NSpikeBuffer*MaxSpikeBufferNum]; //////////////////////////////////////////////////////////// __device__ void PushSpike(int i_spike_buffer, float height) { - LastSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx; - LastSpikeHeight[i_spike_buffer] = height; - int i_group = NodeGroupMap[i_spike_buffer]; - int den_delay_idx; - float *den_delay_arr = NodeGroupArray[i_group].den_delay_arr_; - // check if node has dendritic delay - if (den_delay_arr != NULL) { - int i_neuron = i_spike_buffer - NodeGroupArray[i_group].i_node_0_; - int n_param = NodeGroupArray[i_group].n_param_; - // dendritic delay index is stored in the parameter array - // den_delay_arr points to the dendritic delay if the first - // node of the group. The other are separate by steps = n_param - den_delay_idx = (int)round(den_delay_arr[i_neuron*n_param] - /NESTGPUTimeResolution); - //printf("isb %d\tden_delay_idx: %d\n", i_spike_buffer, den_delay_idx); - } - else { - den_delay_idx = 0; - } - // printf("Node %d spikes at time %lld , den_delay_idx: %d\n", - // i_spike_buffer, NESTGPUTimeIdx, den_delay_idx); - if (den_delay_idx==0) { - // last time when spike is sent back to dendrites (e.g. for STDP) - LastRevSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx; - } - - if (ExternalSpikeFlag) { - // if active spike should eventually be sent to remote connections - //printf("PushExternalSpike i_spike_buffer: %d height: %f\n", - // i_spike_buffer, height); - PushExternalSpike(i_spike_buffer, height); - } - - // if recording spike counts is activated, increase counter - if (NodeGroupArray[i_group].spike_count_ != NULL) { - int i_node_0 = NodeGroupArray[i_group].i_node_0_; - NodeGroupArray[i_group].spike_count_[i_spike_buffer-i_node_0]++; - } - - // check if recording spike times is activated - int max_n_rec_spike_times = NodeGroupArray[i_group].max_n_rec_spike_times_; - if (max_n_rec_spike_times != 0) { - int i_node_rel = i_spike_buffer - NodeGroupArray[i_group].i_node_0_; - int n_rec_spike_times = - NodeGroupArray[i_group].n_rec_spike_times_[i_node_rel]; - if (n_rec_spike_times>=max_n_rec_spike_times-1) { - printf("Maximum number of recorded spike times exceeded" - " for spike buffer %d\n", i_spike_buffer); + int den_delay_idx = 0; + if (i_spike_buffer=max_n_rec_spike_times-1) { + printf("Maximum number of recorded spike times exceeded" + " for spike buffer %d\n", i_spike_buffer); + } + else { // record spike time + NodeGroupArray[i_group].rec_spike_times_ + [i_node_rel*max_n_rec_spike_times + n_rec_spike_times] + = NESTGPUTime; + NodeGroupArray[i_group].n_rec_spike_times_[i_node_rel]++; + } } } @@ -192,23 +193,22 @@ __global__ void SpikeBufferUpdate() { int i_spike_buffer = threadIdx.x + blockIdx.x * blockDim.x; if (i_spike_buffer>=NSpikeBuffer) return; - - int i_group=NodeGroupMap[i_spike_buffer]; - int den_delay_idx; - float *den_delay_arr = NodeGroupArray[i_group].den_delay_arr_; - // check if node has dendritic delay - if (den_delay_arr != NULL) { - int i_neuron = i_spike_buffer - NodeGroupArray[i_group].i_node_0_; - int n_param = NodeGroupArray[i_group].n_param_; - // dendritic delay index is stored in the parameter array - // den_delay_arr points to the dendritic delay if the first - // node of the group. The other are separate by steps = n_param - den_delay_idx = (int)round(den_delay_arr[i_neuron*n_param] - /NESTGPUTimeResolution); - //printf("isb update %d\tden_delay_idx: %d\n", i_spike_buffer, den_delay_idx); - } - else { - den_delay_idx = 0; + + int den_delay_idx = 0; + if (i_spike_buffer>>(n_hosts, d_ExternalSourceSpikeCumul, d_ExternalSourceSpikeNodeId); + CUDASYNC; // convert node group indexes to spike buffer indexes // by adding the index of the first node of the node group //AddOffset<<<(n_spike_tot+1023)/1024, 1024>>> @@ -444,7 +447,8 @@ int NESTGPU::CopySpikeFromRemote(int n_hosts, int max_spike_per_host) // push remote spikes in local spike buffers PushSpikeFromRemote<<<(n_spike_tot+1023)/1024, 1024>>> (n_spike_tot, d_ExternalSourceSpikeNodeId); - gpuErrchk( cudaPeekAtLastError() ); + CUDASYNC; + //gpuErrchk( cudaPeekAtLastError() ); //cudaDeviceSynchronize(); }