nest · golosio · Oct 19, 2023 · Oct 18, 2023
diff --git a/src/nestgpu.cu b/src/nestgpu.cu
@@ -185,7 +185,7 @@ NESTGPU::NESTGPU()
   sim_time_ = 1000.0;        //Simulation time in ms
   //n_poiss_nodes_ = 0;
   n_remote_nodes_.assign(1, 0);
-  n_ext_nodes_ = 0;
+  n_image_nodes_ = 0;
   SetTimeResolution(0.1);  // time resolution in ms
   max_spike_num_fact_ = 1.0;
   max_spike_per_host_fact_ = 1.0;
@@ -390,31 +390,45 @@ int NESTGPU::GetMaxSpikeBufferSize()
   return max_spike_buffer_size_;
 }
 
-uint NESTGPU::GetNNode()
+uint NESTGPU::GetNLocalNodes()
 {
   return node_group_map_.size();
 }
 
-int NESTGPU::CreateNodeGroup(int n_node, int n_port)
+int NESTGPU::CheckImageNodes(int n_nodes)
 {
-  int i_node_0 = GetNNode();
-  int max_n_neurons = IntPow(2,h_MaxNodeNBits);
+  int i_node_0 = GetNLocalNodes();
+  int max_n_nodes = IntPow(2,h_MaxNodeNBits);
+
+  if ((i_node_0 + n_nodes) > max_n_nodes) {
+    throw ngpu_exception(std::string("Local plus Image nodes exceed maximum"
+				     " number of nodes ")
+			 + std::to_string(max_n_nodes));
+  }
+
+  return i_node_0;
+}
+
+int NESTGPU::CreateNodeGroup(int n_nodes, int n_ports)
+{
+  int i_node_0 = GetNLocalNodes();
+  int max_n_nodes = IntPow(2,h_MaxNodeNBits);
   int max_n_ports = IntPow(2,h_MaxPortNBits);
 
-  if ((i_node_0 + n_node) > max_n_neurons) {
-    throw ngpu_exception(std::string("Maximum number of neurons ")
-			 + std::to_string(max_n_neurons) + " exceeded");
+  if ((i_node_0 + n_nodes) > max_n_nodes) {
+    throw ngpu_exception(std::string("Maximum number of nodes ")
+			 + std::to_string(max_n_nodes) + " exceeded");
   }
-  if (n_port > max_n_ports) {
+  if (n_ports > max_n_ports) {
     throw ngpu_exception(std::string("Maximum number of ports ")
 			 + std::to_string(max_n_ports) + " exceeded");
   }
   int i_group = node_vect_.size() - 1;
-  node_group_map_.insert(node_group_map_.end(), n_node, i_group);
+  node_group_map_.insert(node_group_map_.end(), n_nodes, i_group);
 
   node_vect_[i_group]->random_generator_ = random_generator_;
-  node_vect_[i_group]->Init(i_node_0, n_node, n_port, i_group);
-  node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_node, n_port);
+  node_vect_[i_group]->Init(i_node_0, n_nodes, n_ports, i_group);
+  node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_nodes, n_ports);
 
   return i_node_0;
 }
@@ -439,17 +453,20 @@ int NESTGPU::Calibrate()
   gpuErrchk(cudaMemcpyToSymbol(NESTGPUTimeResolution, &time_resolution_,
 			       sizeof(float)));
 ///////////////////////////////////
-  i_ext_node_0_ = GetNNode();
-  // std::cout << "i_ext_node_0_: " << i_ext_node_0_ << " n_ext_nodes_: "
-  // << n_ext_nodes_ << "\n";
-  if (n_ext_nodes_ > 0) {
-    _Create("ext_neuron", n_ext_nodes_, 1);
+  int n_nodes = GetNLocalNodes();
+  gpuErrchk(cudaMemcpyToSymbol(n_local_nodes, &n_nodes,
+			       sizeof(int)));
+
+  // std::cout << "n_local_nodes: " << n_nodes << " n_image_nodes_: "
+  // << n_image_nodes_ << "\n";
+  if (n_image_nodes_ > 0) {
+    CheckImageNodes(n_image_nodes_);
     addOffsetToExternalNodeIds();
   }
 
   calibrate_flag_ = true;
 
-  organizeConnections(time_resolution_, GetNNode(),
+  organizeConnections(time_resolution_, GetTotalNNodes(),
 		      NConn, h_ConnBlockSize,
 		      KeySubarray, ConnectionSubarray);
 
@@ -459,7 +476,7 @@ int NESTGPU::Calibrate()
 
   int max_delay_num = h_MaxDelayNum;
 
-  unsigned int n_spike_buffers = GetNNode();
+  unsigned int n_spike_buffers = GetTotalNNodes();
   NestedLoop::Init(n_spike_buffers);
 
   // temporary
@@ -472,17 +489,17 @@ int NESTGPU::Calibrate()
   NodeGroupArrayInit();
 
   max_spike_num_ = (int)round(max_spike_num_fact_
-			      * GetNNode()
+			      * GetTotalNNodes()
 			      * max_delay_num);
   max_spike_num_ = (max_spike_num_>1) ? max_spike_num_ : 1;
 
   max_spike_per_host_ = (int)round(max_spike_per_host_fact_
-				   * GetNNode()
+				   * GetNLocalNodes()
 				   * max_delay_num);
   max_spike_per_host_ = (max_spike_per_host_>1) ? max_spike_per_host_ : 1;
 
   SpikeInit(max_spike_num_);
-  SpikeBufferInit(GetNNode(), max_spike_buffer_size_);
+  SpikeBufferInit(GetTotalNNodes(), max_spike_buffer_size_);
 
   //#ifndef CHECKRC
   if (n_hosts_ > 1) {
@@ -577,7 +594,7 @@ int NESTGPU::Calibrate()
   //#endif
 
   if (rev_conn_flag_) {
-    RevSpikeInit(GetNNode());
+    RevSpikeInit(GetNLocalNodes());
   }
 
   multimeter_->OpenFiles();
@@ -706,7 +723,7 @@ int NESTGPU::SimulationStep()
   double time_mark;
 
   time_mark = getRealTime();
-  SpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>();
+  SpikeBufferUpdate<<<(GetTotalNNodes()+1023)/1024, 1024>>>();
   gpuErrchk( cudaPeekAtLastError() );
   SpikeBufferUpdate_time_ += (getRealTime() - time_mark);
   time_mark = getRealTime();
@@ -744,23 +761,27 @@ int NESTGPU::SimulationStep()
     if (n_ext_spike != 0) {
       time_mark = getRealTime();
       SendExternalSpike<<<(n_ext_spike+1023)/1024, 1024>>>();
-      gpuErrchk( cudaPeekAtLastError() );
+      //gpuErrchk( cudaPeekAtLastError() );
+      CUDASYNC;
       SendExternalSpike_time_ += (getRealTime() - time_mark);
     }
     //for (int ih=0; ih<connect_mpi_->mpi_np_; ih++) {
     //if (ih == connect_mpi_->mpi_id_) {
 
     time_mark = getRealTime();
     SendSpikeToRemote(n_hosts_, max_spike_per_host_);
+    CUDASYNC;
     SendSpikeToRemote_time_ += (getRealTime() - time_mark);
     time_mark = getRealTime();
     RecvSpikeFromRemote(n_hosts_, max_spike_per_host_);
-				      
+    CUDASYNC;			      
     RecvSpikeFromRemote_time_ += (getRealTime() - time_mark);
     CopySpikeFromRemote(n_hosts_, max_spike_per_host_);
+    CUDASYNC;
     MPI_Barrier(MPI_COMM_WORLD);
 
   }
+  CUDASYNC;
 
   int n_spikes;
   time_mark = getRealTime();
@@ -824,7 +845,8 @@ int NESTGPU::SimulationStep()
     //time_mark = getRealTime();
     RevSpikeReset<<<1, 1>>>();
     gpuErrchk( cudaPeekAtLastError() );
-    RevSpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>(GetNNode());
+    RevSpikeBufferUpdate<<<(GetNLocalNodes()+1023)/1024, 1024>>>
+      (GetNLocalNodes());
     gpuErrchk( cudaPeekAtLastError() );
     unsigned int n_rev_spikes;
     gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int),
@@ -1624,11 +1646,12 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
 {
   if (n_source<=0) {
     i_source = 0;
-    n_source = GetNNode();
+    // gets also connections from image neurons
+    n_source = GetTotalNNodes();
   }
   if (n_target<=0) {
     i_target = 0;
-    n_target = GetNNode();
+    n_target = GetNLocalNodes();
   }
   int *i_source_pt = new int[n_source];
   for (int i=0; i<n_source; i++) {
@@ -1654,7 +1677,7 @@ int64_t *NESTGPU::GetConnections(int *i_source_pt, int n_source,
 {
   if (n_target<=0) {
     i_target = 0;
-    n_target = GetNNode();
+    n_target = GetNLocalNodes();
   }
   int *i_target_pt = new int[n_target];
   for (int i=0; i<n_target; i++) {
@@ -1676,7 +1699,8 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
 {
   if (n_source<=0) {
     i_source = 0;
-    n_source = GetNNode();
+    //  gets also connections from image neurons
+    n_source = GetTotalNNodes();
   }
   int *i_source_pt = new int[n_source];
   for (int i=0; i<n_source; i++) {

diff --git a/src/nestgpu.h b/src/nestgpu.h
@@ -148,9 +148,10 @@ class NESTGPU
   long long Nt_; // number of simulation time steps
   //int n_poiss_nodes_;
   std::vector<int> n_remote_nodes_;
-  int n_ext_nodes_;
-  int i_ext_node_0_;
+  //int n_ext_nodes_;
+  //int i_ext_node_0_;
   //int i_remote_node_0_;
+  int n_image_nodes_;
 
   double start_real_time_;
   double build_real_time_;
@@ -177,13 +178,14 @@ class NESTGPU
   int InitConnRandomGenerator();
   int FreeConnRandomGenerator();
 
-  int CreateNodeGroup(int n_neuron, int n_port);
+  int CreateNodeGroup(int n_nodes, int n_ports);
   int CheckUncalibrated(std::string message);
-  double *InitGetSpikeArray(int n_node, int n_port);
+  double *InitGetSpikeArray(int n_nodes, int n_ports);
   int NodeGroupArrayInit();
   int ClearGetSpikeArrays();
   int FreeGetSpikeArrays();
   int FreeNodeGroupMap();
+  int CheckImageNodes(int n_nodes);
 
   NodeSeq _Create(std::string model_name, int n_nodes, int n_ports);
 
@@ -300,7 +302,9 @@ class NESTGPU
   int SetMaxSpikeBufferSize(int max_size);
   int GetMaxSpikeBufferSize();
 
-  uint GetNNode();
+  uint GetNLocalNodes();
+
+  uint GetTotalNNodes() { return GetNLocalNodes() + n_image_nodes_; }
 
   int HostNum() {
     return n_hosts_;

diff --git a/src/remote_connect.cu b/src/remote_connect.cu
@@ -92,6 +92,8 @@ int **d_node_map_index; // [i_node]
 // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
 bool *use_all_source_nodes; // [n_connection_rules]:
 
+__constant__ int n_local_nodes; // number of local nodes
+
 // Allocate GPU memory for new remote-source-node-map blocks
 int allocRemoteSourceNodeMapBlocks(std::vector<int*> &i_remote_src_node_map,
 				   std::vector<int*> &i_local_spike_buf_map,
@@ -379,7 +381,7 @@ int  NESTGPU::RemoteConnectionMapCalibrate(int i_host, int n_hosts)
   ////////////////////////////////////////
 #endif
 
-  int n_nodes = GetNNode(); // number of nodes
+  int n_nodes = GetNLocalNodes(); // number of nodes
   // n_target_hosts[i_node] is the number of remote target hosts
   // on which each local node
   // has outgoing connections
@@ -845,30 +847,31 @@ __global__ void MapIndexToSpikeBufferKernel(int n_hosts, int *host_offset,
 
 __global__ void addOffsetToExternalNodeIdsKernel
 (int64_t n_conn, uint *key_subarray, connection_struct *conn_subarray,
- int i_ext_node_0)
+ int i_image_node_0)
 {
   int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
   if (i_conn>=n_conn) return;
   uint target_port = conn_subarray[i_conn].target_port;
   if (target_port & (1 << (MaxPortNBits - 1))) {
     target_port = target_port ^ (1 << (MaxPortNBits - 1));
     conn_subarray[i_conn].target_port = target_port; 
-    key_subarray[i_conn] += (i_ext_node_0 << MaxPortNBits);
+    key_subarray[i_conn] += (i_image_node_0 << MaxPortNBits);
   }
 }
 
 int NESTGPU::addOffsetToExternalNodeIds()
 {
   int64_t block_size = h_ConnBlockSize;
   int n_blocks = (NConn - 1) / block_size + 1;
-
+  int i_image_node_0 = GetNLocalNodes();
+
   for (int ib=0; ib<n_blocks; ib++) {
     uint64_t n_block_conn = block_size; // number of connections in the block
     if (ib == n_blocks-1) { // last block
       n_block_conn = (NConn - 1) % block_size + 1;
     }
     addOffsetToExternalNodeIdsKernel<<<(n_block_conn+1023)/1024, 1024>>>
-      (n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_ext_node_0_);
+      (n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_image_node_0);
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );
   }
@@ -897,24 +900,26 @@ int NESTGPU::addOffsetToExternalNodeIds()
 }
 
 __global__ void addOffsetToSpikeBufferMapKernel(int i_host, int n_node_map,
-						int i_ext_node_0)
+						int i_image_node_0)
 {
   int i_node_map = threadIdx.x + blockIdx.x * blockDim.x;
   if (i_node_map>=n_node_map) return;
 
   const int i_block = i_node_map / node_map_block_size;
   const int i = i_node_map % node_map_block_size;
-  local_spike_buffer_map[i_host][i_block][i] += i_ext_node_0;
+  local_spike_buffer_map[i_host][i_block][i] += i_image_node_0;
 }
 
 int NESTGPU::addOffsetToSpikeBufferMap()
 {
+  int i_image_node_0 = GetNLocalNodes();
+
   for (int i_host=0; i_host<n_hosts_; i_host++) {
     if (i_host != this_host_) {
       int n_node_map = h_n_remote_source_node_map[i_host];
       if (n_node_map > 0) {
 	addOffsetToSpikeBufferMapKernel<<<(n_node_map+1023)/1024, 1024>>>
-	  (i_host, n_node_map, i_ext_node_0_);
+	  (i_host, n_node_map, i_image_node_0);
 	gpuErrchk( cudaPeekAtLastError() );
 	gpuErrchk( cudaDeviceSynchronize() );
       }

diff --git a/src/remote_connect.h b/src/remote_connect.h
@@ -54,6 +54,8 @@ extern int **d_node_target_host_i_map; // [i_node]
 // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
 extern bool *use_all_source_nodes; // [n_connection_rules]:
 
+extern __constant__ int n_local_nodes; // number of local nodes
+
 // device function that checks if an int value is in a sorted 2d-array 
 // assuming that the entries in the 2d-array are sorted.
 // The 2d-array is divided in noncontiguous blocks of size block_size
@@ -284,7 +286,7 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
   // n_nodes will be the first index for new mapping of remote source nodes
   // to local spike buffers
   //int spike_buffer_map_i0 = GetNNode();
-  int spike_buffer_map_i0 = n_ext_nodes_;
+  int spike_buffer_map_i0 = n_image_nodes_;
   syn_spec.port_ = syn_spec.port_ | (1 << (h_MaxPortNBits-1));
 
   // check if the flag UseAllSourceNodes[conn_rule] is false
@@ -774,12 +776,12 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
   fixConnectionSourceNodeIndexes(KeySubarray, old_n_conn, NConn,
 				 h_ConnBlockSize, d_local_node_index);
 
-  // On target host. Create n_nodes_to_map nodes of type ext_neuron
+  // On target host. Create n_nodes_to_map nodes of type image_node
   //std::cout << "h_n_node_to_map " << h_n_node_to_map <<"\n";
   if (h_n_node_to_map > 0) {
-    //_Create("ext_neuron", h_n_node_to_map);
-    n_ext_nodes_ += h_n_node_to_map;
-    //std::cout << "n_ext_nodes_ " << n_ext_nodes_ <<"\n";
+    //_Create("image_node", h_n_node_to_map);
+    n_image_nodes_ += h_n_node_to_map;
+    //std::cout << "n_image_nodes_ " << n_image_nodes_ <<"\n";
   }
 
   return 0;