From e2b88a81630249374a060406449adb6e62bfaa10 Mon Sep 17 00:00:00 2001
From: Bruno Golosio <golosio@unica.it>
Date: Thu, 19 Oct 2023 00:42:13 +0200
Subject: [PATCH] Removed unnecessary memory allocation and updates for image
 nodes

---
 src/nestgpu.cu        |  86 +++++++++++++++---------
 src/nestgpu.h         |  14 ++--
 src/remote_connect.cu |  21 +++---
 src/remote_connect.h  |  12 ++--
 src/spike_buffer.cu   | 147 +++++++++++++++++++++---------------------
 src/spike_mpi.cu      |   6 +-
 6 files changed, 162 insertions(+), 124 deletions(-)

diff --git a/src/nestgpu.cu b/src/nestgpu.cu
index 0edb43cf1..8cb77a54a 100644
--- a/src/nestgpu.cu
+++ b/src/nestgpu.cu
@@ -185,7 +185,7 @@ NESTGPU::NESTGPU()
   sim_time_ = 1000.0;        //Simulation time in ms
   //n_poiss_nodes_ = 0;
   n_remote_nodes_.assign(1, 0);
-  n_ext_nodes_ = 0;
+  n_image_nodes_ = 0;
   SetTimeResolution(0.1);  // time resolution in ms
   max_spike_num_fact_ = 1.0;
   max_spike_per_host_fact_ = 1.0;
@@ -390,31 +390,45 @@ int NESTGPU::GetMaxSpikeBufferSize()
   return max_spike_buffer_size_;
 }
 
-uint NESTGPU::GetNNode()
+uint NESTGPU::GetNLocalNodes()
 {
   return node_group_map_.size();
 }
 
-int NESTGPU::CreateNodeGroup(int n_node, int n_port)
+int NESTGPU::CheckImageNodes(int n_nodes)
 {
-  int i_node_0 = GetNNode();
-  int max_n_neurons = IntPow(2,h_MaxNodeNBits);
+  int i_node_0 = GetNLocalNodes();
+  int max_n_nodes = IntPow(2,h_MaxNodeNBits);
+  
+  if ((i_node_0 + n_nodes) > max_n_nodes) {
+    throw ngpu_exception(std::string("Local plus Image nodes exceed maximum"
+				     " number of nodes ")
+			 + std::to_string(max_n_nodes));
+  }
+  
+  return i_node_0;
+}
+
+int NESTGPU::CreateNodeGroup(int n_nodes, int n_ports)
+{
+  int i_node_0 = GetNLocalNodes();
+  int max_n_nodes = IntPow(2,h_MaxNodeNBits);
   int max_n_ports = IntPow(2,h_MaxPortNBits);
   
-  if ((i_node_0 + n_node) > max_n_neurons) {
-    throw ngpu_exception(std::string("Maximum number of neurons ")
-			 + std::to_string(max_n_neurons) + " exceeded");
+  if ((i_node_0 + n_nodes) > max_n_nodes) {
+    throw ngpu_exception(std::string("Maximum number of nodes ")
+			 + std::to_string(max_n_nodes) + " exceeded");
   }
-  if (n_port > max_n_ports) {
+  if (n_ports > max_n_ports) {
     throw ngpu_exception(std::string("Maximum number of ports ")
 			 + std::to_string(max_n_ports) + " exceeded");
   }
   int i_group = node_vect_.size() - 1;
-  node_group_map_.insert(node_group_map_.end(), n_node, i_group);
+  node_group_map_.insert(node_group_map_.end(), n_nodes, i_group);
 
   node_vect_[i_group]->random_generator_ = random_generator_;
-  node_vect_[i_group]->Init(i_node_0, n_node, n_port, i_group);
-  node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_node, n_port);
+  node_vect_[i_group]->Init(i_node_0, n_nodes, n_ports, i_group);
+  node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_nodes, n_ports);
   
   return i_node_0;
 }
@@ -439,17 +453,20 @@ int NESTGPU::Calibrate()
   gpuErrchk(cudaMemcpyToSymbol(NESTGPUTimeResolution, &time_resolution_,
 			       sizeof(float)));
 ///////////////////////////////////
-  i_ext_node_0_ = GetNNode();
-  // std::cout << "i_ext_node_0_: " << i_ext_node_0_ << " n_ext_nodes_: "
-  // << n_ext_nodes_ << "\n";
-  if (n_ext_nodes_ > 0) {
-    _Create("ext_neuron", n_ext_nodes_, 1);
+  int n_nodes = GetNLocalNodes();
+  gpuErrchk(cudaMemcpyToSymbol(n_local_nodes, &n_nodes,
+			       sizeof(int)));
+
+  // std::cout << "n_local_nodes: " << n_nodes << " n_image_nodes_: "
+  // << n_image_nodes_ << "\n";
+  if (n_image_nodes_ > 0) {
+    CheckImageNodes(n_image_nodes_);
     addOffsetToExternalNodeIds();
   }
   
   calibrate_flag_ = true;
   
-  organizeConnections(time_resolution_, GetNNode(),
+  organizeConnections(time_resolution_, GetTotalNNodes(),
 		      NConn, h_ConnBlockSize,
 		      KeySubarray, ConnectionSubarray);
   
@@ -459,7 +476,7 @@ int NESTGPU::Calibrate()
 
   int max_delay_num = h_MaxDelayNum;
   
-  unsigned int n_spike_buffers = GetNNode();
+  unsigned int n_spike_buffers = GetTotalNNodes();
   NestedLoop::Init(n_spike_buffers);
 		   
   // temporary
@@ -472,17 +489,17 @@ int NESTGPU::Calibrate()
   NodeGroupArrayInit();
   
   max_spike_num_ = (int)round(max_spike_num_fact_
-			      * GetNNode()
+			      * GetTotalNNodes()
 			      * max_delay_num);
   max_spike_num_ = (max_spike_num_>1) ? max_spike_num_ : 1;
 
   max_spike_per_host_ = (int)round(max_spike_per_host_fact_
-				   * GetNNode()
+				   * GetNLocalNodes()
 				   * max_delay_num);
   max_spike_per_host_ = (max_spike_per_host_>1) ? max_spike_per_host_ : 1;
   
   SpikeInit(max_spike_num_);
-  SpikeBufferInit(GetNNode(), max_spike_buffer_size_);
+  SpikeBufferInit(GetTotalNNodes(), max_spike_buffer_size_);
 
   //#ifndef CHECKRC
   if (n_hosts_ > 1) {
@@ -577,7 +594,7 @@ int NESTGPU::Calibrate()
   //#endif
 
   if (rev_conn_flag_) {
-    RevSpikeInit(GetNNode());
+    RevSpikeInit(GetNLocalNodes());
   }
  
   multimeter_->OpenFiles();
@@ -706,7 +723,7 @@ int NESTGPU::SimulationStep()
   double time_mark;
 
   time_mark = getRealTime();
-  SpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>();
+  SpikeBufferUpdate<<<(GetTotalNNodes()+1023)/1024, 1024>>>();
   gpuErrchk( cudaPeekAtLastError() );
   SpikeBufferUpdate_time_ += (getRealTime() - time_mark);
   time_mark = getRealTime();
@@ -744,7 +761,8 @@ int NESTGPU::SimulationStep()
     if (n_ext_spike != 0) {
       time_mark = getRealTime();
       SendExternalSpike<<<(n_ext_spike+1023)/1024, 1024>>>();
-      gpuErrchk( cudaPeekAtLastError() );
+      //gpuErrchk( cudaPeekAtLastError() );
+      CUDASYNC;
       SendExternalSpike_time_ += (getRealTime() - time_mark);
     }
     //for (int ih=0; ih<connect_mpi_->mpi_np_; ih++) {
@@ -752,15 +770,18 @@ int NESTGPU::SimulationStep()
 
     time_mark = getRealTime();
     SendSpikeToRemote(n_hosts_, max_spike_per_host_);
+    CUDASYNC;
     SendSpikeToRemote_time_ += (getRealTime() - time_mark);
     time_mark = getRealTime();
     RecvSpikeFromRemote(n_hosts_, max_spike_per_host_);
-				      
+    CUDASYNC;			      
     RecvSpikeFromRemote_time_ += (getRealTime() - time_mark);
     CopySpikeFromRemote(n_hosts_, max_spike_per_host_);
+    CUDASYNC;
     MPI_Barrier(MPI_COMM_WORLD);
  
   }
+  CUDASYNC;
   
   int n_spikes;
   time_mark = getRealTime();
@@ -824,7 +845,8 @@ int NESTGPU::SimulationStep()
     //time_mark = getRealTime();
     RevSpikeReset<<<1, 1>>>();
     gpuErrchk( cudaPeekAtLastError() );
-    RevSpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>(GetNNode());
+    RevSpikeBufferUpdate<<<(GetNLocalNodes()+1023)/1024, 1024>>>
+      (GetNLocalNodes());
     gpuErrchk( cudaPeekAtLastError() );
     unsigned int n_rev_spikes;
     gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int),
@@ -1624,11 +1646,12 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
 {
   if (n_source<=0) {
     i_source = 0;
-    n_source = GetNNode();
+    // gets also connections from image neurons
+    n_source = GetTotalNNodes();
   }
   if (n_target<=0) {
     i_target = 0;
-    n_target = GetNNode();
+    n_target = GetNLocalNodes();
   }
   int *i_source_pt = new int[n_source];
   for (int i=0; i<n_source; i++) {
@@ -1654,7 +1677,7 @@ int64_t *NESTGPU::GetConnections(int *i_source_pt, int n_source,
 {
   if (n_target<=0) {
     i_target = 0;
-    n_target = GetNNode();
+    n_target = GetNLocalNodes();
   }
   int *i_target_pt = new int[n_target];
   for (int i=0; i<n_target; i++) {
@@ -1676,7 +1699,8 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
 {
   if (n_source<=0) {
     i_source = 0;
-    n_source = GetNNode();
+    //  gets also connections from image neurons
+    n_source = GetTotalNNodes();
   }
   int *i_source_pt = new int[n_source];
   for (int i=0; i<n_source; i++) {
diff --git a/src/nestgpu.h b/src/nestgpu.h
index 35e845abc..2d3d258f1 100644
--- a/src/nestgpu.h
+++ b/src/nestgpu.h
@@ -148,9 +148,10 @@ class NESTGPU
   long long Nt_; // number of simulation time steps
   //int n_poiss_nodes_;
   std::vector<int> n_remote_nodes_;
-  int n_ext_nodes_;
-  int i_ext_node_0_;
+  //int n_ext_nodes_;
+  //int i_ext_node_0_;
   //int i_remote_node_0_;
+  int n_image_nodes_;
 
   double start_real_time_;
   double build_real_time_;
@@ -177,13 +178,14 @@ class NESTGPU
   int InitConnRandomGenerator();
   int FreeConnRandomGenerator();
 
-  int CreateNodeGroup(int n_neuron, int n_port);
+  int CreateNodeGroup(int n_nodes, int n_ports);
   int CheckUncalibrated(std::string message);
-  double *InitGetSpikeArray(int n_node, int n_port);
+  double *InitGetSpikeArray(int n_nodes, int n_ports);
   int NodeGroupArrayInit();
   int ClearGetSpikeArrays();
   int FreeGetSpikeArrays();
   int FreeNodeGroupMap();
+  int CheckImageNodes(int n_nodes);
 
   NodeSeq _Create(std::string model_name, int n_nodes, int n_ports);
   
@@ -300,7 +302,9 @@ class NESTGPU
   int SetMaxSpikeBufferSize(int max_size);
   int GetMaxSpikeBufferSize();
   
-  uint GetNNode();
+  uint GetNLocalNodes();
+  
+  uint GetTotalNNodes() { return GetNLocalNodes() + n_image_nodes_; }
 
   int HostNum() {
     return n_hosts_;
diff --git a/src/remote_connect.cu b/src/remote_connect.cu
index 15c1f6338..564f010c5 100644
--- a/src/remote_connect.cu
+++ b/src/remote_connect.cu
@@ -92,6 +92,8 @@ int **d_node_map_index; // [i_node]
 // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
 bool *use_all_source_nodes; // [n_connection_rules]:
 
+__constant__ int n_local_nodes; // number of local nodes
+
 // Allocate GPU memory for new remote-source-node-map blocks
 int allocRemoteSourceNodeMapBlocks(std::vector<int*> &i_remote_src_node_map,
 				   std::vector<int*> &i_local_spike_buf_map,
@@ -379,7 +381,7 @@ int  NESTGPU::RemoteConnectionMapCalibrate(int i_host, int n_hosts)
   ////////////////////////////////////////
 #endif
 
-  int n_nodes = GetNNode(); // number of nodes
+  int n_nodes = GetNLocalNodes(); // number of nodes
   // n_target_hosts[i_node] is the number of remote target hosts
   // on which each local node
   // has outgoing connections
@@ -845,7 +847,7 @@ __global__ void MapIndexToSpikeBufferKernel(int n_hosts, int *host_offset,
 
 __global__ void addOffsetToExternalNodeIdsKernel
 (int64_t n_conn, uint *key_subarray, connection_struct *conn_subarray,
- int i_ext_node_0)
+ int i_image_node_0)
 {
   int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
   if (i_conn>=n_conn) return;
@@ -853,7 +855,7 @@ __global__ void addOffsetToExternalNodeIdsKernel
   if (target_port & (1 << (MaxPortNBits - 1))) {
     target_port = target_port ^ (1 << (MaxPortNBits - 1));
     conn_subarray[i_conn].target_port = target_port; 
-    key_subarray[i_conn] += (i_ext_node_0 << MaxPortNBits);
+    key_subarray[i_conn] += (i_image_node_0 << MaxPortNBits);
   }
 }
 
@@ -861,14 +863,15 @@ int NESTGPU::addOffsetToExternalNodeIds()
 {
   int64_t block_size = h_ConnBlockSize;
   int n_blocks = (NConn - 1) / block_size + 1;
-
+  int i_image_node_0 = GetNLocalNodes();
+  
   for (int ib=0; ib<n_blocks; ib++) {
     uint64_t n_block_conn = block_size; // number of connections in the block
     if (ib == n_blocks-1) { // last block
       n_block_conn = (NConn - 1) % block_size + 1;
     }
     addOffsetToExternalNodeIdsKernel<<<(n_block_conn+1023)/1024, 1024>>>
-      (n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_ext_node_0_);
+      (n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_image_node_0);
     gpuErrchk( cudaPeekAtLastError() );
     gpuErrchk( cudaDeviceSynchronize() );
   }
@@ -897,24 +900,26 @@ int NESTGPU::addOffsetToExternalNodeIds()
 }
 
 __global__ void addOffsetToSpikeBufferMapKernel(int i_host, int n_node_map,
-						int i_ext_node_0)
+						int i_image_node_0)
 {
   int i_node_map = threadIdx.x + blockIdx.x * blockDim.x;
   if (i_node_map>=n_node_map) return;
   
   const int i_block = i_node_map / node_map_block_size;
   const int i = i_node_map % node_map_block_size;
-  local_spike_buffer_map[i_host][i_block][i] += i_ext_node_0;
+  local_spike_buffer_map[i_host][i_block][i] += i_image_node_0;
 }
 
 int NESTGPU::addOffsetToSpikeBufferMap()
 {
+  int i_image_node_0 = GetNLocalNodes();
+
   for (int i_host=0; i_host<n_hosts_; i_host++) {
     if (i_host != this_host_) {
       int n_node_map = h_n_remote_source_node_map[i_host];
       if (n_node_map > 0) {
 	addOffsetToSpikeBufferMapKernel<<<(n_node_map+1023)/1024, 1024>>>
-	  (i_host, n_node_map, i_ext_node_0_);
+	  (i_host, n_node_map, i_image_node_0);
 	gpuErrchk( cudaPeekAtLastError() );
 	gpuErrchk( cudaDeviceSynchronize() );
       }
diff --git a/src/remote_connect.h b/src/remote_connect.h
index 10f45d62f..89e83ba72 100644
--- a/src/remote_connect.h
+++ b/src/remote_connect.h
@@ -54,6 +54,8 @@ extern int **d_node_target_host_i_map; // [i_node]
 // - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
 extern bool *use_all_source_nodes; // [n_connection_rules]:
 
+extern __constant__ int n_local_nodes; // number of local nodes
+
 // device function that checks if an int value is in a sorted 2d-array 
 // assuming that the entries in the 2d-array are sorted.
 // The 2d-array is divided in noncontiguous blocks of size block_size
@@ -284,7 +286,7 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
   // n_nodes will be the first index for new mapping of remote source nodes
   // to local spike buffers
   //int spike_buffer_map_i0 = GetNNode();
-  int spike_buffer_map_i0 = n_ext_nodes_;
+  int spike_buffer_map_i0 = n_image_nodes_;
   syn_spec.port_ = syn_spec.port_ | (1 << (h_MaxPortNBits-1));
     
   // check if the flag UseAllSourceNodes[conn_rule] is false
@@ -774,12 +776,12 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
   fixConnectionSourceNodeIndexes(KeySubarray, old_n_conn, NConn,
 				 h_ConnBlockSize, d_local_node_index);
 
-  // On target host. Create n_nodes_to_map nodes of type ext_neuron
+  // On target host. Create n_nodes_to_map nodes of type image_node
   //std::cout << "h_n_node_to_map " << h_n_node_to_map <<"\n";
   if (h_n_node_to_map > 0) {
-    //_Create("ext_neuron", h_n_node_to_map);
-    n_ext_nodes_ += h_n_node_to_map;
-    //std::cout << "n_ext_nodes_ " << n_ext_nodes_ <<"\n";
+    //_Create("image_node", h_n_node_to_map);
+    n_image_nodes_ += h_n_node_to_map;
+    //std::cout << "n_image_nodes_ " << n_image_nodes_ <<"\n";
   }
   
   return 0;
diff --git a/src/spike_buffer.cu b/src/spike_buffer.cu
index f4ecb4f6b..243d06b09 100644
--- a/src/spike_buffer.cu
+++ b/src/spike_buffer.cu
@@ -63,6 +63,8 @@ __device__ long long *LastRevSpikeTimeIdx; //
 unsigned short *d_ConnectionSpikeTime; // [NConnection];
 __device__ unsigned short *ConnectionSpikeTime; //
 
+extern __constant__ int n_local_nodes;
+
 //////////////////////////////////////////////////////////////////////
 
 int *d_SpikeBufferSize; // [NSpikeBuffer];
@@ -101,60 +103,59 @@ __device__ float *SpikeBufferHeight; // [NSpikeBuffer*MaxSpikeBufferNum];
 ////////////////////////////////////////////////////////////
 __device__ void PushSpike(int i_spike_buffer, float height)
 {
-  LastSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx;
-  LastSpikeHeight[i_spike_buffer] = height;
-  int i_group = NodeGroupMap[i_spike_buffer];
-  int den_delay_idx;
-  float *den_delay_arr = NodeGroupArray[i_group].den_delay_arr_;
-  // check if node has dendritic delay
-  if (den_delay_arr != NULL) {
-    int i_neuron = i_spike_buffer - NodeGroupArray[i_group].i_node_0_;
-    int n_param = NodeGroupArray[i_group].n_param_;
-    // dendritic delay index is stored in the parameter array
-    // den_delay_arr points to the dendritic delay if the first
-    // node of the group. The other are separate by steps = n_param
-    den_delay_idx = (int)round(den_delay_arr[i_neuron*n_param]
-			       /NESTGPUTimeResolution);
-    //printf("isb %d\tden_delay_idx: %d\n", i_spike_buffer, den_delay_idx);
-  }
-  else {
-    den_delay_idx = 0;
-  }
-  // printf("Node %d spikes at time %lld , den_delay_idx: %d\n",
-  //	 i_spike_buffer, NESTGPUTimeIdx, den_delay_idx); 
-  if (den_delay_idx==0) {
-    // last time when spike is sent back to dendrites (e.g. for STDP)
-    LastRevSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx;
-  }
-
-  if (ExternalSpikeFlag) {
-    // if active spike should eventually be sent to remote connections
-    //printf("PushExternalSpike i_spike_buffer: %d height: %f\n",
-    //	   i_spike_buffer, height);
-    PushExternalSpike(i_spike_buffer, height);
-  }
-  
-  // if recording  spike counts is activated, increase counter
-  if (NodeGroupArray[i_group].spike_count_ != NULL) {
-    int i_node_0 = NodeGroupArray[i_group].i_node_0_;
-    NodeGroupArray[i_group].spike_count_[i_spike_buffer-i_node_0]++;
-  }
-
-  // check if recording spike times is activated
-  int max_n_rec_spike_times = NodeGroupArray[i_group].max_n_rec_spike_times_;
-  if (max_n_rec_spike_times != 0) {
-    int i_node_rel = i_spike_buffer - NodeGroupArray[i_group].i_node_0_;
-    int n_rec_spike_times =
-      NodeGroupArray[i_group].n_rec_spike_times_[i_node_rel];
-    if (n_rec_spike_times>=max_n_rec_spike_times-1) {
-      printf("Maximum number of recorded spike times exceeded"
-	     " for spike buffer %d\n", i_spike_buffer);
+  int den_delay_idx = 0;
+  if (i_spike_buffer<n_local_nodes) {
+    LastSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx;
+    LastSpikeHeight[i_spike_buffer] = height;
+    int i_group = NodeGroupMap[i_spike_buffer];
+    float *den_delay_arr = NodeGroupArray[i_group].den_delay_arr_;
+    // check if node has dendritic delay
+    if (den_delay_arr != NULL) {
+      int i_neuron = i_spike_buffer - NodeGroupArray[i_group].i_node_0_;
+      int n_param = NodeGroupArray[i_group].n_param_;
+      // dendritic delay index is stored in the parameter array
+      // den_delay_arr points to the dendritic delay if the first
+      // node of the group. The other are separate by steps = n_param
+      den_delay_idx = (int)round(den_delay_arr[i_neuron*n_param]
+				 /NESTGPUTimeResolution);
+      //printf("isb %d\tden_delay_idx: %d\n", i_spike_buffer, den_delay_idx);
     }
-    else { // record spike time
-      NodeGroupArray[i_group].rec_spike_times_
-	[i_node_rel*max_n_rec_spike_times + n_rec_spike_times]
-	= NESTGPUTime;
-      NodeGroupArray[i_group].n_rec_spike_times_[i_node_rel]++;
+    // printf("Node %d spikes at time %lld , den_delay_idx: %d\n",
+    //	 i_spike_buffer, NESTGPUTimeIdx, den_delay_idx); 
+    if (den_delay_idx==0) {
+      // last time when spike is sent back to dendrites (e.g. for STDP)
+      LastRevSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx;
+    }
+    
+    if (ExternalSpikeFlag) {
+      // if active spike should eventually be sent to remote connections
+      //printf("PushExternalSpike i_spike_buffer: %d height: %f\n",
+      //	   i_spike_buffer, height);
+      PushExternalSpike(i_spike_buffer, height);
+    }
+    
+    // if recording  spike counts is activated, increase counter
+    if (NodeGroupArray[i_group].spike_count_ != NULL) {
+      int i_node_0 = NodeGroupArray[i_group].i_node_0_;
+      NodeGroupArray[i_group].spike_count_[i_spike_buffer-i_node_0]++;
+    }
+    
+    // check if recording spike times is activated
+    int max_n_rec_spike_times = NodeGroupArray[i_group].max_n_rec_spike_times_;
+    if (max_n_rec_spike_times != 0) {
+      int i_node_rel = i_spike_buffer - NodeGroupArray[i_group].i_node_0_;
+      int n_rec_spike_times =
+	NodeGroupArray[i_group].n_rec_spike_times_[i_node_rel];
+      if (n_rec_spike_times>=max_n_rec_spike_times-1) {
+	printf("Maximum number of recorded spike times exceeded"
+	       " for spike buffer %d\n", i_spike_buffer);
+      }
+      else { // record spike time
+	NodeGroupArray[i_group].rec_spike_times_
+	  [i_node_rel*max_n_rec_spike_times + n_rec_spike_times]
+	  = NESTGPUTime;
+	NodeGroupArray[i_group].n_rec_spike_times_[i_node_rel]++;
+      }
     }
   }
 
@@ -192,23 +193,22 @@ __global__ void SpikeBufferUpdate()
 {
   int i_spike_buffer = threadIdx.x + blockIdx.x * blockDim.x;
   if (i_spike_buffer>=NSpikeBuffer) return;
-  
-  int i_group=NodeGroupMap[i_spike_buffer];
-  int den_delay_idx;
-  float *den_delay_arr = NodeGroupArray[i_group].den_delay_arr_;
-  // check if node has dendritic delay
-  if (den_delay_arr != NULL) {
-    int i_neuron = i_spike_buffer - NodeGroupArray[i_group].i_node_0_;
-    int n_param = NodeGroupArray[i_group].n_param_;
-    // dendritic delay index is stored in the parameter array
-    // den_delay_arr points to the dendritic delay if the first
-    // node of the group. The other are separate by steps = n_param
-    den_delay_idx = (int)round(den_delay_arr[i_neuron*n_param]
-			       /NESTGPUTimeResolution);
-    //printf("isb update %d\tden_delay_idx: %d\n", i_spike_buffer, den_delay_idx);
-  }
-  else {
-    den_delay_idx = 0;
+
+  int den_delay_idx = 0;
+  if (i_spike_buffer<n_local_nodes) {
+    int i_group=NodeGroupMap[i_spike_buffer];
+    float *den_delay_arr = NodeGroupArray[i_group].den_delay_arr_;
+    // check if node has dendritic delay
+    if (den_delay_arr != NULL) {
+      int i_neuron = i_spike_buffer - NodeGroupArray[i_group].i_node_0_;
+      int n_param = NodeGroupArray[i_group].n_param_;
+      // dendritic delay index is stored in the parameter array
+      // den_delay_arr points to the dendritic delay if the first
+      // node of the group. The other are separate by steps = n_param
+      den_delay_idx = (int)round(den_delay_arr[i_neuron*n_param]
+				 /NESTGPUTimeResolution);
+      //printf("isb update %d\tden_delay_idx: %d\n", i_spike_buffer, den_delay_idx);
+    }
   }
   // flag for sending spikes back through dendrites (e.g. for STDP)
   bool rev_spike = false;
@@ -230,13 +230,12 @@ __global__ void SpikeBufferUpdate()
     //int ig = ConnGroupIdx0[i_spike_buffer] + i_conn;
     // if spike time matches connection group delay deliver it
     // to global spike array
-
-
+    
     uint conn_group_i0 = ConnGroupIdx0[i_spike_buffer];
     uint conn_group_num = ConnGroupIdx0[i_spike_buffer+1] - conn_group_i0;
     int ig = conn_group_i0 + i_conn;
     
-
+    
     if (i_conn<conn_group_num) {
 #ifdef OPTIMIZE_FOR_MEMORY
       int64_t conn_group_i_conn0 = ConnGroupIConn0[ig];
@@ -275,7 +274,7 @@ __global__ void SpikeBufferUpdate()
       // increase time index
     }
   }
-
+    
   if (rev_spike) {
     LastRevSpikeTimeIdx[i_spike_buffer] = NESTGPUTimeIdx+1;
   }
diff --git a/src/spike_mpi.cu b/src/spike_mpi.cu
index e9881e484..067d98906 100644
--- a/src/spike_mpi.cu
+++ b/src/spike_mpi.cu
@@ -426,15 +426,18 @@ int NESTGPU::CopySpikeFromRemote(int n_hosts, int max_spike_per_host)
     gpuErrchk(cudaMemcpyAsync(d_ExternalSourceSpikeCumul,
 			      h_ExternalSourceSpikeCumul,
 			      (n_hosts+1)*sizeof(int), cudaMemcpyHostToDevice));
+    CUDASYNC;
     // copy to GPU memory packed spikes from remote MPI proc
     gpuErrchk(cudaMemcpyAsync(d_ExternalSourceSpikeNodeId,
 			      h_ExternalSourceSpikeNodeId,
 			      n_spike_tot*sizeof(int), cudaMemcpyHostToDevice));
+    CUDASYNC;
     RecvSpikeFromRemote_CUDAcp_time_ += (getRealTime() - time_mark);
     // convert node map indexes to spike buffer indexes
     MapIndexToSpikeBufferKernel<<<n_hosts, 1024>>>(n_hosts,
 						   d_ExternalSourceSpikeCumul,
 						   d_ExternalSourceSpikeNodeId);
+    CUDASYNC;
     // convert node group indexes to spike buffer indexes
     // by adding the index of the first node of the node group  
     //AddOffset<<<(n_spike_tot+1023)/1024, 1024>>>
@@ -444,7 +447,8 @@ int NESTGPU::CopySpikeFromRemote(int n_hosts, int max_spike_per_host)
     // push remote spikes in local spike buffers
     PushSpikeFromRemote<<<(n_spike_tot+1023)/1024, 1024>>>
       (n_spike_tot, d_ExternalSourceSpikeNodeId);
-    gpuErrchk( cudaPeekAtLastError() );
+    CUDASYNC;
+    //gpuErrchk( cudaPeekAtLastError() );
     //cudaDeviceSynchronize();
   }