From 421eee0abdd7bbdc777cfd1d5d846f5a4f6590d9 Mon Sep 17 00:00:00 2001
From: Craig Warner <cwarner@astro.ufl.edu>
Date: Wed, 29 Mar 2023 18:11:49 -0400
Subject: [PATCH 1/5] Refactored code to minimize GPU to CPU copying, resulting
 in big speed-ups. The refactoring results in having two copies of several
 arrays (flux, wavelength, etc) - one on the GPU and one on the CPU.  It also
 results in several scalars such as min/max wavelength being stored on the CPU
 to avoid very costly single value being copied to/from GPU.  This results in
 a speed up on "Finding best redshift" from 12.5s to 9.3s on 64 CPU / 4 GPU
 and from 10.0s to 7.2s on 4 CPU / 4 GPU, bringing it closer in line with the
 6.7s for 64 CPU.

---
 py/redrock/fitz.py      |  59 ++++++++++++++++-----
 py/redrock/rebin.py     | 110 ++++++++++++++++++++++++++++++----------
 py/redrock/targets.py   |  45 ++++++++++++++++
 py/redrock/templates.py |   5 ++
 py/redrock/zfind.py     |  11 ++--
 py/redrock/zscan.py     |  20 +++++---
 6 files changed, 198 insertions(+), 52 deletions(-)

diff --git a/py/redrock/fitz.py b/py/redrock/fitz.py
index 03b33e03..2ae1ffa1 100644
--- a/py/redrock/fitz.py
+++ b/py/redrock/fitz.py
@@ -107,7 +107,7 @@ def minfit(x, y):
     return (x0, xerr, y0, zwarn)
 
 
-def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu=False):
+def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=False):
     """Refines redshift measurement around up to nminima minima.
 
     TODO:
@@ -116,8 +116,8 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu
     Args:
         zchi2 (array): chi^2 values for each redshift.
         redshifts (array): the redshift values.
-        spectra (list): list of Spectrum objects at different wavelengths
-            grids.
+        target (Target): the target for this fit which includes a list
+            of Spectrum objects at different wavelength grids.
         template (Template): the template for this fit.
         nminima (int): the number of minima to consider.
         use_gpu (bool): use GPU or not
@@ -132,6 +132,7 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu
         import cupy as cp
 
     nbasis = template.nbasis
+    spectra = target.spectra
 
     # Build dictionary of wavelength grids
     dwave = { s.wavehash:s.wave for s in spectra }
@@ -146,10 +147,13 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu
 
     (weights, flux, wflux) = spectral_data(spectra)
     if (use_gpu):
-        #Copy arrays to GPU
-        weights = cp.asarray(weights)
-        flux = cp.asarray(flux)
-        wflux = cp.asarray(wflux)
+        #Get CuPy arrays of weights, flux, wflux
+        #These are created on the first call of gpu_spectral_data() for a
+        #target and stored.  They are retrieved on subsequent calls.
+        (gpuweights, gpuflux, gpuwflux) = target.gpu_spectral_data()
+        # Build dictionaries of wavelength bin edges, min/max, and centers
+        gpuedges = { s.wavehash:(s.gpuedges, s.minedge, s.maxedge) for s in spectra }
+        gpudwave = { s.wavehash:s.gpuwave for s in spectra }
 
     results = list()
     #Define nz here instead of hard-coding length 15 and then defining nz as
@@ -171,12 +175,25 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu
         ilo = max(0, imin-1)
         ihi = min(imin+1, len(zchi2)-1)
         zz = np.linspace(redshifts[ilo], redshifts[ihi], nz)
+        if (use_gpu):
+            #Create a redshift grid on the GPU as well
+            gpuzz = cp.asarray(zz)
 
         zzchi2 = np.zeros(nz, dtype=np.float64)
         zzcoeff = np.zeros((nz, nbasis), dtype=np.float64)
 
+        #Calculate xmin and xmax from template and zz array on CPU and
+        #pass as scalars
+        xmin = template.minwave*(1+zz.max())
+        xmax = template.maxwave*(1+zz.min())
+
         #Use batch mode for rebin_template, transmission_Lyman, and calc_zchi2
-        binned = rebin_template(template, zz, dwave, use_gpu=use_gpu)
+        if (use_gpu):
+            #Use gpuedges already calculated and on GPU
+            binned = rebin_template(template, gpuzz, dedges=gpuedges, use_gpu=use_gpu, xmin=xmin, xmax=xmax)
+        else:
+            #Use numpy CPU arrays
+            binned = rebin_template(template, zz, dwave, use_gpu=use_gpu, xmin=xmin, xmax=xmax)
         # Correct spectra for Lyman-series
         for k in list(dwave.keys()):
             #New algorithm accepts all z as an array and returns T, a 2-d
@@ -188,23 +205,41 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu
                 continue
             #Vectorize multiplication
             binned[k] *= T[:,:,None]
-        (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, nz, nbasis, use_gpu=use_gpu)
+        if (use_gpu):
+            #Use gpu arrays for weights, flux, wflux
+            (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, binned, gpuweights, gpuflux, gpuwflux, nz, nbasis, use_gpu=use_gpu)
+        else:
+            #Use numpy CPU arrays for weights, flux, wflux 
+            (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, nz, nbasis, use_gpu=use_gpu)
 
         #- fit parabola to 3 points around minimum
         i = min(max(np.argmin(zzchi2),1), len(zz)-2)
         zmin, sigma, chi2min, zwarn = minfit(zz[i-1:i+2], zzchi2[i-1:i+2])
 
         try:
-            binned = rebin_template(template, np.array([zmin]), dwave, use_gpu=use_gpu)
+            #Calculate xmin and xmax from template and pass as scalars 
+            xmin = template.minwave*(1+zmin)
+            ximax = template.maxwave*(1+zmin)
+            if (use_gpu):
+                #Use gpuedges already calculated and on GPU
+                binned = rebin_template(template, cp.array([zmin]), dedges=gpuedges, use_gpu=use_gpu, xmin=xmin, xmax=xmax)
+            else:
+                binned = rebin_template(template, np.array([zmin]), dwave, use_gpu=use_gpu, xmin=xmin, xmax=xmax)
             for k in list(dwave.keys()):
-                T = transmission_Lyman(np.array([zmin]),dwave[k], use_gpu=use_gpu)
+                if (use_gpu):
+                    #Copy binned[k] back to CPU to perform next steps on CPU
+                    #because faster with only 1 redshift
+                    binned[k] = binned[k].get()
+                #Use CPU always
+                T = transmission_Lyman(np.array([zmin]),dwave[k], use_gpu=False)
                 if (T is None):
                     #Return value of None means that wavelenght regime
                     #does not overlap Lyman transmission - continue here
                     continue
                 #Vectorize multiplication
                 binned[k] *= T[:,:,None]
-            (chi2, coeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, 1, nbasis, use_gpu=use_gpu)
+            #Use CPU always with one redshift
+            (chi2, coeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, 1, nbasis, use_gpu=False)
             coeff = coeff[0,:]
         except ValueError as err:
             if zmin<redshifts[0] or redshifts[-1]<zmin:
diff --git a/py/redrock/rebin.py b/py/redrock/rebin.py
index e13db182..e2aadb5a 100644
--- a/py/redrock/rebin.py
+++ b/py/redrock/rebin.py
@@ -209,7 +209,7 @@ def _trapz_rebin_batch(x, y, edges, myz, results, redshifted_x):
     return
 
 
-def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None, xmax=None):
+def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None, xmax=None, edge_min=None, edge_max=None):
     """Rebin y(x) flux density using trapezoidal integration between bin edges
     Optionally use GPU helper method to rebin in batch, see trapz_rebin_batch_gpu
     Note - current return array shape is (nz, nbins, nbasis).  Changing to
@@ -258,14 +258,22 @@ def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None,
         myz (array): (optional) redshift array to rebin in batch,
             applying redshifts on-the-fly to x
         use_gpu (boolean): whether or not to use GPU algorithm 
-        xmin (float): (optional) minimum x-value - x[0] will be used if
-            omitted - this is useful to avoid GPU to CPU copying in the case
-            where x is a CuPy array and providing the scalar value results
-            in a large speed gain
-        xmax (float) (optional) maximum x-value - x[-1] will be used if
-            omitted - this is useful to avoid GPU to CPU copying in the case
-            where x is a CuPy array and providing the scalar value results
-            in a large speed gain
+        xmin (float): (optional) minimum x-value - x[0]*(1+myz.max()) will be
+            used if omitted - this is useful to avoid GPU to CPU copying in
+            the case where x is a CuPy array and providing the scalar value
+            results in a large speed gain
+        xmax (float) (optional) maximum x-value - x[-1]*(1+myz.min()) will be
+            used if omitted - this is useful to avoid GPU to CPU copying in
+            the case where x is a CuPy array and providing the scalar value
+            results in a large speed gain
+        edge_min (float): (optional) minimum new bin edge - edges[0] will be
+            used if omitted - this is useful to avoid GPU to CPU copying in
+            the case where edges is precomputed on the GPU as a CuPy array
+            and providing the scalar value results in a large speed gain
+        edge_max (float): (optional) maximum new bin edge - edges[-1] will be
+            used if omitted - this is useful to avoid GPU to CPU copying in
+            the case where edges is precomputed on the GPU as a CuPy array
+            and providing the scalar value results in a large speed gain
 
     Returns:
         array: integrated results with len(results) = len(edges)-1
@@ -283,15 +291,17 @@ def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None,
         ValueError: if edges are outside the range of x or if len(x) != len(y)
 
     """
+    import cupy as cp
     if edges is None:
         edges = centers2edges(xnew)
-    else:
+    elif type(edges) != cp.ndarray:
         edges = np.asarray(edges)
     nbins = len(edges)-1
-    if xmin is None:
-        xmin = x[0]
-    if xmax is None:
-        xmax = x[-1]
+    #Set edge_min and edge_max if not passed as input
+    if edge_min is None:
+        edge_min = edges[0]
+    if edge_max is None:
+        edge_max = edges[-1]
 
     #Use these booleans to determine output array shape based on input
     scalar_z = False
@@ -303,17 +313,25 @@ def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None,
     elif (np.isscalar(myz)):
         myz = np.array([myz], dtype=np.float64)
         scalar_z = True
-    myz = np.asarray(myz, dtype=np.float64)
+    elif (type(myz) != cp.ndarray):
+        myz = np.asarray(myz, dtype=np.float64)
     nz = len(myz)
     if (nz == 0):
         #Empty myz array
         return np.zeros((0,nbins, 1), dtype=np.float64)
 
+    #Set xmin and xmax based on x and myz if not passed as input
     #Must multiply x by 1+z for comparison, only need to look at max/min cases
-    #if (edges[0] < x[0]*(1+myz.max()) or edges[-1] > x[-1]*(1+myz.min())):
-    if (edges[0] < xmin*(1+myz.max()) or edges[-1] > xmax*(1+myz.min())):
-        raise ValueError('edges must be within input x range')
+    if xmin is None:
+        xmin = x[0]*(1+myz.max())
+    if xmax is None:
+        xmax = x[-1]*(1+myz.min())
 
+    #Use edge_min, xmin, edge_max, and xmax for comparison - these are
+    #passed as scalars when x and edges are on GPU which results in
+    #large speed gain by minimizing GPU to CPU copying of single values
+    if (edge_min < xmin or edge_max > xmax):
+        raise ValueError('edges must be within input x range')
 
     if (not use_gpu and scalar_z and len(y.shape) == 1):
         #Special case, call _trapz_rebin_1d directly
@@ -400,7 +418,7 @@ def _trapz_rebin_batch_gpu(x, y, edges, myz, result_shape):
     #Divide edges output wavelength array by (1+myz) to get 2d array
     #of input wavelengths at each boundary in edges and
     #use serachsorted to find index for each boundary
-    e2d = cp.asarray(edges/(1+myz[:,None]))
+    e2d = edges/(1+myz[:,None])
     idx = cp.searchsorted(x, e2d, side='right')#.astype(cp.int32)
 
     # Load CUDA kernel
@@ -408,11 +426,11 @@ def _trapz_rebin_batch_gpu(x, y, edges, myz, result_shape):
     batch_trapz_rebin_kernel = cp_module.get_function('batch_trapz_rebin')
 
     #Array sizes
-    nbin = cp.int32(len(edges)-1)
-    nz = cp.int32(len(myz))
-    nbasis = cp.int32(y.shape[0])
+    nbin = len(edges)-1
+    nz = len(myz)
+    nbasis = y.shape[0]
     n = nbin*nbasis*nz
-    nt = cp.int32(len(x))
+    nt = len(x)
     if (result_shape is None):
       result_shape = (nz, nbasis, nbin)
 
@@ -425,18 +443,33 @@ def _trapz_rebin_batch_gpu(x, y, edges, myz, result_shape):
 
     return result
 
-def rebin_template(template, myz, dwave, use_gpu=False):
+def rebin_template(template, myz, dwave=None, dedges=None, use_gpu=False, xmin=None, xmax=None):
     """Rebin a template to a set of wavelengths.
 
     Given a template and a single redshift - or an array of redshifts,
     rebin the template to a set of wavelength arrays.
+    The wavelengths can be passed as centers (dwave) or edges (dedges)
+    of the wavelength bins.
 
     Args:
         template (Template): the template object
         myz (float or array of float): the redshift(s)
         dwave (dict): the keys are the "wavehash" and the values
             are a 1D array containing the wavelength grid.
+        dedges (dict): the keys are the "wavehash" and the values
+            can be either a 1D array containing the bin edges
+            of the wavelength grid (computed from centers2edges)
+            or a 3-element tuple with this 1D array as a CuPy array
+            and the min and max values as scalars in CPU memory.
         use_gpu (bool): whether or not to use the GPU algorithm
+        xmin (float): (optional) minimum x-value - x[0]*(1+myz.max()) will be
+            used if omitted - this is useful to avoid GPU to CPU copying in
+            the case where x is a CuPy array and providing the scalar value
+            results in a large speed gain
+        xmax (float) (optional) maximum x-value - x[-1]*(1+myz.min()) will be
+            used if omitted - this is useful to avoid GPU to CPU copying in
+            the case where x is a CuPy array and providing the scalar value
+            results in a large speed gain
 
     Returns:
         dict:  The rebinned template for every basis function and wavelength
@@ -446,9 +479,32 @@ def rebin_template(template, myz, dwave, use_gpu=False):
     """
     nbasis = template.flux.shape[0]  #- number of template basis vectors
     result = dict()
+    #calculate xmin and xmax if not given
+    if (xmin is None):
+        xmin = template.minwave*(1+myz.max())
+    if (xmax is None):
+        xmax = template.maxwave*(1+myz.min())
     #rebin all z and all bases in batch in parallel
     #and return dict of 3-d numpy / cupy arrays 
-    for hs, wave in dwave.items():
-        result[hs] = trapz_rebin(template.wave, template.flux, xnew=wave, myz=myz, use_gpu=use_gpu, xmin=template.minwave, xmax=template.maxwave)
-
+    if dwave is not None:
+        #Handle the case where dwave is passed
+        for hs, wave in dwave.items():
+            if (use_gpu and template.gpuwave is not None):
+                #Use gpuwave and gpuflux arrays already on GPU
+                result[hs] = trapz_rebin(template.gpuwave, template.gpuflux, xnew=wave, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax)
+            else:
+                result[hs] = trapz_rebin(template.wave, template.flux, xnew=wave, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax)
+    elif dedges is not None:
+        #Handle the case where dedges is passed because edges is already computed
+        minedge = None
+        maxedge = None
+        for hs, edges in dedges.items():
+            #Check if edges is a 1-d array or a tuple also containing scalar min/max values
+            if type(edges) is tuple:
+                (edges, minedge, maxedge) = edges
+            if (use_gpu and template.gpuwave is not None):
+                #Use gpuwave and gpuflux arrays already on GPU
+                result[hs] = trapz_rebin(template.gpuwave, template.gpuflux, edges=edges, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax, edge_min=minedge, edge_max=maxedge)
+            else:
+                result[hs] = trapz_rebin(template.wave, template.flux, edges=edges, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax, edge_min=minedge, edge_max=maxedge)
     return result
diff --git a/py/redrock/targets.py b/py/redrock/targets.py
index 75f429db..08e37493 100644
--- a/py/redrock/targets.py
+++ b/py/redrock/targets.py
@@ -44,6 +44,35 @@ def __init__(self, wave, flux, ivar, R, Rcsr=None):
         else:
             self.wavehash = hash((len(wave), wave[0], wave[1], wave[-2], wave[-1]))
 
+        #It is much more efficient to calculate edges once if rebinning
+        #and copy to GPU and store here rather than doing this every time
+        self._gpuwave = None
+        self._gpuedges = None
+        #Min and max edges are stored as CPU scalars to avoid excessive
+        #GPU to CPU copying
+        self.minedge = None
+        self.maxedge = None
+
+    @property
+    def gpuwave(self):
+        if (self._gpuwave is None):
+            #Copy to GPU once
+            import cupy as cp
+            self._gpuwave = cp.asarray(self.wave)
+        return self._gpuwave
+
+    @property
+    def gpuedges(self):
+        if (self._gpuedges is None):
+            #Compute edges, minedge, and maxedge, and copy to GPU once
+            import cupy as cp
+            from .rebin import centers2edges
+            edges = centers2edges(self.wave)
+            self.minedge = edges[0]
+            self.maxedge = edges[-1]
+            self._gpuedges = cp.asarray(edges)
+        return self._gpuedges
+
     @property
     def Rcsr(self):
         if self._Rcsr is None:
@@ -131,6 +160,22 @@ def __init__(self, targetid, spectra, coadd=False, cosmics_nsig=0., meta=None):
         if coadd:
             self.compute_coadd(cache_Rcsr=False, cosmics_nsig=cosmics_nsig)
 
+        #It is much more efficient to copy weights, flux, wflux to GPU once
+        #and store here rather than doing this every time
+        self.gpuweights = None
+        self.gpuflux = None
+        self.gpuwflux = None
+
+    def gpu_spectral_data(self):
+        if self.gpuweights is None:
+            #Assemble arrays and copy to GPU once
+            import cupy as cp
+            self.gpuweights = cp.concatenate([cp.asarray(s.ivar) for s in self.spectra])
+            self.gpuflux = cp.concatenate([cp.asarray(s.flux) for s in self.spectra])
+            self.gpuwflux = self.gpuweights * self.gpuflux
+        return (self.gpuweights, self.gpuflux, self.gpuwflux)
+
+
     ### @profile
     def compute_coadd(self, cache_Rcsr=False, cosmics_nsig=0.):
         """Compute the coadd from the current spectra list.
diff --git a/py/redrock/templates.py b/py/redrock/templates.py
index ae91bf2d..2e6d16f9 100644
--- a/py/redrock/templates.py
+++ b/py/redrock/templates.py
@@ -110,8 +110,13 @@ def __init__(self, filename=None, spectype=None, redshifts=None,
 
         self._nbasis = self.flux.shape[0]
         self._nwave = self.flux.shape[1]
+        #It is much more efficient to copy wave and flux to GPU once
+        #and store here rather than doing this every time, and keep track
+        #of min and max wave as scalars on CPU
         self.minwave = self.wave[0]
         self.maxwave = self.wave[-1]
+        self.gpuwave = None
+        self.gpuflux = None
 
 
     @property
diff --git a/py/redrock/zfind.py b/py/redrock/zfind.py
index a9268e3c..6a949591 100644
--- a/py/redrock/zfind.py
+++ b/py/redrock/zfind.py
@@ -87,7 +87,7 @@ def _mp_fitz(chi2, target_data, t, nminima, qout, archetype, use_gpu):
             tg.sharedmem_unpack()
         results = list()
         for i, tg in enumerate(target_data):
-            zfit = fitz(chi2[i], t.template.redshifts, tg.spectra,
+            zfit = fitz(chi2[i], t.template.redshifts, tg,
                 t.template, nminima=nminima, archetype=archetype, use_gpu=use_gpu)
             results.append( (tg.id, zfit) )
         qout.put(results)
@@ -278,11 +278,12 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non
     if (use_gpu):
         #If using GPU, copy template flux and wave arrays to cupy objects
         #on the GPU once here so it is not copied every iteration of
-        #rebinning below
+        #rebinning below.  Use gpuwave and gpuflux so as to not overwrite
+        #numpy arrays.
         import cupy as cp
         for t in templates:
-            t.template.wave = cp.asarray(t.template.wave)
-            t.template.flux = cp.asarray(t.template.flux)
+            t.template.gpuwave = cp.asarray(t.template.wave)
+            t.template.gpuflux = cp.asarray(t.template.flux)
 
     # Note: rebalancing no longer needs to be done now that following steps
     # have been GPU-ized - CW 12/22
@@ -330,7 +331,7 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non
             for tg in local_targets:
                 zfit = fitz(results[tg.id][ft]['zchi2'] \
                     + results[tg.id][ft]['penalty'],
-                    t.template.redshifts, tg.spectra,
+                    t.template.redshifts, tg,
                     t.template, nminima=nminima,archetype=archetype, use_gpu=use_gpu)
                 results[tg.id][ft]['zfit'] = zfit
         else:
diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
index 6cdc1812..24337cc9 100644
--- a/py/redrock/zscan.py
+++ b/py/redrock/zscan.py
@@ -416,10 +416,10 @@ def _calc_batch_dot_product_3d2d_gpu(Tbs, zc):
     batch_dot_product_3d2d_kernel = cp_module.get_function('batch_dot_product_3d2d')
 
     #Array dims needed by CUDA:
-    nz = cp.int32(zc.shape[0])
+    nz = zc.shape[0]
     nrows = Tbs[0].shape[0]
     n = nrows * nz
-    nbasis = cp.int32(zc.shape[1])
+    nbasis = zc.shape[1]
 
     #Allocate CUPY array and calc blocks to be used
     blocks = (n+block_size-1)//block_size
@@ -612,6 +612,7 @@ def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_mat
         zcoeff = np.zeros((nz, nbasis))
         #On the CPU, the templates are looped over and all operations
         #are performed on one template at a time.
+
         for i in range(nz):
             #1) dot_product_sparse_one will compute dot products of all
             #spectra with ONE template and return a 2D array of size
@@ -700,18 +701,20 @@ def calc_zchi2(target_ids, target_data, dtemplate, progress=None, use_gpu=False)
     tdata = dtemplate.local.data
 
     for j in range(ntargets):
-        (weights, flux, wflux) = spectral_data(target_data[j].spectra)
+        if (use_gpu):
+            #Use new helper method gpu_spectral_data() that will copy to GPU
+            #on first access and recall data in subsequent calls instead of
+            #copying every time
+            (weights, flux, wflux) = target_data[j].gpu_spectral_data()
+        else:
+            #Use spectral_data() to get numpy arrays
+            (weights, flux, wflux) = spectral_data(target_data[j].spectra)
         if np.sum(weights) == 0:
             zchi2[j,:] = 9e99
             #Update progress for multiprocessing!!
             if dtemplate.comm is None and progress is not None:
                 progress.put(1)
             continue
-        if (use_gpu):
-            #Copy data to CUPY arrays
-            weights = cp.asarray(weights)
-            flux = cp.asarray(flux)
-            wflux = cp.asarray(wflux)
 
         # Solving for template fit coefficients for all redshifts.
         # We use the pre-interpolated templates for each
@@ -719,6 +722,7 @@ def calc_zchi2(target_ids, target_data, dtemplate, progress=None, use_gpu=False)
 
         # Use helper method calc_zchi2_batch to calculate zchi2 and zcoeff
         # for all templates for all three spectra for this target
+
         (zchi2[j,:], zcoeff[j,:,:]) = calc_zchi2_batch(target_data[j].spectra, tdata, weights, flux, wflux, nz, nbasis, dtemplate.template.solve_matrices_algorithm, use_gpu)
 
         #- Penalize chi2 for negative [OII] flux; ad-hoc

From 8f966b9d866457b6eb026c5be4c8235920ea8562 Mon Sep 17 00:00:00 2001
From: Craig Warner <cwarner@astro.ufl.edu>
Date: Wed, 29 Mar 2023 20:19:22 -0400
Subject: [PATCH 2/5] Update to fix so that cupy is only imported when use_gpu
 = True in rebin.py

---
 py/redrock/rebin.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/py/redrock/rebin.py b/py/redrock/rebin.py
index e2aadb5a..96734a27 100644
--- a/py/redrock/rebin.py
+++ b/py/redrock/rebin.py
@@ -291,10 +291,14 @@ def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None,
         ValueError: if edges are outside the range of x or if len(x) != len(y)
 
     """
-    import cupy as cp
+    if use_gpu:
+        import cupy as cp
     if edges is None:
         edges = centers2edges(xnew)
-    elif type(edges) != cp.ndarray:
+    elif use_gpu:
+        if type(edges) != cp.ndarray:
+            edges = np.asarray(edges)
+    else:
         edges = np.asarray(edges)
     nbins = len(edges)-1
     #Set edge_min and edge_max if not passed as input
@@ -313,7 +317,10 @@ def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None,
     elif (np.isscalar(myz)):
         myz = np.array([myz], dtype=np.float64)
         scalar_z = True
-    elif (type(myz) != cp.ndarray):
+    elif (use_gpu):
+        if (type(myz) != cp.ndarray):
+            myz = np.asarray(myz, dtype=np.float64)
+    else:
         myz = np.asarray(myz, dtype=np.float64)
     nz = len(myz)
     if (nz == 0):

From df5215b66c53e737fe187f862e119c2341caa8d0 Mon Sep 17 00:00:00 2001
From: Craig D Warner <cwarner@astro.ufl.edu>
Date: Mon, 24 Apr 2023 10:34:03 -0700
Subject: [PATCH 3/5] Added fullprecision argument to
 calc_batch_dot_product_3d3d_gpu (and to calc_zchi2_batch to pass-through)
 which sets nparallel = 4 to keep the number of parallel threads working on
 each output element of a dot product small and avoid floating point rounding
 errors due to random order of summation (a+b+c != c+a+b).

Coarse zscan still uses faster version but fine redshift scan uses
default arg of fullprecision=True to guarantee reproducibility of
results.  This fixes issue #242.
---
 py/redrock/zscan.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
index 24337cc9..417cd354 100644
--- a/py/redrock/zscan.py
+++ b/py/redrock/zscan.py
@@ -437,7 +437,7 @@ def _calc_batch_dot_product_3d2d_gpu(Tbs, zc):
 ###    seems to have a bug on Volta architecure GPUs.
 ###    This is the equivalent of M = a @ b
 ###    (Or if transpose_a is true, M = a.swapaxes(-2, -1) @ b)
-def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False):
+def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False, fullprecision=True):
     """GPU implementation.
     Calculate a batch dot product of the 3d array a with the 3d
     array b.  The 3-d array shape is A x B x C and the 2-d array
@@ -457,6 +457,14 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False):
             (nz x nrows x nbasis)
         transpose_a (bool): Whether or not to transpose the a array
             before performing the dot product
+        fullprecision (bool): Whether or not to ensure reproducibly identical
+            results.  The context is that it can be faster to use many
+            parallel threads to compute a single output array element of the
+            dot product result, but due to floating point rounding issues,
+            the random order of the summation can change the result by
+            order 1e-16 (e.g. a+b+c != c+a+b).  When set to true, the
+            number of parallel threads is decreased to ensure reproducibility
+            at a trade-off of speed.
 
     Returns:
         M (array): the output of the dot product, (nz x ncols x ncols)
@@ -466,6 +474,7 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False):
 
     #Use batch_dot_product_3d3d kernel to compute model array
     # Load CUDA kernel
+    import cupy as cp
     cp_module = cp.RawModule(code=cuda_source)
     batch_dot_product_3d3d_kernel = cp_module.get_function('batch_dot_product_3d3d')
 
@@ -479,9 +488,11 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False):
         ncols = cp.int32(a.shape[1])
     transpose_a = cp.int32(transpose_a)
 
-    if (nz > 512):
+    if (fullprecision):
         nparallel = cp.int32(4)
-    else:
+    elif (nz <= 512):
+        #With smaller arrays, use more parallel threads in order to maximize
+        #parallelism and leverage the full resources of the GPU
         nparallel = cp.int32(64)
     #Create CUPY arrays and calculate number of blocks
     n = nz*ncols*ncols*nparallel
@@ -499,7 +510,7 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False):
 ###    templates are looped over on the CPU.  The operations performed
 ###    are very obviously analagous though and should be highly
 ###    maintainable.  The main difference is the extra loop on the CPU version
-def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_matrices_algorithm="PCA", use_gpu=False):
+def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_matrices_algorithm="PCA", use_gpu=False, fullprecision=True):
     """Calculate a batch of chi2.
     For many redshifts and a set of spectral data, compute the chi2 for
     template data that is already on the correct grid.
@@ -514,6 +525,8 @@ def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_mat
         nz (int): number of templates
         nbasis (int): nbasis
         use_gpu (bool): use GPU or not
+        fullprecision (bool): Whether or not to ensure reproducibly identical
+            results.  See calc_batch_dot_product_3d3d_gpu.
 
     Returns:
         zchi2 (array): array with one element per redshift for this target
@@ -568,7 +581,7 @@ def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_mat
         ###!!! NOTE - uncomment the below line to run (B)
         #(all_M, all_y) = calc_M_y_batch(Tbs, weights, wflux, nz, nbasis)
         ###!!! NOTE - uncomment the 2 lines below to run (C)
-        all_M = calc_batch_dot_product_3d3d_gpu(Tbs, (weights[None, :, None] * Tbs), transpose_a=True)
+        all_M = calc_batch_dot_product_3d3d_gpu(Tbs, (weights[None, :, None] * Tbs), transpose_a=True, fullprecision=fullprecision)
         all_y = (Tbs.swapaxes(-2, -1) @ wflux)
         ###!!! NOTE - uncomment the 2 lines below to run an alternative
         ###    version of (C) that does the transpose on the CPU - this seems
@@ -723,7 +736,8 @@ def calc_zchi2(target_ids, target_data, dtemplate, progress=None, use_gpu=False)
         # Use helper method calc_zchi2_batch to calculate zchi2 and zcoeff
         # for all templates for all three spectra for this target
 
-        (zchi2[j,:], zcoeff[j,:,:]) = calc_zchi2_batch(target_data[j].spectra, tdata, weights, flux, wflux, nz, nbasis, dtemplate.template.solve_matrices_algorithm, use_gpu)
+        # For coarse z scan, use fullprecision = False to maximize speed
+        (zchi2[j,:], zcoeff[j,:,:]) = calc_zchi2_batch(target_data[j].spectra, tdata, weights, flux, wflux, nz, nbasis, dtemplate.template.solve_matrices_algorithm, use_gpu, fullprecision=False)
 
         #- Penalize chi2 for negative [OII] flux; ad-hoc
         if dtemplate.template.template_type == 'GALAXY':

From ed4f0ff2b0c877073b5ac45558f663d889bed80a Mon Sep 17 00:00:00 2001
From: Craig D Warner <cwarner@astro.ufl.edu>
Date: Mon, 24 Apr 2023 10:43:00 -0700
Subject: [PATCH 4/5] Fix - had accidentally deleted one line from code before
 submitting PR when running tests.

---
 py/redrock/zscan.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
index 417cd354..4734d721 100644
--- a/py/redrock/zscan.py
+++ b/py/redrock/zscan.py
@@ -488,6 +488,7 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False, fullprecision=True)
         ncols = cp.int32(a.shape[1])
     transpose_a = cp.int32(transpose_a)
 
+    nparallel = cp.int32(4)
     if (fullprecision):
         nparallel = cp.int32(4)
     elif (nz <= 512):

From 1533d71f87709763140f6e10380df23c0fbeca82 Mon Sep 17 00:00:00 2001
From: Benjamin Alan Weaver <benjamin.weaver@noirlab.edu>
Date: Mon, 11 Sep 2023 14:55:15 -0700
Subject: [PATCH 5/5] update RTD configuration

---
 .readthedocs.yml         | 23 +++++++++++++++++++++++
 README.rst               | 16 +++++++++-------
 doc/rtd-requirements.txt |  6 ++++--
 3 files changed, 36 insertions(+), 9 deletions(-)
 create mode 100644 .readthedocs.yml

diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 00000000..9bba0552
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,23 @@
+# .readthedocs.yml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+build:
+    os: "ubuntu-22.04"
+    tools:
+        python: "3.10"
+
+# Build documentation in the doc/ directory with Sphinx
+sphinx:
+    configuration: doc/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats: all
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+    install:
+        - requirements: doc/rtd-requirements.txt
diff --git a/README.rst b/README.rst
index 982696e4..669179dd 100644
--- a/README.rst
+++ b/README.rst
@@ -2,16 +2,18 @@
 redrock
 =======
 
-.. image:: https://travis-ci.org/desihub/redrock.svg?branch=master
-    :target: https://travis-ci.org/desihub/redrock
-    :alt: Travis Build Status
+|Actions Status| |Coveralls Status| |Documentation Status|
 
-.. image:: https://coveralls.io/repos/github/desihub/redrock/badge.svg?branch=master
-    :target: https://coveralls.io/github/desihub/redrock?branch=master
+.. |Actions Status| image:: https://github.com/desihub/redrock/workflows/CI/badge.svg
+    :target: https://github.com/desihub/redrock/actions
+    :alt: GitHub Actions CI Status
+
+.. |Coveralls Status| image:: https://coveralls.io/repos/desihub/redrock/badge.svg
+    :target: https://coveralls.io/github/desihub/redrock
     :alt: Test Coverage Status
 
-.. image:: https://readthedocs.org/projects/redrock/badge/?version=latest
-    :target: http://redrock.readthedocs.org/en/latest/
+.. |Documentation Status| image:: https://readthedocs.org/projects/redrock/badge/?version=latest
+    :target: https://redrock.readthedocs.io/en/latest/
     :alt: Documentation Status
 
 Introduction
diff --git a/doc/rtd-requirements.txt b/doc/rtd-requirements.txt
index 4bdb01c5..8e0f6391 100644
--- a/doc/rtd-requirements.txt
+++ b/doc/rtd-requirements.txt
@@ -1,2 +1,4 @@
-setuptools
-docutils<0.18
+setuptools>60
+Sphinx>6,<7
+sphinx-rtd-theme>1
+urllib3<2