From 421eee0abdd7bbdc777cfd1d5d846f5a4f6590d9 Mon Sep 17 00:00:00 2001 From: Craig Warner Date: Wed, 29 Mar 2023 18:11:49 -0400 Subject: [PATCH 1/5] Refactored code to minimize GPU to CPU copying, resulting in big speed-ups. The refactoring results in having two copies of several arrays (flux, wavelength, etc) - one on the GPU and one on the CPU. It also results in several scalars such as min/max wavelength being stored on the CPU to avoid very costly single value being copied to/from GPU. This results in a speed up on "Finding best redshift" from 12.5s to 9.3s on 64 CPU / 4 GPU and from 10.0s to 7.2s on 4 CPU / 4 GPU, bringing it closer in line with the 6.7s for 64 CPU. --- py/redrock/fitz.py | 59 ++++++++++++++++----- py/redrock/rebin.py | 110 ++++++++++++++++++++++++++++++---------- py/redrock/targets.py | 45 ++++++++++++++++ py/redrock/templates.py | 5 ++ py/redrock/zfind.py | 11 ++-- py/redrock/zscan.py | 20 +++++--- 6 files changed, 198 insertions(+), 52 deletions(-) diff --git a/py/redrock/fitz.py b/py/redrock/fitz.py index 03b33e03..2ae1ffa1 100644 --- a/py/redrock/fitz.py +++ b/py/redrock/fitz.py @@ -107,7 +107,7 @@ def minfit(x, y): return (x0, xerr, y0, zwarn) -def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu=False): +def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=False): """Refines redshift measurement around up to nminima minima. TODO: @@ -116,8 +116,8 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu Args: zchi2 (array): chi^2 values for each redshift. redshifts (array): the redshift values. - spectra (list): list of Spectrum objects at different wavelengths - grids. + target (Target): the target for this fit which includes a list + of Spectrum objects at different wavelength grids. template (Template): the template for this fit. nminima (int): the number of minima to consider. use_gpu (bool): use GPU or not @@ -132,6 +132,7 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu import cupy as cp nbasis = template.nbasis + spectra = target.spectra # Build dictionary of wavelength grids dwave = { s.wavehash:s.wave for s in spectra } @@ -146,10 +147,13 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu (weights, flux, wflux) = spectral_data(spectra) if (use_gpu): - #Copy arrays to GPU - weights = cp.asarray(weights) - flux = cp.asarray(flux) - wflux = cp.asarray(wflux) + #Get CuPy arrays of weights, flux, wflux + #These are created on the first call of gpu_spectral_data() for a + #target and stored. They are retrieved on subsequent calls. + (gpuweights, gpuflux, gpuwflux) = target.gpu_spectral_data() + # Build dictionaries of wavelength bin edges, min/max, and centers + gpuedges = { s.wavehash:(s.gpuedges, s.minedge, s.maxedge) for s in spectra } + gpudwave = { s.wavehash:s.gpuwave for s in spectra } results = list() #Define nz here instead of hard-coding length 15 and then defining nz as @@ -171,12 +175,25 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu ilo = max(0, imin-1) ihi = min(imin+1, len(zchi2)-1) zz = np.linspace(redshifts[ilo], redshifts[ihi], nz) + if (use_gpu): + #Create a redshift grid on the GPU as well + gpuzz = cp.asarray(zz) zzchi2 = np.zeros(nz, dtype=np.float64) zzcoeff = np.zeros((nz, nbasis), dtype=np.float64) + #Calculate xmin and xmax from template and zz array on CPU and + #pass as scalars + xmin = template.minwave*(1+zz.max()) + xmax = template.maxwave*(1+zz.min()) + #Use batch mode for rebin_template, transmission_Lyman, and calc_zchi2 - binned = rebin_template(template, zz, dwave, use_gpu=use_gpu) + if (use_gpu): + #Use gpuedges already calculated and on GPU + binned = rebin_template(template, gpuzz, dedges=gpuedges, use_gpu=use_gpu, xmin=xmin, xmax=xmax) + else: + #Use numpy CPU arrays + binned = rebin_template(template, zz, dwave, use_gpu=use_gpu, xmin=xmin, xmax=xmax) # Correct spectra for Lyman-series for k in list(dwave.keys()): #New algorithm accepts all z as an array and returns T, a 2-d @@ -188,23 +205,41 @@ def fitz(zchi2, redshifts, spectra, template, nminima=3, archetype=None, use_gpu continue #Vectorize multiplication binned[k] *= T[:,:,None] - (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, nz, nbasis, use_gpu=use_gpu) + if (use_gpu): + #Use gpu arrays for weights, flux, wflux + (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, binned, gpuweights, gpuflux, gpuwflux, nz, nbasis, use_gpu=use_gpu) + else: + #Use numpy CPU arrays for weights, flux, wflux + (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, nz, nbasis, use_gpu=use_gpu) #- fit parabola to 3 points around minimum i = min(max(np.argmin(zzchi2),1), len(zz)-2) zmin, sigma, chi2min, zwarn = minfit(zz[i-1:i+2], zzchi2[i-1:i+2]) try: - binned = rebin_template(template, np.array([zmin]), dwave, use_gpu=use_gpu) + #Calculate xmin and xmax from template and pass as scalars + xmin = template.minwave*(1+zmin) + ximax = template.maxwave*(1+zmin) + if (use_gpu): + #Use gpuedges already calculated and on GPU + binned = rebin_template(template, cp.array([zmin]), dedges=gpuedges, use_gpu=use_gpu, xmin=xmin, xmax=xmax) + else: + binned = rebin_template(template, np.array([zmin]), dwave, use_gpu=use_gpu, xmin=xmin, xmax=xmax) for k in list(dwave.keys()): - T = transmission_Lyman(np.array([zmin]),dwave[k], use_gpu=use_gpu) + if (use_gpu): + #Copy binned[k] back to CPU to perform next steps on CPU + #because faster with only 1 redshift + binned[k] = binned[k].get() + #Use CPU always + T = transmission_Lyman(np.array([zmin]),dwave[k], use_gpu=False) if (T is None): #Return value of None means that wavelenght regime #does not overlap Lyman transmission - continue here continue #Vectorize multiplication binned[k] *= T[:,:,None] - (chi2, coeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, 1, nbasis, use_gpu=use_gpu) + #Use CPU always with one redshift + (chi2, coeff) = calc_zchi2_batch(spectra, binned, weights, flux, wflux, 1, nbasis, use_gpu=False) coeff = coeff[0,:] except ValueError as err: if zmin x[-1]*(1+myz.min())): - if (edges[0] < xmin*(1+myz.max()) or edges[-1] > xmax*(1+myz.min())): - raise ValueError('edges must be within input x range') + if xmin is None: + xmin = x[0]*(1+myz.max()) + if xmax is None: + xmax = x[-1]*(1+myz.min()) + #Use edge_min, xmin, edge_max, and xmax for comparison - these are + #passed as scalars when x and edges are on GPU which results in + #large speed gain by minimizing GPU to CPU copying of single values + if (edge_min < xmin or edge_max > xmax): + raise ValueError('edges must be within input x range') if (not use_gpu and scalar_z and len(y.shape) == 1): #Special case, call _trapz_rebin_1d directly @@ -400,7 +418,7 @@ def _trapz_rebin_batch_gpu(x, y, edges, myz, result_shape): #Divide edges output wavelength array by (1+myz) to get 2d array #of input wavelengths at each boundary in edges and #use serachsorted to find index for each boundary - e2d = cp.asarray(edges/(1+myz[:,None])) + e2d = edges/(1+myz[:,None]) idx = cp.searchsorted(x, e2d, side='right')#.astype(cp.int32) # Load CUDA kernel @@ -408,11 +426,11 @@ def _trapz_rebin_batch_gpu(x, y, edges, myz, result_shape): batch_trapz_rebin_kernel = cp_module.get_function('batch_trapz_rebin') #Array sizes - nbin = cp.int32(len(edges)-1) - nz = cp.int32(len(myz)) - nbasis = cp.int32(y.shape[0]) + nbin = len(edges)-1 + nz = len(myz) + nbasis = y.shape[0] n = nbin*nbasis*nz - nt = cp.int32(len(x)) + nt = len(x) if (result_shape is None): result_shape = (nz, nbasis, nbin) @@ -425,18 +443,33 @@ def _trapz_rebin_batch_gpu(x, y, edges, myz, result_shape): return result -def rebin_template(template, myz, dwave, use_gpu=False): +def rebin_template(template, myz, dwave=None, dedges=None, use_gpu=False, xmin=None, xmax=None): """Rebin a template to a set of wavelengths. Given a template and a single redshift - or an array of redshifts, rebin the template to a set of wavelength arrays. + The wavelengths can be passed as centers (dwave) or edges (dedges) + of the wavelength bins. Args: template (Template): the template object myz (float or array of float): the redshift(s) dwave (dict): the keys are the "wavehash" and the values are a 1D array containing the wavelength grid. + dedges (dict): the keys are the "wavehash" and the values + can be either a 1D array containing the bin edges + of the wavelength grid (computed from centers2edges) + or a 3-element tuple with this 1D array as a CuPy array + and the min and max values as scalars in CPU memory. use_gpu (bool): whether or not to use the GPU algorithm + xmin (float): (optional) minimum x-value - x[0]*(1+myz.max()) will be + used if omitted - this is useful to avoid GPU to CPU copying in + the case where x is a CuPy array and providing the scalar value + results in a large speed gain + xmax (float) (optional) maximum x-value - x[-1]*(1+myz.min()) will be + used if omitted - this is useful to avoid GPU to CPU copying in + the case where x is a CuPy array and providing the scalar value + results in a large speed gain Returns: dict: The rebinned template for every basis function and wavelength @@ -446,9 +479,32 @@ def rebin_template(template, myz, dwave, use_gpu=False): """ nbasis = template.flux.shape[0] #- number of template basis vectors result = dict() + #calculate xmin and xmax if not given + if (xmin is None): + xmin = template.minwave*(1+myz.max()) + if (xmax is None): + xmax = template.maxwave*(1+myz.min()) #rebin all z and all bases in batch in parallel #and return dict of 3-d numpy / cupy arrays - for hs, wave in dwave.items(): - result[hs] = trapz_rebin(template.wave, template.flux, xnew=wave, myz=myz, use_gpu=use_gpu, xmin=template.minwave, xmax=template.maxwave) - + if dwave is not None: + #Handle the case where dwave is passed + for hs, wave in dwave.items(): + if (use_gpu and template.gpuwave is not None): + #Use gpuwave and gpuflux arrays already on GPU + result[hs] = trapz_rebin(template.gpuwave, template.gpuflux, xnew=wave, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax) + else: + result[hs] = trapz_rebin(template.wave, template.flux, xnew=wave, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax) + elif dedges is not None: + #Handle the case where dedges is passed because edges is already computed + minedge = None + maxedge = None + for hs, edges in dedges.items(): + #Check if edges is a 1-d array or a tuple also containing scalar min/max values + if type(edges) is tuple: + (edges, minedge, maxedge) = edges + if (use_gpu and template.gpuwave is not None): + #Use gpuwave and gpuflux arrays already on GPU + result[hs] = trapz_rebin(template.gpuwave, template.gpuflux, edges=edges, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax, edge_min=minedge, edge_max=maxedge) + else: + result[hs] = trapz_rebin(template.wave, template.flux, edges=edges, myz=myz, use_gpu=use_gpu, xmin=xmin, xmax=xmax, edge_min=minedge, edge_max=maxedge) return result diff --git a/py/redrock/targets.py b/py/redrock/targets.py index 75f429db..08e37493 100644 --- a/py/redrock/targets.py +++ b/py/redrock/targets.py @@ -44,6 +44,35 @@ def __init__(self, wave, flux, ivar, R, Rcsr=None): else: self.wavehash = hash((len(wave), wave[0], wave[1], wave[-2], wave[-1])) + #It is much more efficient to calculate edges once if rebinning + #and copy to GPU and store here rather than doing this every time + self._gpuwave = None + self._gpuedges = None + #Min and max edges are stored as CPU scalars to avoid excessive + #GPU to CPU copying + self.minedge = None + self.maxedge = None + + @property + def gpuwave(self): + if (self._gpuwave is None): + #Copy to GPU once + import cupy as cp + self._gpuwave = cp.asarray(self.wave) + return self._gpuwave + + @property + def gpuedges(self): + if (self._gpuedges is None): + #Compute edges, minedge, and maxedge, and copy to GPU once + import cupy as cp + from .rebin import centers2edges + edges = centers2edges(self.wave) + self.minedge = edges[0] + self.maxedge = edges[-1] + self._gpuedges = cp.asarray(edges) + return self._gpuedges + @property def Rcsr(self): if self._Rcsr is None: @@ -131,6 +160,22 @@ def __init__(self, targetid, spectra, coadd=False, cosmics_nsig=0., meta=None): if coadd: self.compute_coadd(cache_Rcsr=False, cosmics_nsig=cosmics_nsig) + #It is much more efficient to copy weights, flux, wflux to GPU once + #and store here rather than doing this every time + self.gpuweights = None + self.gpuflux = None + self.gpuwflux = None + + def gpu_spectral_data(self): + if self.gpuweights is None: + #Assemble arrays and copy to GPU once + import cupy as cp + self.gpuweights = cp.concatenate([cp.asarray(s.ivar) for s in self.spectra]) + self.gpuflux = cp.concatenate([cp.asarray(s.flux) for s in self.spectra]) + self.gpuwflux = self.gpuweights * self.gpuflux + return (self.gpuweights, self.gpuflux, self.gpuwflux) + + ### @profile def compute_coadd(self, cache_Rcsr=False, cosmics_nsig=0.): """Compute the coadd from the current spectra list. diff --git a/py/redrock/templates.py b/py/redrock/templates.py index ae91bf2d..2e6d16f9 100644 --- a/py/redrock/templates.py +++ b/py/redrock/templates.py @@ -110,8 +110,13 @@ def __init__(self, filename=None, spectype=None, redshifts=None, self._nbasis = self.flux.shape[0] self._nwave = self.flux.shape[1] + #It is much more efficient to copy wave and flux to GPU once + #and store here rather than doing this every time, and keep track + #of min and max wave as scalars on CPU self.minwave = self.wave[0] self.maxwave = self.wave[-1] + self.gpuwave = None + self.gpuflux = None @property diff --git a/py/redrock/zfind.py b/py/redrock/zfind.py index a9268e3c..6a949591 100644 --- a/py/redrock/zfind.py +++ b/py/redrock/zfind.py @@ -87,7 +87,7 @@ def _mp_fitz(chi2, target_data, t, nminima, qout, archetype, use_gpu): tg.sharedmem_unpack() results = list() for i, tg in enumerate(target_data): - zfit = fitz(chi2[i], t.template.redshifts, tg.spectra, + zfit = fitz(chi2[i], t.template.redshifts, tg, t.template, nminima=nminima, archetype=archetype, use_gpu=use_gpu) results.append( (tg.id, zfit) ) qout.put(results) @@ -278,11 +278,12 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non if (use_gpu): #If using GPU, copy template flux and wave arrays to cupy objects #on the GPU once here so it is not copied every iteration of - #rebinning below + #rebinning below. Use gpuwave and gpuflux so as to not overwrite + #numpy arrays. import cupy as cp for t in templates: - t.template.wave = cp.asarray(t.template.wave) - t.template.flux = cp.asarray(t.template.flux) + t.template.gpuwave = cp.asarray(t.template.wave) + t.template.gpuflux = cp.asarray(t.template.flux) # Note: rebalancing no longer needs to be done now that following steps # have been GPU-ized - CW 12/22 @@ -330,7 +331,7 @@ def zfind(targets, templates, mp_procs=1, nminima=3, archetypes=None, priors=Non for tg in local_targets: zfit = fitz(results[tg.id][ft]['zchi2'] \ + results[tg.id][ft]['penalty'], - t.template.redshifts, tg.spectra, + t.template.redshifts, tg, t.template, nminima=nminima,archetype=archetype, use_gpu=use_gpu) results[tg.id][ft]['zfit'] = zfit else: diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py index 6cdc1812..24337cc9 100644 --- a/py/redrock/zscan.py +++ b/py/redrock/zscan.py @@ -416,10 +416,10 @@ def _calc_batch_dot_product_3d2d_gpu(Tbs, zc): batch_dot_product_3d2d_kernel = cp_module.get_function('batch_dot_product_3d2d') #Array dims needed by CUDA: - nz = cp.int32(zc.shape[0]) + nz = zc.shape[0] nrows = Tbs[0].shape[0] n = nrows * nz - nbasis = cp.int32(zc.shape[1]) + nbasis = zc.shape[1] #Allocate CUPY array and calc blocks to be used blocks = (n+block_size-1)//block_size @@ -612,6 +612,7 @@ def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_mat zcoeff = np.zeros((nz, nbasis)) #On the CPU, the templates are looped over and all operations #are performed on one template at a time. + for i in range(nz): #1) dot_product_sparse_one will compute dot products of all #spectra with ONE template and return a 2D array of size @@ -700,18 +701,20 @@ def calc_zchi2(target_ids, target_data, dtemplate, progress=None, use_gpu=False) tdata = dtemplate.local.data for j in range(ntargets): - (weights, flux, wflux) = spectral_data(target_data[j].spectra) + if (use_gpu): + #Use new helper method gpu_spectral_data() that will copy to GPU + #on first access and recall data in subsequent calls instead of + #copying every time + (weights, flux, wflux) = target_data[j].gpu_spectral_data() + else: + #Use spectral_data() to get numpy arrays + (weights, flux, wflux) = spectral_data(target_data[j].spectra) if np.sum(weights) == 0: zchi2[j,:] = 9e99 #Update progress for multiprocessing!! if dtemplate.comm is None and progress is not None: progress.put(1) continue - if (use_gpu): - #Copy data to CUPY arrays - weights = cp.asarray(weights) - flux = cp.asarray(flux) - wflux = cp.asarray(wflux) # Solving for template fit coefficients for all redshifts. # We use the pre-interpolated templates for each @@ -719,6 +722,7 @@ def calc_zchi2(target_ids, target_data, dtemplate, progress=None, use_gpu=False) # Use helper method calc_zchi2_batch to calculate zchi2 and zcoeff # for all templates for all three spectra for this target + (zchi2[j,:], zcoeff[j,:,:]) = calc_zchi2_batch(target_data[j].spectra, tdata, weights, flux, wflux, nz, nbasis, dtemplate.template.solve_matrices_algorithm, use_gpu) #- Penalize chi2 for negative [OII] flux; ad-hoc From 8f966b9d866457b6eb026c5be4c8235920ea8562 Mon Sep 17 00:00:00 2001 From: Craig Warner Date: Wed, 29 Mar 2023 20:19:22 -0400 Subject: [PATCH 2/5] Update to fix so that cupy is only imported when use_gpu = True in rebin.py --- py/redrock/rebin.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/py/redrock/rebin.py b/py/redrock/rebin.py index e2aadb5a..96734a27 100644 --- a/py/redrock/rebin.py +++ b/py/redrock/rebin.py @@ -291,10 +291,14 @@ def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None, ValueError: if edges are outside the range of x or if len(x) != len(y) """ - import cupy as cp + if use_gpu: + import cupy as cp if edges is None: edges = centers2edges(xnew) - elif type(edges) != cp.ndarray: + elif use_gpu: + if type(edges) != cp.ndarray: + edges = np.asarray(edges) + else: edges = np.asarray(edges) nbins = len(edges)-1 #Set edge_min and edge_max if not passed as input @@ -313,7 +317,10 @@ def trapz_rebin(x, y, xnew=None, edges=None, myz=None, use_gpu=False, xmin=None, elif (np.isscalar(myz)): myz = np.array([myz], dtype=np.float64) scalar_z = True - elif (type(myz) != cp.ndarray): + elif (use_gpu): + if (type(myz) != cp.ndarray): + myz = np.asarray(myz, dtype=np.float64) + else: myz = np.asarray(myz, dtype=np.float64) nz = len(myz) if (nz == 0): From df5215b66c53e737fe187f862e119c2341caa8d0 Mon Sep 17 00:00:00 2001 From: Craig D Warner Date: Mon, 24 Apr 2023 10:34:03 -0700 Subject: [PATCH 3/5] Added fullprecision argument to calc_batch_dot_product_3d3d_gpu (and to calc_zchi2_batch to pass-through) which sets nparallel = 4 to keep the number of parallel threads working on each output element of a dot product small and avoid floating point rounding errors due to random order of summation (a+b+c != c+a+b). Coarse zscan still uses faster version but fine redshift scan uses default arg of fullprecision=True to guarantee reproducibility of results. This fixes issue #242. --- py/redrock/zscan.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py index 24337cc9..417cd354 100644 --- a/py/redrock/zscan.py +++ b/py/redrock/zscan.py @@ -437,7 +437,7 @@ def _calc_batch_dot_product_3d2d_gpu(Tbs, zc): ### seems to have a bug on Volta architecure GPUs. ### This is the equivalent of M = a @ b ### (Or if transpose_a is true, M = a.swapaxes(-2, -1) @ b) -def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False): +def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False, fullprecision=True): """GPU implementation. Calculate a batch dot product of the 3d array a with the 3d array b. The 3-d array shape is A x B x C and the 2-d array @@ -457,6 +457,14 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False): (nz x nrows x nbasis) transpose_a (bool): Whether or not to transpose the a array before performing the dot product + fullprecision (bool): Whether or not to ensure reproducibly identical + results. The context is that it can be faster to use many + parallel threads to compute a single output array element of the + dot product result, but due to floating point rounding issues, + the random order of the summation can change the result by + order 1e-16 (e.g. a+b+c != c+a+b). When set to true, the + number of parallel threads is decreased to ensure reproducibility + at a trade-off of speed. Returns: M (array): the output of the dot product, (nz x ncols x ncols) @@ -466,6 +474,7 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False): #Use batch_dot_product_3d3d kernel to compute model array # Load CUDA kernel + import cupy as cp cp_module = cp.RawModule(code=cuda_source) batch_dot_product_3d3d_kernel = cp_module.get_function('batch_dot_product_3d3d') @@ -479,9 +488,11 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False): ncols = cp.int32(a.shape[1]) transpose_a = cp.int32(transpose_a) - if (nz > 512): + if (fullprecision): nparallel = cp.int32(4) - else: + elif (nz <= 512): + #With smaller arrays, use more parallel threads in order to maximize + #parallelism and leverage the full resources of the GPU nparallel = cp.int32(64) #Create CUPY arrays and calculate number of blocks n = nz*ncols*ncols*nparallel @@ -499,7 +510,7 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False): ### templates are looped over on the CPU. The operations performed ### are very obviously analagous though and should be highly ### maintainable. The main difference is the extra loop on the CPU version -def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_matrices_algorithm="PCA", use_gpu=False): +def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_matrices_algorithm="PCA", use_gpu=False, fullprecision=True): """Calculate a batch of chi2. For many redshifts and a set of spectral data, compute the chi2 for template data that is already on the correct grid. @@ -514,6 +525,8 @@ def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_mat nz (int): number of templates nbasis (int): nbasis use_gpu (bool): use GPU or not + fullprecision (bool): Whether or not to ensure reproducibly identical + results. See calc_batch_dot_product_3d3d_gpu. Returns: zchi2 (array): array with one element per redshift for this target @@ -568,7 +581,7 @@ def calc_zchi2_batch(spectra, tdata, weights, flux, wflux, nz, nbasis, solve_mat ###!!! NOTE - uncomment the below line to run (B) #(all_M, all_y) = calc_M_y_batch(Tbs, weights, wflux, nz, nbasis) ###!!! NOTE - uncomment the 2 lines below to run (C) - all_M = calc_batch_dot_product_3d3d_gpu(Tbs, (weights[None, :, None] * Tbs), transpose_a=True) + all_M = calc_batch_dot_product_3d3d_gpu(Tbs, (weights[None, :, None] * Tbs), transpose_a=True, fullprecision=fullprecision) all_y = (Tbs.swapaxes(-2, -1) @ wflux) ###!!! NOTE - uncomment the 2 lines below to run an alternative ### version of (C) that does the transpose on the CPU - this seems @@ -723,7 +736,8 @@ def calc_zchi2(target_ids, target_data, dtemplate, progress=None, use_gpu=False) # Use helper method calc_zchi2_batch to calculate zchi2 and zcoeff # for all templates for all three spectra for this target - (zchi2[j,:], zcoeff[j,:,:]) = calc_zchi2_batch(target_data[j].spectra, tdata, weights, flux, wflux, nz, nbasis, dtemplate.template.solve_matrices_algorithm, use_gpu) + # For coarse z scan, use fullprecision = False to maximize speed + (zchi2[j,:], zcoeff[j,:,:]) = calc_zchi2_batch(target_data[j].spectra, tdata, weights, flux, wflux, nz, nbasis, dtemplate.template.solve_matrices_algorithm, use_gpu, fullprecision=False) #- Penalize chi2 for negative [OII] flux; ad-hoc if dtemplate.template.template_type == 'GALAXY': From ed4f0ff2b0c877073b5ac45558f663d889bed80a Mon Sep 17 00:00:00 2001 From: Craig D Warner Date: Mon, 24 Apr 2023 10:43:00 -0700 Subject: [PATCH 4/5] Fix - had accidentally deleted one line from code before submitting PR when running tests. --- py/redrock/zscan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py index 417cd354..4734d721 100644 --- a/py/redrock/zscan.py +++ b/py/redrock/zscan.py @@ -488,6 +488,7 @@ def calc_batch_dot_product_3d3d_gpu(a, b, transpose_a=False, fullprecision=True) ncols = cp.int32(a.shape[1]) transpose_a = cp.int32(transpose_a) + nparallel = cp.int32(4) if (fullprecision): nparallel = cp.int32(4) elif (nz <= 512): From 1533d71f87709763140f6e10380df23c0fbeca82 Mon Sep 17 00:00:00 2001 From: Benjamin Alan Weaver Date: Mon, 11 Sep 2023 14:55:15 -0700 Subject: [PATCH 5/5] update RTD configuration --- .readthedocs.yml | 23 +++++++++++++++++++++++ README.rst | 16 +++++++++------- doc/rtd-requirements.txt | 6 ++++-- 3 files changed, 36 insertions(+), 9 deletions(-) create mode 100644 .readthedocs.yml diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..9bba0552 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,23 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +build: + os: "ubuntu-22.04" + tools: + python: "3.10" + +# Build documentation in the doc/ directory with Sphinx +sphinx: + configuration: doc/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: all + +# Optionally set the version of Python and requirements required to build your docs +python: + install: + - requirements: doc/rtd-requirements.txt diff --git a/README.rst b/README.rst index 982696e4..669179dd 100644 --- a/README.rst +++ b/README.rst @@ -2,16 +2,18 @@ redrock ======= -.. image:: https://travis-ci.org/desihub/redrock.svg?branch=master - :target: https://travis-ci.org/desihub/redrock - :alt: Travis Build Status +|Actions Status| |Coveralls Status| |Documentation Status| -.. image:: https://coveralls.io/repos/github/desihub/redrock/badge.svg?branch=master - :target: https://coveralls.io/github/desihub/redrock?branch=master +.. |Actions Status| image:: https://github.com/desihub/redrock/workflows/CI/badge.svg + :target: https://github.com/desihub/redrock/actions + :alt: GitHub Actions CI Status + +.. |Coveralls Status| image:: https://coveralls.io/repos/desihub/redrock/badge.svg + :target: https://coveralls.io/github/desihub/redrock :alt: Test Coverage Status -.. image:: https://readthedocs.org/projects/redrock/badge/?version=latest - :target: http://redrock.readthedocs.org/en/latest/ +.. |Documentation Status| image:: https://readthedocs.org/projects/redrock/badge/?version=latest + :target: https://redrock.readthedocs.io/en/latest/ :alt: Documentation Status Introduction diff --git a/doc/rtd-requirements.txt b/doc/rtd-requirements.txt index 4bdb01c5..8e0f6391 100644 --- a/doc/rtd-requirements.txt +++ b/doc/rtd-requirements.txt @@ -1,2 +1,4 @@ -setuptools -docutils<0.18 +setuptools>60 +Sphinx>6,<7 +sphinx-rtd-theme>1 +urllib3<2