From a299d0bb65cfe1b51a683d2903f2a8e80b10e353 Mon Sep 17 00:00:00 2001 From: randyh62 <42045079+randyh62@users.noreply.github.com> Date: Fri, 16 Feb 2024 10:51:37 -0800 Subject: [PATCH] Doc reorg for diataxis (#513) * doc_reorg * additional edits * _toc.yml.in update * review comments implemented --- LICENSE.txt | 2 +- docs/block_ops/data_mov_funcs.rst | 35 ++++---- docs/block_ops/index.rst | 28 +++++-- .../ops_classes/adjacent_difference.rst | 11 ++- docs/block_ops/ops_classes/discontinuity.rst | 11 ++- docs/block_ops/ops_classes/exchange.rst | 11 ++- docs/block_ops/ops_classes/histogram.rst | 16 +++- docs/block_ops/ops_classes/index.rst | 32 ++++---- docs/block_ops/ops_classes/load.rst | 15 +++- docs/block_ops/ops_classes/reduce.rst | 15 +++- docs/block_ops/ops_classes/scan.rst | 15 +++- docs/block_ops/ops_classes/shuffle.rst | 11 ++- docs/block_ops/ops_classes/sort.rst | 15 +++- docs/block_ops/ops_classes/store.rst | 15 +++- docs/concepts/concepts.rst | 12 +++ docs/{ => concepts}/glossary.rst | 26 +++--- docs/concepts/intro.rst | 39 +++++++++ docs/device_ops/adjacent_difference.rst | 21 +++-- docs/device_ops/binary_search.rst | 11 ++- docs/device_ops/config.rst | 13 ++- docs/device_ops/histogram.rst | 21 +++-- docs/device_ops/index.rst | 37 +++++---- docs/device_ops/merge.rst | 15 +++- docs/device_ops/partition.rst | 15 +++- docs/device_ops/reduce.rst | 23 ++++-- docs/device_ops/run_length_encoding.rst | 17 ++-- docs/device_ops/scan.rst | 33 +++++--- docs/device_ops/select.rst | 15 +++- docs/device_ops/sort.rst | 39 +++++---- docs/device_ops/transform.rst | 15 +++- docs/device_ops/unique.rst | 15 +++- docs/index.rst | 44 +++++++++-- docs/intro.rst | 30 ------- docs/reference/acknowledge.rst | 13 +++ docs/{ => reference}/data-type-support.rst | 11 +-- docs/{ => reference}/intrinsics.rst | 27 ++++--- docs/{ => reference}/iterators.rst | 25 +++--- docs/{ => reference}/ops_summary.rst | 54 +++++++------ docs/reference/reference.rst | 18 +++++ docs/{ => 
reference}/thread_ops.rst | 17 ++-- docs/sphinx/_toc.yml.in | 79 +++++++++++++++---- docs/warp_ops/exchange.rst | 11 ++- docs/warp_ops/index.rst | 25 +++--- docs/warp_ops/load.rst | 15 +++- docs/warp_ops/reduce.rst | 11 ++- docs/warp_ops/scan.rst | 11 ++- docs/warp_ops/shuffle.rst | 11 ++- docs/warp_ops/sort.rst | 11 ++- docs/warp_ops/store.rst | 15 +++- 49 files changed, 718 insertions(+), 309 deletions(-) create mode 100644 docs/concepts/concepts.rst rename docs/{ => concepts}/glossary.rst (53%) create mode 100644 docs/concepts/intro.rst delete mode 100644 docs/intro.rst create mode 100644 docs/reference/acknowledge.rst rename docs/{ => reference}/data-type-support.rst (68%) rename docs/{ => reference}/intrinsics.rst (72%) rename docs/{ => reference}/iterators.rst (75%) rename docs/{ => reference}/ops_summary.rst (56%) create mode 100644 docs/reference/reference.rst rename docs/{ => reference}/thread_ops.rst (72%) diff --git a/LICENSE.txt b/LICENSE.txt index 2bd41a127..ba22abef2 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/block_ops/data_mov_funcs.rst b/docs/block_ops/data_mov_funcs.rst index a80bd0242..4fce51dbd 100644 --- a/docs/block_ops/data_mov_funcs.rst +++ b/docs/block_ops/data_mov_funcs.rst @@ -1,62 +1,69 @@ -Data movement functions ------------------------ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. 
_data_mov_funcs: + +******************************************************************** + Data movement functions +******************************************************************** Direct Blocked -~~~~~~~~~~~~~~ +=============== Load -.... +------ .. doxygenfunction:: rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid) .. doxygenfunction:: rocprim::block_load_direct_blocked (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds) Store -..... +---------- .. doxygenfunction:: rocprim::block_store_direct_blocked (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_store_direct_blocked (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid) Direct Blocked Vectorized -~~~~~~~~~~~~~~~~~~~~~~~~~ +=========================== Load -.... +------- .. doxygenfunction:: rocprim::block_load_direct_blocked_vectorized (unsigned int flat_id, T *block_input, U(&items)[ItemsPerThread]) Store -..... +---------- .. doxygenfunction:: rocprim::block_store_direct_blocked_vectorized (unsigned int flat_id, T *block_output, U(&items)[ItemsPerThread]) Direct Striped -~~~~~~~~~~~~~~ +================== Load -.... +--------- .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid) .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds) Store -..... +---------- .. 
doxygenfunction:: rocprim::block_store_direct_striped (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_store_direct_striped (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid) Direct Warp Striped -~~~~~~~~~~~~~~~~~~~ +==================== Load -.... +--------- .. doxygengroup:: blockmodule_warp_load_functions :content-only: Store -..... +---------- .. doxygengroup:: blockmodule_warp_store_functions :content-only: diff --git a/docs/block_ops/index.rst b/docs/block_ops/index.rst index f10888cac..c3f3ce6fd 100644 --- a/docs/block_ops/index.rst +++ b/docs/block_ops/index.rst @@ -1,6 +1,24 @@ -Block-Wide Operations -===================== +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation -.. toctree:: - ops_classes/index - data_mov_funcs +.. _block-index: + +******************************************************************** + Block-Wide Operations +******************************************************************** + + * :ref:`class-index` + + * :ref:`blk-load` + * :ref:`blk-store` + * :ref:`blk-adjacent_difference` + * :ref:`blk-discontinuity` + * :ref:`blk-scan` + * :ref:`blk-reduce` + * :ref:`blk-shuffle` + * :ref:`blk-exchange` + * :ref:`blk-sort` + * :ref:`blk-histogram` + + * :ref:`data_mov_funcs` diff --git a/docs/block_ops/ops_classes/adjacent_difference.rst b/docs/block_ops/ops_classes/adjacent_difference.rst index c16609e8b..e8532add0 100644 --- a/docs/block_ops/ops_classes/adjacent_difference.rst +++ b/docs/block_ops/ops_classes/adjacent_difference.rst @@ -1,5 +1,12 @@ -Adjacent difference -~~~~~~~~~~~~~~~~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. 
_blk-adjacent_difference: + +******************************************************************** + Adjacent difference +******************************************************************** .. doxygenclass:: rocprim::block_adjacent_difference :members: diff --git a/docs/block_ops/ops_classes/discontinuity.rst b/docs/block_ops/ops_classes/discontinuity.rst index f5f444636..65c88cde5 100644 --- a/docs/block_ops/ops_classes/discontinuity.rst +++ b/docs/block_ops/ops_classes/discontinuity.rst @@ -1,5 +1,12 @@ -Discontinuity -~~~~~~~~~~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-discontinuity: + +******************************************************************** + Discontinuity +******************************************************************** .. doxygenclass:: rocprim::block_discontinuity :members: diff --git a/docs/block_ops/ops_classes/exchange.rst b/docs/block_ops/ops_classes/exchange.rst index cbbb4e673..cab380f5b 100644 --- a/docs/block_ops/ops_classes/exchange.rst +++ b/docs/block_ops/ops_classes/exchange.rst @@ -1,5 +1,12 @@ -Exchange -~~~~~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-exchange: + +******************************************************************** + Exchange +******************************************************************** .. doxygenclass:: rocprim::block_exchange :members: diff --git a/docs/block_ops/ops_classes/histogram.rst b/docs/block_ops/ops_classes/histogram.rst index 68bf970d6..f4e4aee06 100644 --- a/docs/block_ops/ops_classes/histogram.rst +++ b/docs/block_ops/ops_classes/histogram.rst @@ -1,12 +1,20 @@ -Histogram -~~~~~~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. 
_blk-histogram: + +******************************************************************** + Histogram +******************************************************************** + Class -..... +========= .. doxygenclass:: rocprim::block_histogram :members: Algorithms -.......... +=========== .. doxygenenum:: rocprim::block_histogram_algorithm diff --git a/docs/block_ops/ops_classes/index.rst b/docs/block_ops/ops_classes/index.rst index cf651d630..1de36b53d 100644 --- a/docs/block_ops/ops_classes/index.rst +++ b/docs/block_ops/ops_classes/index.rst @@ -1,18 +1,20 @@ -Operation classes ------------------ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation -.. toctree:: - load - store +.. _class-index: - adjacent_difference - discontinuity +******************************************************************** + Operation classes +******************************************************************** - scan - reduce - - shuffle - exchange - sort - - histogram + * :ref:`blk-load` + * :ref:`blk-store` + * :ref:`blk-adjacent_difference` + * :ref:`blk-discontinuity` + * :ref:`blk-scan` + * :ref:`blk-reduce` + * :ref:`blk-shuffle` + * :ref:`blk-exchange` + * :ref:`blk-sort` + * :ref:`blk-histogram` diff --git a/docs/block_ops/ops_classes/load.rst b/docs/block_ops/ops_classes/load.rst index 1b4de9269..f40e7b26e 100644 --- a/docs/block_ops/ops_classes/load.rst +++ b/docs/block_ops/ops_classes/load.rst @@ -1,13 +1,20 @@ -Load -~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-load: + +******************************************************************** + Load +******************************************************************** Class -..... +========== .. doxygenclass:: rocprim::block_load :members: Algorithms -.......... +============== .. 
doxygenenum:: rocprim::block_load_method diff --git a/docs/block_ops/ops_classes/reduce.rst b/docs/block_ops/ops_classes/reduce.rst index f2e5a8119..68eb31f71 100644 --- a/docs/block_ops/ops_classes/reduce.rst +++ b/docs/block_ops/ops_classes/reduce.rst @@ -1,13 +1,20 @@ -Reduce -~~~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-reduce: + +******************************************************************** + Reduce +******************************************************************** Class -..... +========== .. doxygenclass:: rocprim::block_reduce :members: Algorithms -.......... +============ .. doxygenenum:: rocprim::block_reduce_algorithm diff --git a/docs/block_ops/ops_classes/scan.rst b/docs/block_ops/ops_classes/scan.rst index 1b66f45a6..e273b7909 100644 --- a/docs/block_ops/ops_classes/scan.rst +++ b/docs/block_ops/ops_classes/scan.rst @@ -1,13 +1,20 @@ -Scan -~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-scan: + +******************************************************************** + Scan +******************************************************************** Class -..... +======= .. doxygenclass:: rocprim::block_scan :members: Algorithms -.......... +============== .. doxygenenum:: rocprim::block_scan_algorithm diff --git a/docs/block_ops/ops_classes/shuffle.rst b/docs/block_ops/ops_classes/shuffle.rst index 9335eaf95..120391b1c 100644 --- a/docs/block_ops/ops_classes/shuffle.rst +++ b/docs/block_ops/ops_classes/shuffle.rst @@ -1,5 +1,12 @@ -Shuffle -~~~~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-shuffle: + +******************************************************************** + Shuffle +******************************************************************** .. 
doxygenclass:: rocprim::block_shuffle :members: diff --git a/docs/block_ops/ops_classes/sort.rst b/docs/block_ops/ops_classes/sort.rst index f5cce2e14..a71d94b6c 100644 --- a/docs/block_ops/ops_classes/sort.rst +++ b/docs/block_ops/ops_classes/sort.rst @@ -1,8 +1,15 @@ -Sort -~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-sort: + +******************************************************************** + Sort +******************************************************************** generic -....... +========= .. doxygenclass:: rocprim::block_sort @@ -11,7 +18,7 @@ generic .. doxygenenum:: rocprim::block_sort_algorithm radix sort -.......... +=========== .. doxygenclass:: rocprim::block_radix_sort :members: diff --git a/docs/block_ops/ops_classes/store.rst b/docs/block_ops/ops_classes/store.rst index 41eaf7bb2..ba90c8099 100644 --- a/docs/block_ops/ops_classes/store.rst +++ b/docs/block_ops/ops_classes/store.rst @@ -1,13 +1,20 @@ -Store -~~~~~ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _blk-store: + +******************************************************************** + Store +******************************************************************** Class -..... +====== .. doxygenclass:: rocprim::block_store :members: Algorithms -.......... +=========== .. doxygenenum:: rocprim::block_store_method diff --git a/docs/concepts/concepts.rst b/docs/concepts/concepts.rst new file mode 100644 index 000000000..fba90e510 --- /dev/null +++ b/docs/concepts/concepts.rst @@ -0,0 +1,12 @@ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. 
_concepts: + +******************************************************************** + rocPRIM Concepts +******************************************************************** + +* :ref:`rocprim-intro` +* :ref:`glossary` diff --git a/docs/glossary.rst b/docs/concepts/glossary.rst similarity index 53% rename from docs/glossary.rst rename to docs/concepts/glossary.rst index 6b99f830d..27376ad58 100644 --- a/docs/glossary.rst +++ b/docs/concepts/glossary.rst @@ -1,34 +1,40 @@ -Glossary -======== +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _glossary: + +******************************************************************** + Glossary +******************************************************************** This glossary is to help users understand the basic concepts or terminologies used in the rocPRIM library. Terminologies -.. glossary:: Warp - Refers to a group of threads that execute in SIMT (Single Instruction, Multiple Thread) fashion. Also known as wavefronts on AMD GPUs. + Refers to a group of threads that execute in single instruction, multiple thread (SIMT) fashion. Also known as wavefronts on AMD GPUs. Hardware Warp Size - Refers to the number of threads in a warp defined by the hardware. On Nvidia GPUs, a warp size is 32 while on AMD GPUs, a warp size is 64. + Refers to the number of threads in a warp defined by the hardware. On Nvidia GPUs a warp size is 32, while on AMD GPUs a warp size is 64. Logical Warp Size Refers to the number of threads in a warp defined by the user, which can be equal to or less than the size of the hardware warp size. Lane ID Refers to the thread identifier within the warp. A logical lane ID refers to the thread identifier in a "logical - warp", which can be smaller than a hardware warp size (And can be defined as ``lane_id() % WarpSize``). + warp", which can be smaller than a hardware warp size (and can be defined as ``lane_id() % WarpSize``). 
Warp ID Refers to the identifier of the hardware/logical warp in a block. Warp ID is guaranteed to be unique among warps. Block - Refers to a group of threads that are executed on the same compute unit (streaming multiprocessor). These threads can \n + Refers to a group of threads that are executed on the same compute unit (streaming multiprocessor). These threads can be indexed using 1 Dimension {X}, 2 Dimensions {X, Y} or 3 Dimensions {X, Y, Z}. A block consists of multiple warps. Tile - Refers to a block, but in the C++AMP/HCC nomenclature. + Refers to a block in C++AMP/HIPCC nomenclature. Flat ID - Refers to a flattened identifier of a block (tile) or a thread identifier. Flat ID is a 1D value created from 2D or 3D \n - identifier. Example: flat id of thread id (X, Y) in 2D thread block 128x4 (XxY) is Y * 128 + X. + Refers to a flattened identifier of a block (tile) or a thread identifier. Flat ID is a 1D value created from a 2D or 3D + identifier. For example, the flat ID of thread ID (X, Y) in a 2D thread block 128x4 (XxY) is ``Y * 128 + X``. diff --git a/docs/concepts/intro.rst b/docs/concepts/intro.rst new file mode 100644 index 000000000..f5e997ded --- /dev/null +++ b/docs/concepts/intro.rst @@ -0,0 +1,39 @@ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _rocprim-intro: + +******************************************************************** + Introduction to rocPRIM +******************************************************************** + +Operations and Sequences +======================== + +A rocPRIM operation is a computation over a sequence of objects. A rocPRIM operation can return a single value like the ``reduce`` operation; return another sequence like the ``sort`` operation; or return multiple sequences like the ``partition`` operation. 
The elements of the sequence could be of any type or class, although template specialization allows rocPRIM to optimize the computations over the usual numerical datatypes. Operations accept input in the form of ``iterators`` that point to a sequence of objects to process, and write output to a mutable ``iterator``. + +A high-level view of the available operations can be found in :ref:`ops-summary`. rocPRIM includes a variety of generic operations that are frequently very useful. + +.. note:: + Refer to :ref:`data-type-support` for information on supported datatypes. + +Scope +====== + +An important property of a rocPRIM operation is its scope, which determines the level of the computing model used for processing the operation. The scope determines which parts of the GPU will cooperate to compute the result. The scope has a direct influence on how the data will be subdivided and processed by the compute units (CUs) or VALUs. The rocPRIM operation scopes are: + +* *Device/Grid*: The operation and data will be split and dispatched to all the CUs. +* :term:`Block`: The operation should take place within the same block by the same CU. +* :term:`Warp`: As above, but with a warp and a VALU. +* *Thread*: The operation will take place sequentially in the same thread. Thread-wide operations are also called *Utilities* since they coincide with utility functions used on a CPU. + +The scope has an impact on how the operation is initiated: + +* *Device/Grid*: The operation is a kernel, thus it is dispatched with its own grid/block dimensions. +* *Block/Warp/Thread*: The operation is a function call, and inherits the dimensions of the current kernel. + +This also dictates how synchronization should be done to wait for completion: + +* *Device/Grid*: Synchronization is done via wait lists and queue barriers (``stream``). +* *Block/Warp/Thread*: The operation is in the same control flow as the caller threads. Synchronization is done via memory barriers. 
diff --git a/docs/device_ops/adjacent_difference.rst b/docs/device_ops/adjacent_difference.rst index 90213cbee..987614cd4 100644 --- a/docs/device_ops/adjacent_difference.rst +++ b/docs/device_ops/adjacent_difference.rst @@ -1,28 +1,35 @@ -Adjacent difference -------------------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-adjacent_difference: + +******************************************************************** + Adjacent difference +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +======================== .. doxygenstruct:: rocprim::adjacent_difference_config left -~~~~ +====== .. doxygenfunction:: rocprim::adjacent_difference(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false) left, inplace -~~~~~~~~~~~~~ +=============== .. doxygenfunction:: rocprim::adjacent_difference_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false) right -~~~~~ +============= .. doxygenfunction:: rocprim::adjacent_difference_right(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false) right, inplace -~~~~~~~~~~~~~~ +=============== .. 
doxygenfunction:: rocprim::adjacent_difference_right_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false) diff --git a/docs/device_ops/binary_search.rst b/docs/device_ops/binary_search.rst index 8e33ba3b0..eb2b38f2b 100644 --- a/docs/device_ops/binary_search.rst +++ b/docs/device_ops/binary_search.rst @@ -1,4 +1,11 @@ -Binary Search -------------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-binary_search: + +******************************************************************** + Binary Search +******************************************************************** .. doxygenfunction:: rocprim::binary_search(void *temporary_storage, size_t &storage_size, HaystackIterator haystack, NeedlesIterator needles, OutputIterator output, size_t haystack_size, size_t needles_size, CompareFunction compare_op=CompareFunction(), hipStream_t stream=0, bool debug_synchronous=false) diff --git a/docs/device_ops/config.rst b/docs/device_ops/config.rst index ef567f178..badfae3c5 100644 --- a/docs/device_ops/config.rst +++ b/docs/device_ops/config.rst @@ -1,9 +1,16 @@ -Configuring the Kernels -======================= +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-config: + +******************************************************************** + Configuring the Kernels +******************************************************************** A kernel config is a way to select the grid/block dimensions, but also how the data will be fetched and stored (the algorithms used for -``load``/``store`` ) for the operations using them (such as ``select``). +``load`` and ``store``) for the operations using them (such as ``select``). .. 
doxygenstruct:: rocprim::kernel_config diff --git a/docs/device_ops/histogram.rst b/docs/device_ops/histogram.rst index 10d03d217..3627060be 100644 --- a/docs/device_ops/histogram.rst +++ b/docs/device_ops/histogram.rst @@ -1,31 +1,38 @@ -Histogram ---------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-histogram: + +******************************************************************** + Histogram +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +======================== .. doxygenstruct:: rocprim::histogram_config histogram_even -~~~~~~~~~~~~~~ +================ .. doxygenfunction:: rocprim::histogram_even(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram, unsigned int levels, Level lower_level, Level upper_level, hipStream_t stream=0, bool debug_synchronous=false) .. doxygenfunction:: rocprim::histogram_even(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram, unsigned int levels, Level lower_level, Level upper_level, hipStream_t stream=0, bool debug_synchronous=false) multi_histogram_even -~~~~~~~~~~~~~~~~~~~~ +===================== .. doxygenfunction:: rocprim::multi_histogram_even(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level lower_level[ActiveChannels], Level upper_level[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false) .. 
doxygenfunction:: rocprim::multi_histogram_even(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level lower_level[ActiveChannels], Level upper_level[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false) histogram_range -~~~~~~~~~~~~~~~ +================ .. doxygenfunction:: rocprim::histogram_range(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram, unsigned int levels, Level *level_values, hipStream_t stream=0, bool debug_synchronous=false) .. doxygenfunction:: rocprim::histogram_range(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram, unsigned int levels, Level *level_values, hipStream_t stream=0, bool debug_synchronous=false) multi_histogram_range -~~~~~~~~~~~~~~~~~~~~~ +====================== .. doxygenfunction:: rocprim::multi_histogram_range(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level *level_values[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false) .. doxygenfunction:: rocprim::multi_histogram_range(void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level *level_values[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false) diff --git a/docs/device_ops/index.rst b/docs/device_ops/index.rst index 85ea3563e..b75bb2184 100644 --- a/docs/device_ops/index.rst +++ b/docs/device_ops/index.rst @@ -1,20 +1,23 @@ -Device-Wide Operations -====================== +.. 
meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation -.. toctree:: - :maxdepth: 6 +.. _dev-index: - config +******************************************************************** + Device-Wide Operations +******************************************************************** - transform - unique - sort - merge - partition - run_length_encoding - scan - select - reduce - adjacent_difference - binary_search - histogram + * :ref:`dev-config` + * :ref:`dev-transform` + * :ref:`dev-unique` + * :ref:`dev-sort` + * :ref:`dev-merge` + * :ref:`dev-partition` + * :ref:`dev-run_length` + * :ref:`dev-scan` + * :ref:`dev-select` + * :ref:`dev-reduce` + * :ref:`dev-adjacent_difference` + * :ref:`dev-binary_search` + * :ref:`dev-histogram` diff --git a/docs/device_ops/merge.rst b/docs/device_ops/merge.rst index 34642e4a8..604e25675 100644 --- a/docs/device_ops/merge.rst +++ b/docs/device_ops/merge.rst @@ -1,13 +1,20 @@ -Merge ------ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-merge: + +******************************************************************** + Merge +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +======================= .. doxygentypedef:: rocprim::merge_config merge -~~~~~ +========== .. doxygenfunction:: rocprim::merge (void *temporary_storage, size_t &storage_size, InputIterator1 input1, InputIterator2 input2, OutputIterator output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) .. 
doxygenfunction:: rocprim::merge (void *temporary_storage, size_t &storage_size, KeysInputIterator1 keys_input1, KeysInputIterator2 keys_input2, KeysOutputIterator keys_output, ValuesInputIterator1 values_input1, ValuesInputIterator2 values_input2, ValuesOutputIterator values_output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) diff --git a/docs/device_ops/partition.rst b/docs/device_ops/partition.rst index 78a234bc2..a10d95920 100644 --- a/docs/device_ops/partition.rst +++ b/docs/device_ops/partition.rst @@ -1,12 +1,19 @@ -Partition ---------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-partition: + +******************************************************************** + Partition +******************************************************************** partition -~~~~~~~~~ +============ .. doxygenfunction:: rocprim::partition(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, UnaryPredicate predicate, const hipStream_t stream=0, const bool debug_synchronous=false) partition_three_way -~~~~~~~~~~~~~~~~~~~ +====================== .. 
doxygenfunction:: rocprim::partition_three_way(void *temporary_storage, size_t &storage_size, InputIterator input, FirstOutputIterator output_first_part, SecondOutputIterator output_second_part, UnselectedOutputIterator output_unselected, SelectedCountOutputIterator selected_count_output, const size_t size, FirstUnaryPredicate select_first_part_op, SecondUnaryPredicate select_second_part_op, const hipStream_t stream = 0, const bool debug_synchronous = false) diff --git a/docs/device_ops/reduce.rst b/docs/device_ops/reduce.rst index be29c4d1d..64d01daf5 100644 --- a/docs/device_ops/reduce.rst +++ b/docs/device_ops/reduce.rst @@ -1,32 +1,39 @@ -Reduce ------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-reduce: + +******************************************************************** + Reduce +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +======================= reduce -...... +--------- .. doxygenstruct:: rocprim::reduce_config reduce_by_key -............. +-------------- .. doxygenstruct:: rocprim::reduce_by_key_config reduce -~~~~~~ +========== .. doxygenfunction:: rocprim::reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) .. doxygenfunction:: rocprim::reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) segmented_reduce -~~~~~~~~~~~~~~~~ +================== .. 
doxygenfunction:: rocprim::segmented_reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, BinaryFunction reduce_op=BinaryFunction(), InitValueType initial_value=InitValueType(), hipStream_t stream=0, bool debug_synchronous=false) reduce_by_key -~~~~~~~~~~~~~ +================= .. doxygenfunction:: rocprim::reduce_by_key(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, ValuesInputIterator values_input, const size_t size, UniqueOutputIterator unique_output, AggregatesOutputIterator aggregates_output, UniqueCountOutputIterator unique_count_output, BinaryFunction reduce_op=BinaryFunction(), KeyCompareFunction key_compare_op=KeyCompareFunction(), hipStream_t stream=0, bool debug_synchronous=false) diff --git a/docs/device_ops/run_length_encoding.rst b/docs/device_ops/run_length_encoding.rst index 9fa0a0d6c..02761c1d4 100644 --- a/docs/device_ops/run_length_encoding.rst +++ b/docs/device_ops/run_length_encoding.rst @@ -1,17 +1,24 @@ -Run Length Encode ------------------ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-run_length: + +******************************************************************** + Run Length Encode +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +========================== .. doxygenstruct:: rocprim::run_length_encode_config run_length_encode -~~~~~~~~~~~~~~~~~ +==================== .. 
doxygenfunction:: rocprim::run_length_encode(void *temporary_storage, size_t &storage_size, InputIterator input, unsigned int size, UniqueOutputIterator unique_output, CountsOutputIterator counts_output, RunsCountOutputIterator runs_count_output, hipStream_t stream=0, bool debug_synchronous=false) run_length_encode_non_trivial_runs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +==================================== .. doxygenfunction:: rocprim::run_length_encode_non_trivial_runs(void *temporary_storage, size_t &storage_size, InputIterator input, unsigned int size, OffsetsOutputIterator offsets_output, CountsOutputIterator counts_output, RunsCountOutputIterator runs_count_output, hipStream_t stream=0, bool debug_synchronous=false) diff --git a/docs/device_ops/scan.rst b/docs/device_ops/scan.rst index 948dfdc67..a088f3155 100644 --- a/docs/device_ops/scan.rst +++ b/docs/device_ops/scan.rst @@ -1,51 +1,58 @@ -Scan ----- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-scan: + +******************************************************************** + Scan +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +======================== scan -.... +--------- .. doxygenstruct:: rocprim::scan_config scan_by_key -........... +----------- .. doxygenstruct:: rocprim::scan_by_key_config scan -~~~~ +========= inclusive -......... +---------- .. doxygenfunction:: rocprim::inclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) exclusive -......... +---------- .. 
doxygenfunction:: rocprim::exclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) segmented, inclusive -.................... +---------------------- .. doxygenfunction:: rocprim::segmented_inclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, BinaryFunction scan_op=BinaryFunction(), hipStream_t stream=0, bool debug_synchronous=false) segmented, exclusive -.................... +----------------------- .. doxygenfunction:: rocprim::segmented_exclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, const InitValueType initial_value, BinaryFunction scan_op=BinaryFunction(), hipStream_t stream=0, bool debug_synchronous=false) scan_by_key -~~~~~~~~~~~ +============ inclusive -......... +------------ .. doxygenfunction:: rocprim::inclusive_scan_by_key(void *const temporary_storage, size_t &storage_size, const KeysInputIterator keys_input, const ValuesInputIterator values_input, const ValuesOutputIterator values_output, const size_t size, const BinaryFunction scan_op=BinaryFunction(), const KeyCompareFunction key_compare_op=KeyCompareFunction(), const hipStream_t stream=0, const bool debug_synchronous=false) exclusive -......... +------------ ..
doxygenfunction:: rocprim::exclusive_scan_by_key(void *const temporary_storage, size_t &storage_size, const KeysInputIterator keys_input, const ValuesInputIterator values_input, const ValuesOutputIterator values_output, const InitialValueType initial_value, const size_t size, const BinaryFunction scan_op=BinaryFunction(), const KeyCompareFunction key_compare_op=KeyCompareFunction(), const hipStream_t stream=0, const bool debug_synchronous=false) diff --git a/docs/device_ops/select.rst b/docs/device_ops/select.rst index c5ecd58d2..6d4b7676d 100644 --- a/docs/device_ops/select.rst +++ b/docs/device_ops/select.rst @@ -1,13 +1,20 @@ -Select ------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-select: + +******************************************************************** + Select +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +======================== .. doxygenstruct:: rocprim::select_config select -~~~~~~ +=============== .. doxygenfunction:: rocprim::select(void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false) .. doxygenfunction:: rocprim::select(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, UnaryPredicate predicate, const hipStream_t stream=0, const bool debug_synchronous=false) diff --git a/docs/device_ops/sort.rst b/docs/device_ops/sort.rst index a0674e59c..7ffa1cf7c 100644 --- a/docs/device_ops/sort.rst +++ b/docs/device_ops/sort.rst @@ -1,69 +1,76 @@ -Sort ----- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. 
_dev-sort: + +******************************************************************** + Sort +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +======================= merge_sort -.......... +----------- .. doxygenstruct:: rocprim::merge_sort_config radix_sort -.......... +------------- .. doxygenstruct:: rocprim::radix_sort_config merge_sort -~~~~~~~~~~ +============ .. doxygenfunction:: rocprim::merge_sort(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, const size_t size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) .. doxygenfunction:: rocprim::merge_sort(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, const size_t size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false) radix_sort_keys -~~~~~~~~~~~~~~~ +================ ascending -......... +---------- .. doxygenfunction:: rocprim::radix_sort_keys(void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, Size size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) descending -.......... +----------- .. doxygenfunction:: rocprim::radix_sort_keys_desc(void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, Size size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) segmented, ascending -.................... +----------------------- .. 
doxygenfunction:: rocprim::segmented_radix_sort_keys(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) segmented, descending -..................... +----------------------- .. doxygenfunction:: rocprim::segmented_radix_sort_keys_desc(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) radix_sort_pairs -~~~~~~~~~~~~~~~~ +==================== ascending -......... +----------- .. doxygenfunction:: rocprim::radix_sort_pairs(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, Size size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) descending -.......... +---------------- .. doxygenfunction:: rocprim::radix_sort_pairs_desc(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, Size size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) segmented, ascending -.................... +------------------------ .. 
doxygenfunction:: rocprim::segmented_radix_sort_pairs(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) segmented, descending -.................... +-------------------------- .. doxygenfunction:: rocprim::segmented_radix_sort_pairs_desc(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false) diff --git a/docs/device_ops/transform.rst b/docs/device_ops/transform.rst index b38eb45bc..bf4ab2256 100644 --- a/docs/device_ops/transform.rst +++ b/docs/device_ops/transform.rst @@ -1,13 +1,20 @@ -Transform ---------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-transform: + +******************************************************************** + Transform +******************************************************************** Configuring the kernel -~~~~~~~~~~~~~~~~~~~~~~ +====================== .. doxygenstruct:: rocprim::transform_config transform -~~~~~~~~~ +========== .. doxygenfunction:: rocprim::transform(InputIterator, OutputIterator, const size_t, UnaryFunction, const hipStream_t stream, bool) ..
doxygenfunction:: rocprim::transform(InputIterator1, InputIterator2, OutputIterator, const size_t, BinaryFunction, const hipStream_t, bool) diff --git a/docs/device_ops/unique.rst b/docs/device_ops/unique.rst index b4d686dc0..5e21c99ba 100644 --- a/docs/device_ops/unique.rst +++ b/docs/device_ops/unique.rst @@ -1,13 +1,20 @@ -Unique ------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-unique: + +******************************************************************** + Unique +******************************************************************** unique -~~~~~~ +--------- .. doxygenfunction:: rocprim::unique(void *, size_t &, InputIterator, OutputIterator, UniqueCountOutputIterator, const size_t, EqualityOp, const hipStream_t, const bool) unique_by_key -~~~~~~~~~~~~~ +-------------- .. doxygenfunction:: rocprim::unique_by_key(void *, size_t &, const KeyIterator, const ValueIterator, const OutputKeyIterator, const OutputValueIterator, const UniqueCountOutputIterator, const size_t, const EqualityOp, const hipStream_t, const bool) diff --git a/docs/index.rst b/docs/index.rst index 694229ca8..75649aeb3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,12 +1,40 @@ -+++++++++++++++++++++++ - rocPRIM Documentation -+++++++++++++++++++++++ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation -``rocPRIM`` is a header-only library providing HIP parallel primitives to ease the maintainability of performant and yet portable GPU-accelerated code on AMD ROCm platform. +.. _rocprim: + +******************************************************************** + rocPRIM documentation +******************************************************************** + +rocPRIM is a header-only library that provides HIP parallel primitives. 
The purpose of the library is to ease the maintainability of performant, yet portable GPU-accelerated code on the AMD ROCm platform. rocPRIM is written in HIP and has been optimized for AMD's latest discrete GPUs. For more information refer to :ref:`rocprim-intro`. + +The code is open and hosted at: https://github.com/ROCmSoftwarePlatform/rocPRIM + +The rocPRIM documentation is structured as follows: + +.. grid:: 2 + + .. grid-item-card:: Conceptual + + * :ref:`rocprim-intro` + * :ref:`glossary` + + .. grid-item-card:: API reference + + * :ref:`ops-summary` + * :ref:`data-type-support` + * :ref:`dev-index` + * :ref:`block-index` + * :ref:`warp-index` + * :ref:`thread_ops` + * :ref:`iterators` + * :ref:`intrinsics` + +To contribute to the documentation refer to `Contributing to ROCm `_. + +You can find licensing information on the `Licensing `_ page. -Acknowledgements -================ -The following contributors helped to make this documentation better: -* `v01dXYZ `_ has proposed a new structure for the documentation. diff --git a/docs/intro.rst b/docs/intro.rst deleted file mode 100644 index 4a3b8ffa6..000000000 --- a/docs/intro.rst +++ /dev/null @@ -1,30 +0,0 @@ -Introduction -============ - -Operations and Sequences ------------------------- - -A ``rocPRIM`` operation is a computation over a sequence of objects returning one value (e.g. ``reduce``) , another sequence (e.g. ``sort``) or multiple sequences (e.g. ``partition``). The elements of the sequence could be of any type or class, although template specialization allows ``rocPRIM`` to optimize the computations over the usual numerical datatypes. Operations handle sequences by expecting ``iterators`` as input and mutable ones as output. - -A high level view of the available operations could be consulted there: :doc:`/ops_summary`. As you can see, those are really generic operations that are difficult to avoid on a day to day basis. 
- -Scope ------ - -An important property of a ``rocPRIM`` operation is its scope defining at which level of the computing model the processing will take place. That means which parts of the GPU will cooperate together to compute the result. -The scope has a direct influence on how the data will be subdivided into chunks to be eventually processed by the computing units or VALUs. - -* *Device/Grid* the operation and data will be split and dispatched to all the CUs. -* :term:`Block` The operation should take place within the same block by the same CU. -* :term:`Warp` as above but with a warp and a VALU. -* *Thread* The operation will take place sequentially in the same thread. We also call those thread-wide operations *Utilities* since it perfectly coincides to utility functions we use on a CPU. - -The scope has an impact on how the operation is initiated: - -* *Device/Grid* it is a kernel, thus it is dispatched with its own grid/block dimensions. -* *Block/Wrap/Thread* it is a function call, and inherits the dimensions of the current kernel. - -This point dictates how synchronization should be done to wait for completion: - -* *Device/Grid* Synchronization is done via wait lists and queue barriers (``stream``). -* *Block/Wrap/Thread* it is in the same control flow of the caller threads. Synchronization is done via memory barriers. diff --git a/docs/reference/acknowledge.rst b/docs/reference/acknowledge.rst new file mode 100644 index 000000000..362cdfa0b --- /dev/null +++ b/docs/reference/acknowledge.rst @@ -0,0 +1,13 @@ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _acknowledge: + +******************************************************************** + Acknowledgements +******************************************************************** + +The following contributors helped to make this documentation better: + +* `v01dXYZ `_ has proposed a new structure for the documentation. 
diff --git a/docs/data-type-support.rst b/docs/reference/data-type-support.rst similarity index 68% rename from docs/data-type-support.rst rename to docs/reference/data-type-support.rst index aeca5cdc8..35874bad8 100644 --- a/docs/data-type-support.rst +++ b/docs/reference/data-type-support.rst @@ -8,7 +8,7 @@ Data type support ****************************************** -* Supported input and output types. +The following table shows the supported input and output data types. .. list-table:: Supported Input/Output Types :header-rows: 1 @@ -59,7 +59,8 @@ Data type support - double - ✅ -* The ⚠️ means that the data type is mostly supported, but there are some API tests, that do not work. - * The ``block_histogram`` test fails with ``int8``. - * The ``device_histogram`` and ``device_reduce_by_key`` doesn't work with ``rocprim::half`` and ``rocprim::bfloat16``. - * The ``device_run_length_encode``, ``warp_exchange`` and ``warp_load`` doesn't work with ``rocprim::half``. +The ⚠️ means that the data type is mostly supported, but there are some API tests that do not work. + + * The ``block_histogram`` test fails with ``int8``. + * The ``device_histogram`` and ``device_reduce_by_key`` don't work with ``rocprim::half`` and ``rocprim::bfloat16``. + * The ``device_run_length_encode``, ``warp_exchange`` and ``warp_load`` don't work with ``rocprim::half``. diff --git a/docs/intrinsics.rst b/docs/reference/intrinsics.rst similarity index 72% rename from docs/intrinsics.rst rename to docs/reference/intrinsics.rst index d4b379427..0aac48f55 100644 --- a/docs/intrinsics.rst +++ b/docs/reference/intrinsics.rst @@ -1,15 +1,23 @@ -Intrinsics -========== +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _intrinsics: + +******************************************************************** + Intrinsics +******************************************************************** + Bitwise -------- +======== ..
doxygenfunction:: rocprim::get_bit(int x, int i) .. doxygenfunction:: rocprim::bit_count(unsigned int x) .. doxygenfunction:: rocprim::bit_count(unsigned long long x) Warp size ---------- +=========== .. doxygenfunction:: rocprim::warp_size() .. doxygenfunction:: rocprim::host_warp_size(const int device_id, unsigned int& warp_size) @@ -17,32 +25,31 @@ Warp size .. doxygenfunction:: rocprim::device_warp_size() Lane and Warp ID ----------------- +================= .. doxygengroup:: intrinsicsmodule_warp_id :content-only: Flat ID -------- +========== .. doxygengroup:: intrinsicsmodule_flat_id :content-only: Flat Size ---------- +=========== .. doxygenfunction:: rocprim::flat_block_size() .. doxygenfunction:: rocprim::flat_tile_size() Synchronization ---------------- +================= .. doxygenfunction:: rocprim::syncthreads() .. doxygenfunction:: rocprim::wave_barrier() Active threads --------------- - +================== .. doxygenfunction:: rocprim::ballot (int predicate) .. doxygenfunction:: rocprim::masked_bit_count (lane_mask_type x, unsigned int add=0) diff --git a/docs/iterators.rst b/docs/reference/iterators.rst similarity index 75% rename from docs/iterators.rst rename to docs/reference/iterators.rst index fe6784b24..8c88db069 100644 --- a/docs/iterators.rst +++ b/docs/reference/iterators.rst @@ -1,8 +1,15 @@ -Iterators -========= +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _iterators: + +******************************************************************** + Iterators +******************************************************************** Constant --------- +========== .. doxygenclass:: rocprim::constant_iterator :members: @@ -18,7 +25,7 @@ Constant ... Counting --------- +========== .. doxygenclass:: rocprim::counting_iterator :members: @@ -33,7 +40,7 @@ Counting ... Transform ---------- +============ .. 
doxygenclass:: rocprim::transform_iterator :members: @@ -47,7 +54,7 @@ Transform ... Pairing Values with Indices ---------------------------- +============================= .. doxygenclass:: rocprim::arg_index_iterator :members: @@ -60,7 +67,7 @@ Pairing Values with Indices ... Zip ---- +============== .. doxygenclass:: rocprim::zip_iterator :members: @@ -73,13 +80,13 @@ Zip ... Discard -------- +============== .. doxygenclass:: rocprim::discard_iterator :members: Texture Cache -------------- +================ .. doxygenclass:: rocprim::texture_cache_iterator :members: diff --git a/docs/ops_summary.rst b/docs/reference/ops_summary.rst similarity index 56% rename from docs/ops_summary.rst rename to docs/reference/ops_summary.rst index c8233375c..01d7238fc 100644 --- a/docs/ops_summary.rst +++ b/docs/reference/ops_summary.rst @@ -1,47 +1,55 @@ -Summary of the Operations -========================= +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _ops-summary: + +******************************************************************** + Summary of the Operations +******************************************************************** Basics ------- +========= * ``transform`` applies a function to each element of the sequence, equivalent to the functional operation ``map`` -* ``select`` takes the first N elements of the sequence satisfying a condition (via a selection mask or a predicate function) -* ``unique`` -* ``histogram`` generates a summary of the statistical distribution of the sequence. 
+* ``select`` takes the first ``N`` elements of the sequence satisfying a condition (via a selection mask or a predicate function) +* ``unique`` returns unique elements within a sequence +* ``histogram`` generates a summary of the statistical distribution of the sequence Aggregation ------------ +============ -* ``reduce`` traverses the sequence while accumulating some data, equivalent to the functional operation ``fold_left``. -* ``scan`` is the cumulative version of ``reduce`` which returns the sequence of the intermediate values taken by the accumulator. +* ``reduce`` traverses the sequence while accumulating some data, equivalent to the functional operation ``fold_left`` +* ``scan`` is the cumulative version of ``reduce`` which returns the sequence of the intermediate values taken by the accumulator Differentiation ---------------- +================= -* ``adjacent_difference`` computes the difference between the current element and the previous or next one in the sequence. -* ``discontinuity`` detects value change between the current element and the previous or next one in the sequence. +* ``adjacent_difference`` computes the difference between the current element and the previous or next one in the sequence +* ``discontinuity`` detects value change between the current element and the previous or next one in the sequence Rearrangement -------------- +================ -* ``sort`` rearranges the sequence by sorting it. It could be according to a comparison operator or a value using a radix approach. +* ``sort`` rearranges the sequence by sorting it. It could be according to a comparison operator or a value using a radix approach * ``exchange`` rearranges the elements according to a different stride configuration which is equivalent to a tensor axis transposition -* ``shuffle`` rotates the elements.
+* ``shuffle`` rotates the elements Partition/Merge ---------------- +==================== -* ``partition`` divides the sequence into two or more sequences according to a predicate while preserving some ordering properties. -* ``merge`` merges two ordered sequences into one while preserving the order. +* ``partition`` divides the sequence into two or more sequences according to a predicate while preserving some ordering properties +* ``merge`` merges two ordered sequences into one while preserving the order Data Movement -------------- +=============== -* ``store`` stores the sequence to a continuous memory zone. There are variations to use an optimized path or to specify how to store the sequence to better fit the access patterns of the CUs. -* ``load`` the complementary operations of the above ones. +* ``store`` stores the sequence to a continuous memory zone. There are variations to use an optimized path or to specify how to store the sequence to better fit the access patterns of the CUs +* ``load`` the complementary operations of the above ones Other operations ----------------- +====================== * ``run_length_encode`` generates a compact representation of a sequence -* ``binary_search`` finds for each element the index of an element with the same value in another sequence (which has to be sorted). +* ``binary_search`` finds for each element the index of an element with the same value in another sequence (which has to be sorted) +* ``config`` selects a kernel's grid/block dimensions to tune the operation to a GPU diff --git a/docs/reference/reference.rst b/docs/reference/reference.rst new file mode 100644 index 000000000..1b955a7cf --- /dev/null +++ b/docs/reference/reference.rst @@ -0,0 +1,18 @@ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. 
_reference: + +******************************************************************** + rocPRIM API Reference +******************************************************************** + +* :ref:`ops-summary` +* :ref:`data-type-support` +* :ref:`dev-index` +* :ref:`block-index` +* :ref:`warp-index` +* :ref:`thread_ops` +* :ref:`iterators` +* :ref:`intrinsics` diff --git a/docs/thread_ops.rst b/docs/reference/thread_ops.rst similarity index 72% rename from docs/thread_ops.rst rename to docs/reference/thread_ops.rst index 7c4538a38..acc4fdcbb 100644 --- a/docs/thread_ops.rst +++ b/docs/reference/thread_ops.rst @@ -1,18 +1,25 @@ -Thread-Level Operations (Utilities) -=================================== +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _thread_ops: + +******************************************************************** + Thread-Level Operations (Utilities) +******************************************************************** Scan ----- +============== exclusive -......... +----------- .. doxygenfunction:: thread_scan_exclusive(T (&input)[LENGTH], T (&output)[LENGTH], ScanOp scan_op, T prefix, bool apply_prefix = true) .. doxygenfunction:: thread_scan_exclusive(T *input, T *output, ScanOp scan_op, T prefix, bool apply_prefix = true) .. doxygenfunction:: thread_scan_exclusive(T inclusive, T exclusive, T *input, T *output, ScanOp scan_op, Int2Type) inclusive -......... +----------- .. doxygenfunction:: thread_scan_inclusive (T inclusive, T *input, T *output, ScanOp scan_op, Int2Type< LENGTH >) .. doxygenfunction:: thread_scan_inclusive (T *input, T *output, ScanOp scan_op) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 613179623..5c9c38c94 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -1,18 +1,65 @@ -# Anywhere {branch} is used, the branch name will be substituted. -# These comments will also be removed. 
+defaults: + numbered: False root: index subtrees: - - entries: - - file: intro - - file: ops_summary - - file: data-type-support - - file: device_ops/index - - file: block_ops/index - - file: warp_ops/index - - file: thread_ops - - file: iterators - - file: intrinsics - - file: glossary - - caption: About - entries: - - file: license +- entries: + - file: concepts/concepts.rst + subtrees: + - entries: + - file: concepts/intro.rst + - file: concepts/glossary.rst + - file: reference/reference.rst + title: API Reference + subtrees: + - entries: + - file: reference/ops_summary.rst + - file: reference/data-type-support.rst + - file: device_ops/index.rst + subtrees: + - entries: + - file: device_ops/config.rst + - file: device_ops/transform.rst + - file: device_ops/unique.rst + - file: device_ops/sort.rst + - file: device_ops/merge.rst + - file: device_ops/partition.rst + - file: device_ops/run_length_encoding.rst + - file: device_ops/scan.rst + - file: device_ops/select.rst + - file: device_ops/reduce.rst + - file: device_ops/adjacent_difference.rst + - file: device_ops/binary_search.rst + - file: device_ops/histogram.rst + - file: block_ops/index.rst + subtrees: + - entries: + - file: block_ops/ops_classes/index.rst + subtrees: + - entries: + - file: block_ops/ops_classes/load.rst + - file: block_ops/ops_classes/store.rst + - file: block_ops/ops_classes/adjacent_difference.rst + - file: block_ops/ops_classes/discontinuity.rst + - file: block_ops/ops_classes/scan.rst + - file: block_ops/ops_classes/reduce.rst + - file: block_ops/ops_classes/shuffle.rst + - file: block_ops/ops_classes/exchange.rst + - file: block_ops/ops_classes/sort.rst + - file: block_ops/ops_classes/histogram.rst + - file: block_ops/data_mov_funcs.rst + - file: warp_ops/index.rst + subtrees: + - entries: + - file: warp_ops/load.rst + - file: warp_ops/store.rst + - file: warp_ops/reduce.rst + - file: warp_ops/scan.rst + - file: warp_ops/sort.rst + - file: warp_ops/shuffle.rst + - file:
warp_ops/exchange.rst + - file: reference/thread_ops.rst + - file: reference/iterators.rst + - file: reference/intrinsics.rst + - file: reference/reorder.rst + - file: reference/acknowledge.rst + - file: license.rst \ No newline at end of file diff --git a/docs/warp_ops/exchange.rst b/docs/warp_ops/exchange.rst index 0410c83b5..88490875b 100644 --- a/docs/warp_ops/exchange.rst +++ b/docs/warp_ops/exchange.rst @@ -1,5 +1,12 @@ -Exchange --------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _warp-exchange: + +******************************************************************** + Exchange +******************************************************************** .. doxygenclass:: rocprim::warp_exchange :members: diff --git a/docs/warp_ops/index.rst b/docs/warp_ops/index.rst index e8fda8ee2..176bb474a 100644 --- a/docs/warp_ops/index.rst +++ b/docs/warp_ops/index.rst @@ -1,12 +1,17 @@ -Warp-Level Operations -===================== +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation -.. toctree:: +.. _warp-index: - load - store - reduce - scan - sort - shuffle - exchange +******************************************************************** + Warp-Level Operations +******************************************************************** + + * :ref:`warp-load` + * :ref:`warp-store` + * :ref:`warp-reduce` + * :ref:`warp-scan` + * :ref:`warp-sort` + * :ref:`warp-shuffle` + * :ref:`warp-exchange` diff --git a/docs/warp_ops/load.rst b/docs/warp_ops/load.rst index 26568c60f..af7c713c6 100644 --- a/docs/warp_ops/load.rst +++ b/docs/warp_ops/load.rst @@ -1,13 +1,20 @@ -Load ----- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. 
_warp-load: + +******************************************************************** + Load +******************************************************************** Class -..... +============= .. doxygenclass:: rocprim::warp_load :members: Algorithms -.......... +================ .. doxygenenum:: rocprim::warp_load_method diff --git a/docs/warp_ops/reduce.rst b/docs/warp_ops/reduce.rst index fbc70f7d2..24630e238 100644 --- a/docs/warp_ops/reduce.rst +++ b/docs/warp_ops/reduce.rst @@ -1,5 +1,12 @@ -Reduce ------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _warp-reduce: + +******************************************************************** + Reduce +******************************************************************** .. doxygenclass:: rocprim::warp_reduce :members: diff --git a/docs/warp_ops/scan.rst b/docs/warp_ops/scan.rst index 89c923bd9..0508c9a30 100644 --- a/docs/warp_ops/scan.rst +++ b/docs/warp_ops/scan.rst @@ -1,5 +1,12 @@ -Scan ----- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _warp-scan: + +******************************************************************** + Scan +******************************************************************** .. doxygenclass:: rocprim::warp_scan :members: diff --git a/docs/warp_ops/shuffle.rst b/docs/warp_ops/shuffle.rst index d5aa02ff6..cc4dbeee0 100644 --- a/docs/warp_ops/shuffle.rst +++ b/docs/warp_ops/shuffle.rst @@ -1,5 +1,12 @@ -Shuffle -------- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _warp-shuffle: + +******************************************************************** + Shuffle +******************************************************************** .. doxygenfunction:: rocprim::warp_shuffle (const T &input, const int src_lane, const int width) .. 
doxygenfunction:: rocprim::warp_shuffle_down (const T &input, const unsigned int delta, const int width) diff --git a/docs/warp_ops/sort.rst b/docs/warp_ops/sort.rst index dee641a5b..e158a35bf 100644 --- a/docs/warp_ops/sort.rst +++ b/docs/warp_ops/sort.rst @@ -1,5 +1,12 @@ -Sort ----- +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _warp-sort: + +******************************************************************** + Sort +******************************************************************** .. doxygenclass:: rocprim::warp_sort :members: diff --git a/docs/warp_ops/store.rst b/docs/warp_ops/store.rst index e6ab791dd..805fe144a 100644 --- a/docs/warp_ops/store.rst +++ b/docs/warp_ops/store.rst @@ -1,13 +1,20 @@ -Store ------ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _warp-store: + +******************************************************************** + Store +******************************************************************** Class -..... +========== .. doxygenclass:: rocprim::warp_store :members: Algorithms -.......... +================ .. doxygenenum:: rocprim::warp_store_method