diff --git a/.wordlist.txt b/.wordlist.txt index 31db060a15..9be9169e8a 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -1,688 +1,4 @@ -AAC -ABI -ACE -ACEs -AccVGPR -AccVGPRs -ALU -AMD -AMDGPU -AMDGPUs -AMDMIGraphX -AMI -AOCC -AOMP -APIC -APIs -APU -ASIC -ASICs -ASan -ASAN -ASm -ATI -AddressSanitizer -AlexNet -Arb -BLAS -BMC -BitCode -Blit -Bluefield -CCD -CDNA -CIFAR -CLI -CLion -CMake -CMakeLists -CMakePackage -CP -CPC -CPF -CPP -CPU -CPUs -CSC -CSE -CSV -CSn -CTests -CU -CUDA -CUs -CXX -Cavium -CentOS -ChatGPT -CoRR -Codespaces -Commitizen -CommonMark -Concretized -Conda -ConnectX -DGEMM -DKMS -DL -DMA -DNN -DNNL -DPM -DRI -DW -DWORD -Dask -DataFrame -DataLoader -DataParallel -DeepSpeed -Dependabot -DevCap -Dockerfile -Doxygen -ELMo -ENDPGM -EPYC -ESXi -FFT -FFTs -FFmpeg -FHS -FMA -FP -Filesystem -Flang -Fortran -Fuyu -GALB -GCD -GCDs -GCN -GDB -GDDR -GDR -GDS -GEMM -GEMMs -GFortran -GiB -GIM -GL -GLXT -GMI -GPG -GPR -GPT -GPU -GPU's -GPUs -GRBM -GenAI -GenZ -GitHub -Gitpod -HBM -HCA -HIPCC -HIPExtension -HIPIFY -HPC -HPCG -HPE -HPL -HSA -HWE -Haswell -Higgs -Hyperparameters -ICV -IDE -IDEs -IMDb -IOMMU -IOP -IOPM -IOV -IRQ -ISA -ISV -ISVs -ImageNet -InfiniBand -Inlines -IntelliSense -Intersphinx -Intra -Ioffe -JSON -Jupyter -KFD -KiB -KVM -Keras -Khronos -LAPACK -LCLK -LDS -LLM -LLMs -LLVM -LM -LSAN -LSan -LTS -LoRA -MEM -MERCHANTABILITY -MFMA -MiB -MIGraphX -MIOpen -MIOpenGEMM -MIVisionX -MLM -MMA -MMIO -MMIOH -MNIST -MPI -MSVC -MVAPICH -MVFFR -Makefile -Makefiles -Matplotlib -Megatron -Mellanox -Mellanox's -Meta's -MirroredStrategy -Multicore -Multithreaded -MyEnvironment -MyST -NBIO -NBIOs -NIC -NICs -NLI -NLP -NPS -NSP -NUMA -NVCC -NVIDIA -NVPTX -NaN -Nano -Navi -Noncoherently -NousResearch's -NumPy -OAM -OAMs -OCP -OEM -OFED -OMP -OMPI -OMPT -OMPX -ONNX -OSS -OSU -OpenCL -OpenCV -OpenFabrics -OpenGL -OpenMP -OpenSSL -OpenVX -PCI -PCIe -PEFT -PIL -PILImage -PRNG -PRs -PaLM -Pageable -PeerDirect -Perfetto -PipelineParallel -PnP -PowerShell -PyPi -PyTorch 
-Qcycles -RAII -RCCL -RDC -RDMA -RDNA -RHEL -ROC -ROCProfiler -ROCTracer -ROCclr -ROCdbgapi -ROCgdb -ROCk -ROCm -ROCmCC -ROCmSoftwarePlatform -ROCmValidationSuite -ROCr -RST -RW -Radeon -RelWithDebInfo -Req -Rickle -RoCE -Ryzen -SALU -SBIOS -SCA -SDK -SDMA -SDRAM -SENDMSG -SGPR -SGPRs -SHA -SIGQUIT -SIMD -SIMDs -SKU -SKUs -SLES -SMEM -SMI -SMT -SPI -SQs -SRAM -SRAMECC -SVD -SWE -SerDes -Shlens -Skylake -Softmax -Spack -Supermicro -Szegedy -TCA -TCC -TCI -TCIU -TCP -TCR -TF -TFLOPS -TPU -TPUs -TensorBoard -TensorFlow -TensorParallel -ToC -TorchAudio -TorchMIGraphX -TorchScript -TorchServe -TorchVision -TransferBench -TrapStatus -UAC -UC -UCC -UCX -UIF -USM -UTCL -UTIL -Uncached -Unhandled -VALU -VBIOS -VGPR -VGPRs -VM -VMEM -VMWare -VRAM -VSIX -VSkipped -Vanhoucke -Vulkan -WGP -WGPs -WX -WikiText -Wojna -Workgroups -Writebacks -XCD -XCDs -XGBoost -XGBoost's -XGMI -XT -XTX -Xeon -Xilinx -Xnack -Xteam -YAML -YML -YModel -ZeRO -ZenDNN -accuracies -activations -addr -alloc -allocator -allocators -amdgpu -api -atmi -atomics -autogenerated -avx -awk -backend -backends -benchmarking -bfloat -bilinear -bitsandbytes -blit -boson -bosons -buildable -bursty -bzip -cacheable -cd -centos -centric -changelog -chiplet -cmake -cmd -coalescable -codename -collater -comgr -completers -composable -concretization -config -conformant -convolutional -convolves -cpp -csn -cuBLAS -cuFFT -cuLIB -cuRAND -cuSOLVER -cuSPARSE -dataset -datasets -dataspace -datatype -datatypes -dbgapi -de -deallocation -denoise -denoised -denoises -denormalize -deserializers -detections -dev -devicelibs -devsel -dimensionality -disambiguates -distro -el -embeddings -enablement -endpgm -encodings -env -epilog -etcetera -ethernet -exascale -executables -ffmpeg -filesystem -fortran -galb -gcc -gdb -gfortran -gfx -githooks -github -gnupg -grayscale -gzip -heterogenous -hipBLAS -hipBLASLt -hipCUB -hipFFT -hipLIB -hipRAND -hipSOLVER -hipSPARSE -hipSPARSELt -hipTensor -hipamd -hipblas -hipcub -hipfft -hipfort -hipify 
-hipsolver -hipsparse -hpp -hsa -hsakmt -hyperparameter -ib_core -inband -incrementing -inferencing -inflight -init -initializer -inlining -installable -interprocedural -intra -invariants -invocating -ipo -kdb -latencies -libfabric -libjpeg -libs -linearized -linter -linux -llvm -localscratch -logits -lossy -macOS -matchers -microarchitecture -migraphx -miopen -miopengemm -mivisionx -mkdir -mlirmiopen -mtypes -mvffr -namespace -namespaces -numref -ocl -opencl -opencv -openmp -openssl -optimizers -os -pageable -parallelization -parameterization -passthrough -perfcounter -performant -perl -pragma -pre -prebuilt -precompiled -prefetch -prefetchable -preprocess -preprocessed -preprocessing -prequantized -prerequisites -profiler -protobuf -pseudorandom -py -quasirandom -queueing -rccl -rdc -reStructuredText -reformats -repos -representativeness -req -resampling -rescaling -reusability -roadmap -roc -rocAL -rocALUTION -rocBLAS -rocFFT -rocLIB -rocMLIR -rocPRIM -rocRAND -rocSOLVER -rocSPARSE -rocThrust -rocWMMA -rocalution -rocblas -rocclr -rocfft -rocm -rocminfo -rocprim -rocprof -rocprofiler -rocr -rocrand -rocsolver -rocsparse -rocthrust -roctracer -runtime -runtimes -sL -scalability -scalable -sendmsg -serializers -shader -sharding -sigmoid -sm -smi -softmax -spack -src -stochastically -strided -subdirectory -subexpression -subfolder -subfolders -supercomputing -tensorfloat -th -tokenization -tokenize -tokenized -tokenizer -tokenizes -toolchain -toolchains -toolset -toolsets -torchvision -tqdm -tracebacks -txt -uarch -uncached -uncorrectable -uninstallation -unsqueeze -unstacking -unswitching -untrusted -untuned -upvote -USM -UTCL -UTIL -utils -vL -variational -vdi -vectorizable -vectorization -vectorize -vectorized -vectorizer -vectorizes -vjxb -walkthrough -walkthroughs -wavefront -wavefronts -whitespaces -workgroup -workgroups -writeback -writebacks -wrreq -wzo -xargs -xz -yaml -ysvmadyb -zypper \ No newline at end of file +AQL +builtins +Builtins +NDRange \ No 
newline at end of file diff --git a/docs/how-to/hip_porting_driver_api.md b/docs/how-to/hip_porting_driver_api.md index 99847dbd11..91b9ccb5fe 100644 --- a/docs/how-to/hip_porting_driver_api.md +++ b/docs/how-to/hip_porting_driver_api.md @@ -122,15 +122,15 @@ By default, in the host code, for the `<<<>>>` statement, hip-clang first emits CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction. -|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| -| ---- | ---- | ---- | -| hipModule_t | CUmodule | | -| hipFunction_t | CUfunction | | -| hipCtx_t | CUcontext | | -| hipDevice_t | CUdevice | | -| hipStream_t | CUstream | cudaStream_t | -| hipEvent_t | CUevent | cudaEvent_t | -| hipArray | CUarray | cudaArray | +|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| +| ---- | ---- | ---- | +| `hipModule_t` | `CUmodule` | | +| `hipFunction_t` | `CUfunction` | | +| `hipCtx_t` | `CUcontext` | | +| `hipDevice_t` | `CUdevice` | | +| `hipStream_t` | `CUstream` | `cudaStream_t` | +| `hipEvent_t` | `CUevent` | `cudaEvent_t` | +| `hipArray` | `CUarray` | `cudaArray` | #### Compilation Options diff --git a/docs/how-to/programming_manual.md b/docs/how-to/programming_manual.md index 0c4dfe3a5b..a3bccc6f97 100644 --- a/docs/how-to/programming_manual.md +++ b/docs/how-to/programming_manual.md @@ -7,8 +7,8 @@ hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system, the memory can be accessed directly by the GPU device, and can be read or written with much higher bandwidth than pageable memory obtained with functions such as malloc(). There are two use cases for this host memory: -* Faster HostToDevice and DeviceToHost Data Transfers: -The runtime tracks the hipHostMalloc allocations and can avoid some of the setup required for regular unpinned memory. 
For exact measurements on a specific system, experiment with --unpinned and --pinned switches for the hipBusBandwidth tool. +* Faster `HostToDevice` and `DeviceToHost` Data Transfers: +The runtime tracks the hipHostMalloc allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with `--unpinned` and `--pinned` switches for the `hipBusBandwidth` tool. * Zero-Copy GPU Access: GPU can directly access the host memory over the CPU/GPU interconnect, without need to copy the data. This avoids the need for the copy, but during the kernel access each memory access must traverse the interconnect, which can be tens of times slower than accessing the GPU's local device memory. Zero-copy memory can be a good choice when the memory accesses are infrequent (perhaps only once). Zero-copy memory is typically "Coherent" and thus not cached by the GPU but this can be overridden if desired. @@ -136,13 +136,13 @@ Note, Direct Dispatch is implemented on Linux. It is currently not supported on ## HIP Runtime Compilation -HIP now supports runtime compilation (HIPRTC), the usage of which will provide the possibility of optimizations and performance improvement compared with other APIs via regular offline static compilation. +HIP now supports runtime compilation (HIP RTC), the usage of which will provide the possibility of optimizations and performance improvement compared with other APIs via regular offline static compilation. -HIPRTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes. +HIP RTC APIs accept HIP source files in character string format as input parameters and create handles of programs by compiling the HIP source files without spawning separate processes. 
-For more details on HIPRTC APIs, refer to [HIP Runtime API Reference](https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/index.html). +For more details on HIP RTC APIs, refer to [HIP Runtime API Reference](https://rocm.docs.amd.com/projects/HIP/en/latest/doxygen/html/index.html). -For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example how to program HIP application using runtime compilation mechanism, and a detailed [HIPRTC programming guide](./hip_rtc) is also available. +For Linux developers, the link [here](https://github.com/ROCm/hip-tests/blob/develop/samples/2_Cookbook/23_cmake_hiprtc/saxpy.cpp) shows an example of how to program a HIP application using the runtime compilation mechanism, and a detailed [HIP RTC programming guide](./hip_rtc) is also available. ## HIP Graph diff --git a/docs/reference/terms.md b/docs/reference/terms.md index ce6d51f3ec..4d4be12296 100644 --- a/docs/reference/terms.md +++ b/docs/reference/terms.md @@ -12,10 +12,10 @@ | |thread|thread|work-item| | |warp|warp|sub-group| ||||| -|Thread-
index | threadIdx.x | threadIdx.x | get_local_id(0) | -|Block-
index | blockIdx.x | blockIdx.x | get_group_id(0) | -|Block-
dim | blockDim.x | blockDim.x | get_local_size(0) | -|Grid-dim | gridDim.x | gridDim.x | get_num_groups(0) | +|Thread-
index | `threadIdx.x` | `threadIdx.x` | `get_local_id(0)` | +|Block-
index | `blockIdx.x` | `blockIdx.x` | `get_group_id(0)` | +|Block-
dim | `blockDim.x` | `blockDim.x` | `get_local_size(0)` | +|Grid-dim | `gridDim.x` | `gridDim.x` | `get_num_groups(0)` | ||||| |Device Kernel|`__global__`|`__global__`|`__kernel`| |Device Function|`__device__`|`__device__`|Implied in device compilation| @@ -35,4 +35,4 @@ ## Notes -The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of xyz / 012 indexing for 3D grids. +The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use the reverse order of `xyz` / 012 indexing for 3D grids.