From cfc239ba7f792b8150f00ba94506f268d58d2f7f Mon Sep 17 00:00:00 2001
From: Riccardo Balin
Date: Thu, 31 Oct 2024 20:16:49 +0000
Subject: [PATCH 1/3] Updated LibTorch docs for Aurora

---
 .../data-science/frameworks/libtorch.md       | 295 +++++++++++-------
 1 file changed, 174 insertions(+), 121 deletions(-)

diff --git a/docs/aurora/data-science/frameworks/libtorch.md b/docs/aurora/data-science/frameworks/libtorch.md
index 3da31b7b7..133f3219e 100644
--- a/docs/aurora/data-science/frameworks/libtorch.md
+++ b/docs/aurora/data-science/frameworks/libtorch.md
@@ -9,32 +9,105 @@ During compilation, Intel optimizations will be activated automatically once the IPEX dynamic library is linked.
 
 ## Environment Setup
 
 To use LibTorch on Aurora, load the ML frameworks module
+```bash
+module load frameworks/2024.2.1_u1
 ```
-module use /soft/modulefiles
-module load frameworks/2023.12.15.001
-```
-which will also load the consistent oneAPI SDK and `cmake`.
+which will also load the corresponding oneAPI SDK (version 2024.2) and `cmake`.
 
 ## Torch and IPEX libraries
 
 With the ML frameworks module loaded as shown above, run
-```
+```bash
 python -c 'import torch; print(torch.__path__[0])'
 python -c 'import torch;print(torch.utils.cmake_prefix_path)'
 ```
 to find the path to the Torch libraries, include files, and CMake files.
 
 For the path to the IPEX dynamic library, run
-```
+```bash
 python -c 'import torch; print(torch.__path__[0].replace("torch","intel_extension_for_pytorch"))'
 ```
 
+## Linking LibTorch and IPEX Libraries
+
+When using the CMake build system, the LibTorch and IPEX libraries can be linked to an example C++ application using the following `CMakeLists.txt` file
+```bash
+cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+cmake_policy(SET CMP0074 NEW)
+project(project-name)
+
+find_package(Torch REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -Wl,--no-as-needed")
+set(TORCH_LIBS ${TORCH_LIBRARIES})
+
+find_library(IPEX_LIB intel-ext-pt-gpu PATHS ${INTEL_EXTENSION_FOR_PYTORCH_PATH}/lib NO_DEFAULT_PATH REQUIRED)
+set(TORCH_LIBS ${TORCH_LIBS} ${IPEX_LIB})
+include_directories(SYSTEM ${INTEL_EXTENSION_FOR_PYTORCH_PATH}/include)
+
+add_executable(exe main.cpp)
+target_link_libraries(exe ${TORCH_LIBS})
+
+set_property(TARGET exe PROPERTY CXX_STANDARD 17)
+```
+
+and configuring the build with
+```
+cmake \
+    -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \
+    -DINTEL_EXTENSION_FOR_PYTORCH_PATH=`python -c 'import torch; print(torch.__path__[0].replace("torch","intel_extension_for_pytorch"))'` \
+    ./
+make
+```
+
+
+## Device Introspection
+
+Similarly to PyTorch, LibTorch provides an API to perform introspection on the devices available on the system.
+The simple code below shows how to check if XPU devices are available, how many are present, and how to loop through them to discover some properties.
+
+```bash
+#include <torch/torch.h>
+#include <c10/xpu/XPUFunctions.h>
+
+int main(int argc, const char* argv[])
+{
+    torch::DeviceType device;
+    int num_devices = 0;
+    if (torch::xpu::is_available()) {
+        std::cout << "XPU devices detected" << std::endl;
+        device = torch::kXPU;
+
+        num_devices = torch::xpu::device_count();
+        std::cout << "Number of XPU devices: " << num_devices << std::endl;
+
+
+        for (int i = 0; i < num_devices; ++i) {
+            c10::xpu::set_device(i);
+            std::cout << "Device " << i << ":" << std::endl;
+            //std::string device_name = c10::xpu::get_device_name();
+            //std::cout << "Device " << i << ": " << device_name << std::endl;
+
+            c10::xpu::DeviceProp device_prop{};
+            c10::xpu::get_device_properties(&device_prop, i);
+            std::cout << "  Name: " << device_prop.name << std::endl;
+            std::cout << "  Total memory: " << device_prop.global_mem_size / (1024 * 1024) << " MB" << std::endl;
+        }
+    } else {
+        device = torch::kCPU;
+        std::cout << "No XPU devices detected, setting device to CPU" << std::endl;
+    }
+
+    return 0;
+}
+```
+
 ## Model Inferencing Using the Torch API
-This example shows how to perform inference on the ResNet50 model using only the LibTorch API.
-First, get a jit-traced version of the model running `resnet50_trace.py` below.
+
+This example shows how to perform inference with the ResNet50 model using LibTorch.
+First, get a jit-traced version of the model executing `python resnet50_trace.py` (shwn below) on a compute node.
 ```
 import torch
 import torchvision
@@ -58,81 +131,53 @@ print(f"Inference time: {toc-tic}")
 torch.jit.save(model_jit, f"resnet50_jit.pt")
 ```
 
-Then, use the source code in `inference-example.cpp`
-```
+Then, build `inference-example.cpp` (shown below)
+```bash
 #include <torch/script.h>
 #include <iostream>
-#include <memory>
 
 int main(int argc, const char* argv[]) {
-  torch::jit::script::Module model;
-  try {
-    model = torch::jit::load(argv[1]);
-    std::cout << "Loaded the model\n";
-  }
-  catch (const c10::Error& e) {
-    std::cerr << "error loading the model\n";
-    return -1;
-  }
-  // Upload model to GPU
-  model.to(torch::Device(torch::kXPU));
-  std::cout << "Model offloaded to GPU\n\n";
-
-  auto options = torch::TensorOptions()
+    torch::jit::script::Module model;
+    try {
+        model = torch::jit::load(argv[1]);
+        std::cout << "Loaded the model\n";
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+        return -1;
+    }
+
+    model.to(torch::Device(torch::kXPU));
+    std::cout << "Model offloaded to GPU\n\n";
+
+    auto options = torch::TensorOptions()
                        .dtype(torch::kFloat32)
                        .device(torch::kXPU);
-  torch::Tensor input_tensor = torch::rand({1,3,224,224}, options);
-  assert(input_tensor.dtype() == torch::kFloat32);
-  assert(input_tensor.device().type() == torch::kXPU);
-  std::cout << "Created the input tesor on GPU\n";
+    torch::Tensor input_tensor = torch::rand({1,3,224,224}, options);
+    assert(input_tensor.dtype() == torch::kFloat32);
+    assert(input_tensor.device().type() == torch::kXPU);
+    std::cout << "Created the input tensor on GPU\n";
 
-  torch::Tensor output = model.forward({input_tensor}).toTensor();
-  std::cout << "Performed inference\n\n";
+    torch::Tensor output = model.forward({input_tensor}).toTensor();
+    std::cout << "Performed inference\n\n";
 
-  std::cout << "Predicted tensor is : \n";
-  std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/10) << '\n';
+    std::cout << "Slice of predicted tensor is : \n";
+    std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/10) << '\n';
 
-  return 0;
+    return 0;
 }
 ```
 
-and the `CMakeLists.txt` file
-
-```
-cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
-cmake_policy(SET CMP0074 NEW)
-project(inference-example)
-
-find_package(Torch REQUIRED)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -Wl,--no-as-needed")
-
-add_executable(inference-example inference-example.cpp)
-target_link_libraries(inference-example "${TORCH_LIBRARIES}" "${INTEL_EXTENSION_FOR_PYTORCH_PATH}/lib/libintel-ext-pt-gpu.so")
-
-set_property(TARGET inference-example PROPERTY CXX_STANDARD 17)
-```
-
-to build the inference example.
-
-Finally, execute the `doConfig.sh` script below
-```
-#!/bin/bash
-
-cmake \
-    -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \
-    -DINTEL_EXTENSION_FOR_PYTORCH_PATH=`python -c 'import torch; print(torch.__path__[0].replace("torch","intel_extension_for_pytorch"))'` \
-    ./
+and execute it with `./inference-example ./resnet50_jit.pt`.
-
-make
-./inference-example ./resnet50_jit.pt
-```
 
 
 ## LibTorch Interoperability with SYCL Pipelines
-The LibTorch API can be integrated with data pilelines using SYCL to offload and operate on the input and output data on the Intel Max 1550 GPU.
-The code below extends the above example of performing inference with the ResNet50 model by first generating the input data on the CPU, then offloading it to the GPU with SYCL, and finally passing the device pointer to LibTorch for inference.
+
+The LibTorch API can be integrated with data pipelines using SYCL to operate on input and output data already offloaded to the Intel Max 1550 GPU.
+The code below extends the above example of performing inference with the ResNet50 model by first generating the input data on the CPU, then offloading it to the GPU with SYCL, and finally passing the device pointer to LibTorch for inference using `torch::from_blob()`, which creates a Torch tensor from a data pointer with zero-copy.
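+In essence, the zero-copy wrapping reduces to a single call. Below is a minimal sketch (the pointer name `d_ptr` and the tensor shape are illustrative assumptions; the pointer must already hold the input values on the XPU):
+```c++
+// Hypothetical sketch: wrap an existing SYCL device allocation in a Torch tensor without copying.
+// d_ptr is assumed to be a float* returned by sycl::malloc_device<float>() and already filled on the device.
+auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kXPU);
+torch::Tensor input = torch::from_blob(d_ptr, {1, 3, 224, 224}, options);
+```
+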
 The source code for `inference-example.cpp` is modified as follows
-```
+```bash
 #include <torch/script.h>
 #include <iostream>
 #include <sycl/sycl.hpp>
@@ -143,78 +188,86 @@ const int N_BATCH = 1;
 const int N_CHANNELS = 3;
 const int N_PIXELS = 224;
 const int INPUTS_SIZE = N_BATCH*N_CHANNELS*N_PIXELS*N_PIXELS;
+const int OUTPUTS_SIZE = N_BATCH*N_CHANNELS;
 
 int main(int argc, const char* argv[]) {
-  torch::jit::script::Module model;
-  try {
-    model = torch::jit::load(argv[1]);
-    std::cout << "Loaded the model\n";
-  }
-  catch (const c10::Error& e) {
-    std::cerr << "error loading the model\n";
-    return -1;
-  }
-  // Upload model to GPU
-  model.to(torch::Device(torch::kXPU));
-  std::cout << "Model offloaded to GPU\n\n";
-
-  // Create the input data on the host
-  std::vector<float> inputs(INPUTS_SIZE);
-  srand(12345);
-  for (int i=0; i<INPUTS_SIZE; ++i) {
-    inputs[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
-  }
-  std::cout << "Generated input data on the host \n\n";
-
-  // Move input data to the device with SYCL
-  sycl::queue Q(sycl::gpu_selector_v);
-  std::cout << "SYCL running on "
+    torch::jit::script::Module model;
+    try {
+        model = torch::jit::load(argv[1]);
+        std::cout << "Loaded the model\n";
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+        return -1;
+    }
+
+    model.to(torch::Device(torch::kXPU));
+    std::cout << "Model offloaded to GPU\n\n";
+
+    // Create the input data on the host
+    std::vector<float> inputs(INPUTS_SIZE);
+    srand(12345);
+    for (int i=0; i<INPUTS_SIZE; ++i) {
+        inputs[i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+    }
+    std::cout << "Generated input data on the host \n\n";
+
+    // Move input data to the device with SYCL
+    sycl::queue Q(sycl::gpu_selector_v);
+    std::cout << "SYCL running on "
              << Q.get_device().get_info<sycl::info::device::name>() << "\n\n";
-  float *d_inputs = sycl::malloc_device<float>(INPUTS_SIZE, Q);
-  Q.memcpy((void *) d_inputs, (void *) inputs.data(), INPUTS_SIZE*sizeof(float));
-  Q.wait();
-
-  // Convert input array to Torch tensor
-  auto options = torch::TensorOptions()
+    float *d_inputs = sycl::malloc_device<float>(INPUTS_SIZE, Q);
+    Q.memcpy((void *) d_inputs, (void *) inputs.data(), INPUTS_SIZE*sizeof(float));
+    Q.wait();
+
+    // Pre-allocate the output array on device and fill with a number
+    float *d_outputs = sycl::malloc_device<float>(OUTPUTS_SIZE, Q);
+    Q.submit([&](sycl::handler &cgh) {
+        cgh.parallel_for(OUTPUTS_SIZE, [=](sycl::id<1> idx) {
+            d_outputs[idx] = 1.2345;
+        });
+    });
+    Q.wait();
+    std::cout << "Offloaded input data to the GPU \n\n";
+
+    // Convert input array to Torch tensor
+    auto options = torch::TensorOptions()
                        .dtype(torch::kFloat32)
                        .device(torch::kXPU);
-  torch::Tensor input_tensor = at::from_blob(d_inputs, {N_BATCH,N_CHANNELS,N_PIXELS,N_PIXELS},
-                                             nullptr, at::device(torch::kXPU).dtype(torch::kFloat32),
-                                             torch::kXPU)
-                                   .to(torch::kXPU);
-  assert(input_tensor.dtype() == torch::kFloat32);
-  assert(input_tensor.device().type() == torch::kXPU);
-  std::cout << "Created the input tesor on GPU\n";
-
-  // Perform inference
-  torch::Tensor output = model.forward({input_tensor}).toTensor();
-  std::cout << "Performed inference\n\n";
-
-  std::cout << "Predicted tensor is : \n";
-  std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/10) << '\n';
-
-  return 0;
+    torch::Tensor input_tensor = torch::from_blob(
+        d_inputs,
+        {N_BATCH,N_CHANNELS,N_PIXELS,N_PIXELS},
+        options);
+    assert(input_tensor.dtype() == torch::kFloat32);
+    assert(input_tensor.device().type() == torch::kXPU);
+    std::cout << "Created the input Torch tensor on GPU\n\n";
+
+    // Perform inference
+    torch::NoGradGuard no_grad; // equivalent to "with torch.no_grad():" in PyTorch
+    torch::Tensor output = model.forward({input_tensor}).toTensor();
+    std::cout << "Performed inference\n\n";
+
+    // Copy the output Torch tensor to the SYCL pointer
+    auto output_tensor_ptr = output.contiguous().data_ptr<float>();
+    Q.memcpy((void *) d_outputs, (void *) output_tensor_ptr, OUTPUTS_SIZE*sizeof(float));
+    Q.wait();
+    std::cout << "Copied output Torch tensor to SYCL pointer\n";
+
+    return 0;
 }
 ```
 
-and the CMake commands also change to include
-```
-#!/bin/bash
-
+Note that an additional C++ flag is needed in this case, as shown below in the `cmake` command
+```bash
 cmake \
     -DCMAKE_CXX_FLAGS="-std=c++17 -fsycl" \
     -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \
     -DINTEL_EXTENSION_FOR_PYTORCH_PATH=`python -c 'import torch; print(torch.__path__[0].replace("torch","intel_extension_for_pytorch"))'` \
     ./
-
-make
-./inference-example ./resnet50_jit.pt
 ```
-
-## Known Issues
-
-* The LibTorch introspection API that are available for CUDA devices, such as `torch::cuda::is_available()`, are still under development for Intel Max 1550 GPU.

From af0abdd90be303fe1e094aaddb0137cdec2ef723 Mon Sep 17 00:00:00 2001
From: balin
Date: Thu, 31 Oct 2024 21:16:36 +0000
Subject: [PATCH 2/3] Added LibTorch docs for Polaris

---
 .../data-science/frameworks/libtorch.md       |   7 +-
 .../frameworks/libtorch.md                    | 149 ++++++++++++++++++
 mkdocs.yml                                    |   1 +
 3 files changed, 152 insertions(+), 5 deletions(-)
 create mode 100644 docs/polaris/data-science-workflows/frameworks/libtorch.md

diff --git a/docs/aurora/data-science/frameworks/libtorch.md b/docs/aurora/data-science/frameworks/libtorch.md
index 133f3219e..c8cd22e68 100644
--- a/docs/aurora/data-science/frameworks/libtorch.md
+++ b/docs/aurora/data-science/frameworks/libtorch.md
@@ -82,12 +82,9 @@ int main(int argc, const char* argv[])
         num_devices = torch::xpu::device_count();
         std::cout << "Number of XPU devices: " << num_devices << std::endl;
 
-
         for (int i = 0; i < num_devices; ++i) {
             c10::xpu::set_device(i);
             std::cout << "Device " << i << ":" << std::endl;
-            //std::string device_name = c10::xpu::get_device_name();
-            //std::cout << "Device " << i << ": " << device_name << std::endl;
 
             c10::xpu::DeviceProp device_prop{};
             c10::xpu::get_device_properties(&device_prop, i);
@@ -107,8 +104,8 @@ int main(int argc, const char* argv[])
 
 ## Model Inferencing Using the Torch API
 
 This example shows how to perform inference with the ResNet50 model using LibTorch.
-First, get a jit-traced version of the model executing `python resnet50_trace.py` (shwn below) on a compute node.
-```
+First, get a jit-traced version of the model executing `python resnet50_trace.py` (shown below) on a compute node.
+```bash
 import torch
 import torchvision
diff --git a/docs/polaris/data-science-workflows/frameworks/libtorch.md b/docs/polaris/data-science-workflows/frameworks/libtorch.md
new file mode 100644
index 000000000..9ae999256
--- /dev/null
+++ b/docs/polaris/data-science-workflows/frameworks/libtorch.md
@@ -0,0 +1,149 @@
+# LibTorch C++ Library
+
+LibTorch is a C++ library for Torch, with many of the APIs that are available in PyTorch. Users can find more information in the [PyTorch documentation](https://pytorch.org/cppdocs/installing.html).
+This is useful to integrate the Torch ML framework into traditional HPC simulation codes and therefore enable training and inferencing of ML models.
+During compilation, CUDA support will be activated automatically once the CUDA-enabled Torch libraries are linked.
+
+
+## Environment Setup
+
+To use LibTorch on Polaris, load the ML frameworks module
+```bash
+module use /soft/modulefiles
+module load conda/2024-04-29
+conda activate
+```
+which will also load, `PrgEnv-gnu/8.5.0` and `cmake`.
+
+
+## Torch Libraries
+
+With the ML frameworks module loaded as shown above, run
+```bash
+python -c 'import torch; print(torch.__path__[0])'
+python -c 'import torch;print(torch.utils.cmake_prefix_path)'
+```
+to find the path to the Torch libraries, include files, and CMake files.
+
+
+## Linking the Torch Libraries
+
+When using the CMake build system, the LibTorch libraries can be linked to an example C++ application using the following `CMakeLists.txt` file
+```bash
+cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+cmake_policy(SET CMP0074 NEW)
+project(project-name)
+
+find_package(Torch REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -Wl,--no-as-needed")
+set(TORCH_LIBS ${TORCH_LIBRARIES})
+
+add_executable(exe main.cpp)
+target_link_libraries(exe ${TORCH_LIBS})
+
+set_property(TARGET exe PROPERTY CXX_STANDARD 17)
+```
+
+and configuring the build with
+```
+cmake \
+    -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \
+    ./
+make
+```
+
+
+## Device Introspection
+
+Similarly to PyTorch, LibTorch provides an API to perform introspection on the devices available on the system.
+The simple code below shows how to check if CUDA devices are available, how many are present, and how to loop through them to discover some properties.
+
+```bash
+#include <torch/torch.h>
+
+int main(int argc, const char* argv[])
+{
+    torch::DeviceType device;
+    int num_devices = 0;
+    if (torch::cuda::is_available()) {
+        std::cout << "CUDA devices detected" << std::endl;
+        device = torch::kCUDA;
+
+        num_devices = torch::cuda::device_count();
+        std::cout << "Number of CUDA devices: " << num_devices << std::endl;
+    } else {
+        device = torch::kCPU;
+        std::cout << "No CUDA devices detected, setting device to CPU" << std::endl;
+    }
+
+    return 0;
+}
+```
+
+
+## Model Inferencing Using the Torch API
+
+This example shows how to perform inference with the ResNet50 model using LibTorch.
+First, get a jit-traced version of the model executing `python resnet50_trace.py` (shown below) on a compute node.
+```bash
+import torch
+import torchvision
+from time import perf_counter
+
+device = 'cuda'
+
+model = torchvision.models.resnet50()
+model.to(device)
+model.eval()
+
+dummy_input = torch.rand(1, 3, 224, 224).to(device)
+
+model_jit = torch.jit.trace(model, dummy_input)
+tic = perf_counter()
+predictions = model_jit(dummy_input)
+toc = perf_counter()
+print(f"Inference time: {toc-tic}")
+
+torch.jit.save(model_jit, f"resnet50_jit.pt")
+```
+
+Then, build `inference-example.cpp` (shown below)
+```bash
+#include <torch/script.h>
+#include <iostream>
+
+int main(int argc, const char* argv[]) {
+    torch::jit::script::Module model;
+    try {
+        model = torch::jit::load(argv[1]);
+        std::cout << "Loaded the model\n";
+    }
+    catch (const c10::Error& e) {
+        std::cerr << "error loading the model\n";
+        return -1;
+    }
+
+    model.to(torch::Device(torch::kCUDA));
+    std::cout << "Model offloaded to GPU\n\n";
+
+    auto options = torch::TensorOptions()
+                       .dtype(torch::kFloat32)
+                       .device(torch::kCUDA);
+    torch::Tensor input_tensor = torch::rand({1,3,224,224}, options);
+    assert(input_tensor.dtype() == torch::kFloat32);
+    assert(input_tensor.device().type() == torch::kCUDA);
+    std::cout << "Created the input tensor on GPU\n";
+
+    torch::Tensor output = model.forward({input_tensor}).toTensor();
+    std::cout << "Performed inference\n\n";
+
+    std::cout << "Slice of predicted tensor is : \n";
+    std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/10) << '\n';
+
+    return 0;
+}
+```
+
+and execute it with `./inference-example ./resnet50_jit.pt`.
+
+
diff --git a/mkdocs.yml b/mkdocs.yml
index dd4d1d98a..f19acb197 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -78,6 +78,7 @@ nav:
         - PyTorch: polaris/data-science-workflows/frameworks/pytorch.md
         - Jax: polaris/data-science-workflows/frameworks/jax.md
         - DeepSpeed: polaris/data-science-workflows/frameworks/deepspeed.md
+        - LibTorch: polaris/data-science-workflows/frameworks/libtorch.md
       - Applications:
         - Megatron-DeepSpeed: polaris/data-science-workflows/applications/megatron-deepspeed.md
        - gpt-neox: polaris/data-science-workflows/applications/gpt-neox.md

From a174961975157b9c4a12bc84f6a2d6c6c761b03d Mon Sep 17 00:00:00 2001
From: rickybalin
Date: Thu, 31 Oct 2024 15:40:32 -0600
Subject: [PATCH 3/3] fixed some typos

---
 docs/aurora/data-science/frameworks/libtorch.md            | 8 ++++----
 .../polaris/data-science-workflows/frameworks/libtorch.md  | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/aurora/data-science/frameworks/libtorch.md b/docs/aurora/data-science/frameworks/libtorch.md
index c8cd22e68..3ccf158f3 100644
--- a/docs/aurora/data-science/frameworks/libtorch.md
+++ b/docs/aurora/data-science/frameworks/libtorch.md
@@ -67,7 +67,7 @@ make
 Similarly to PyTorch, LibTorch provides an API to perform introspection on the devices available on the system.
 The simple code below shows how to check if XPU devices are available, how many are present, and how to loop through them to discover some properties.
 
-```bash
+```c++
 #include <torch/torch.h>
 #include <c10/xpu/XPUFunctions.h>
 
@@ -105,7 +105,7 @@ int main(int argc, const char* argv[])
 
 ## Model Inferencing Using the Torch API
 
 This example shows how to perform inference with the ResNet50 model using LibTorch.
 First, get a jit-traced version of the model executing `python resnet50_trace.py` (shown below) on a compute node.
-```bash
+```python
 import torch
 import torchvision
 import intel_extension_for_pytorch as ipex
@@ -129,7 +129,7 @@ torch.jit.save(model_jit, f"resnet50_jit.pt")
 ```
 
 Then, build `inference-example.cpp` (shown below)
-```bash
+```c++
 #include <torch/script.h>
 #include <iostream>
@@ -174,7 +174,7 @@ The LibTorch API can be integrated with data pipelines using SYCL to operate on input and output data already offloaded to the Intel Max 1550 GPU.
 The code below extends the above example of performing inference with the ResNet50 model by first generating the input data on the CPU, then offloading it to the GPU with SYCL, and finally passing the device pointer to LibTorch for inference using `torch::from_blob()`, which creates a Torch tensor from a data pointer with zero-copy.
 The source code for `inference-example.cpp` is modified as follows
-```bash
+```c++
 #include <torch/script.h>
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/docs/polaris/data-science-workflows/frameworks/libtorch.md b/docs/polaris/data-science-workflows/frameworks/libtorch.md
index 9ae999256..f8959b8bd 100644
--- a/docs/polaris/data-science-workflows/frameworks/libtorch.md
+++ b/docs/polaris/data-science-workflows/frameworks/libtorch.md
@@ -13,7 +13,7 @@ module use /soft/modulefiles
 module load conda/2024-04-29
 conda activate
 ```
-which will also load, `PrgEnv-gnu/8.5.0` and `cmake`.
+which will also load `PrgEnv-gnu/8.5.0` and `cmake`.
 
 
 ## Torch Libraries
@@ -58,7 +58,7 @@ make
 Similarly to PyTorch, LibTorch provides an API to perform introspection on the devices available on the system.
 The simple code below shows how to check if CUDA devices are available, how many are present, and how to loop through them to discover some properties.
 
-```bash
+```c++
 #include <torch/torch.h>
 
 int main(int argc, const char* argv[])
@@ -85,7 +85,7 @@ int main(int argc, const char* argv[])
 
 ## Model Inferencing Using the Torch API
 
 This example shows how to perform inference with the ResNet50 model using LibTorch.
 First, get a jit-traced version of the model executing `python resnet50_trace.py` (shown below) on a compute node.
-```bash
+```python
 import torch
 import torchvision
 from time import perf_counter
@@ -108,7 +108,7 @@ torch.jit.save(model_jit, f"resnet50_jit.pt")
 ```
 
 Then, build `inference-example.cpp` (shown below)
-```bash
+```c++
 #include <torch/script.h>
 #include <iostream>