generated from yandexdataschool/sphinx-lesson-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add draft for async execution lesson
The text is a copy of an ENCCS lesson, in English. It can be used as a sample of Markdown for function API signatures and type-alongs.
- Loading branch information
Showing
23 changed files
with
1,493 additions
and
0 deletions.
There are no files selected for viewing
414 changes: 414 additions & 0 deletions
414
content/10_AsynchronousExecution/AsynchronousExecution.md
Large diffs are not rendered by default.
Oops, something went wrong.
88 changes: 88 additions & 0 deletions
88
content/10_AsynchronousExecution/Code/Async1/Solution/async_cpu.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <algorithm> | ||
#include <math.h> | ||
#include <time.h> | ||
|
||
static constexpr int numIterations = 100; | ||
static constexpr int numValuesToPrint = 10; | ||
|
||
void func1(const float* in, float* out, int numElements) | ||
{ | ||
for (int i = 0; i < numElements; i++) | ||
{ | ||
float value = in[i]; | ||
for (int iter = 0; iter < numIterations; iter++) | ||
{ | ||
value = std::sin(value); | ||
} | ||
out[i] = value; | ||
} | ||
} | ||
|
||
void func2(const float* in, float* out, int numElements) | ||
{ | ||
for (int i = 0; i < numElements; i++) | ||
{ | ||
float value = in[i]; | ||
for (int iter = 0; iter < numIterations; iter++) | ||
{ | ||
value = -std::sin(value); | ||
} | ||
out[i] = value; | ||
} | ||
} | ||
|
||
int main(int argc, char* argv[]) | ||
{ | ||
|
||
int numElements = (argc > 1) ? atoi(argv[1]) : 1000000; | ||
|
||
printf("Transforming %d values.\n", numElements); | ||
|
||
float* data1 = (float*)calloc(numElements, sizeof(float)); | ||
float* data2 = (float*)calloc(numElements, sizeof(float)); | ||
|
||
srand(1214134); | ||
for (int i = 0; i < numElements; i++) | ||
{ | ||
data1[i] = float(rand())/float(RAND_MAX + 1.0); | ||
data2[i] = float(rand())/float(RAND_MAX + 1.0); | ||
} | ||
|
||
// Timing | ||
clock_t start = clock(); | ||
|
||
func1(data1, data1, numElements); | ||
func2(data2, data2, numElements); | ||
|
||
// Timing | ||
clock_t finish = clock(); | ||
|
||
printf("The results are:\n"); | ||
for (int i = 0; i < numValuesToPrint; i++) | ||
{ | ||
printf("%f, %f\n", data1[i], data2[i]); | ||
} | ||
printf("...\n"); | ||
for (int i = numElements - numValuesToPrint; i < numElements; i++) | ||
{ | ||
printf("%f, %f\n", data1[i], data2[i]); | ||
} | ||
double sum1 = 0.0; | ||
double sum2 = 0.0; | ||
for (int i = 0; i < numElements; i++) | ||
{ | ||
sum1 += data1[i]; | ||
sum2 += data2[i]; | ||
} | ||
printf("The summs are: %f and %f\n", sum1, sum2); | ||
|
||
printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC); | ||
|
||
// Release the memory | ||
free(data1); | ||
free(data2); | ||
|
||
return 0; | ||
} |
108 changes: 108 additions & 0 deletions
108
content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_1.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <algorithm> | ||
#include <time.h> | ||
|
||
#define BLOCK_SIZE 256 | ||
|
||
static constexpr int numIterations = 100; | ||
static constexpr int numValuesToPrint = 10; | ||
|
||
// Kernel: applies sinf() numIterations times to one element per thread.
// Uses only the x components of the thread/block indices, so it expects a
// one-dimensional launch with at least numElements threads in total; threads
// whose index falls past the end of the array do nothing.
//   in          - device array of input values
//   out         - device array of results; may be the same array as `in`,
//                 since each thread reads and writes only its own index
//   numElements - number of elements to process
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= numElements)
    {
        return;
    }

    float acc = in[idx];
    for (int remaining = numIterations; remaining > 0; --remaining)
    {
        acc = sinf(acc);
    }
    out[idx] = acc;
}
|
||
// Kernel: applies the map x -> -sinf(x) numIterations times to one element
// per thread.
// Uses only the x components of the thread/block indices, so it expects a
// one-dimensional launch with at least numElements threads in total; threads
// whose index falls past the end of the array do nothing.
//   in          - device array of input values
//   out         - device array of results; may be the same array as `in`,
//                 since each thread reads and writes only its own index
//   numElements - number of elements to process
__global__ void func2_kernel(const float* in, float* out, int numElements)
{
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= numElements)
    {
        return;
    }

    float acc = in[idx];
    for (int remaining = numIterations; remaining > 0; --remaining)
    {
        acc = -sinf(acc);
    }
    out[idx] = acc;
}
|
||
// Entry point of the synchronous GPU version.
// Fills two host arrays with pseudo-random values between 0 and 1, then for
// each array copies it to the device, transforms it in place with
// func1_kernel / func2_kernel, and copies it back. Everything is issued with
// blocking copies on the default stream, so the two pipelines run one after
// the other. Finally prints the first and last few results, the sum of each
// array and the CPU time measured around the copies and kernels.
// Usage: <program> [numElements]   (defaults to 1000000)
// Returns 0 on success; exits with a failure status on invalid input, failed
// allocation or any CUDA error.
int main(int argc, char* argv[])
{
    // Stops the program with a readable message when a CUDA runtime call
    // fails, instead of silently continuing with invalid data.
    auto checkCuda = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    // atoi yields 0 for non-numeric input and can yield a negative value;
    // neither is usable as an array size or launch size.
    if (numElements <= 0)
    {
        fprintf(stderr, "The number of elements must be a positive integer.\n");
        return EXIT_FAILURE;
    }

    printf("Transforming %d values.\n", numElements);

    const size_t numBytes = (size_t)numElements*sizeof(float);

    float* h_data1 = (float*)calloc(numElements, sizeof(float));
    float* h_data2 = (float*)calloc(numElements, sizeof(float));
    if (h_data1 == NULL || h_data2 == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for %d values.\n", numElements);
        free(h_data1);
        free(h_data2);
        return EXIT_FAILURE;
    }

    // Fixed seed so that every run works on the same input.
    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    // Ceiling division written so that it cannot overflow for large inputs
    // and does not launch an extra, idle block when numElements is a
    // multiple of BLOCK_SIZE.
    int numBlocks = (numElements - 1)/BLOCK_SIZE + 1;

    float* d_data1 = NULL;
    float* d_data2 = NULL;

    checkCuda(cudaMalloc((void**)&d_data1, numBytes), "cudaMalloc d_data1");
    checkCuda(cudaMalloc((void**)&d_data2, numBytes), "cudaMalloc d_data2");

    // Timing
    clock_t start = clock();

    checkCuda(cudaMemcpy(d_data1, h_data1, numBytes, cudaMemcpyHostToDevice), "copy data1 to device");
    func1_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data1, numElements);
    // A kernel launch returns no status itself; query it explicitly.
    checkCuda(cudaGetLastError(), "func1_kernel launch");
    checkCuda(cudaMemcpy(h_data1, d_data1, numBytes, cudaMemcpyDeviceToHost), "copy data1 to host");

    checkCuda(cudaMemcpy(d_data2, h_data2, numBytes, cudaMemcpyHostToDevice), "copy data2 to device");
    func2_kernel<<<numBlocks, threadsPerBlock>>>(d_data2, d_data2, numElements);
    checkCuda(cudaGetLastError(), "func2_kernel launch");
    checkCuda(cudaMemcpy(h_data2, d_data2, numBytes, cudaMemcpyDeviceToHost), "copy data2 to host");

    // Timing
    clock_t finish = clock();

    // Clamp both print ranges so that arrays shorter than numValuesToPrint
    // are never read out of bounds.
    const int numHead   = std::min(numValuesToPrint, numElements);
    const int tailStart = std::max(0, numElements - numValuesToPrint);

    printf("The results are:\n");
    for (int i = 0; i < numHead; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = tailStart; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }

    // Accumulate in double to limit rounding error over many elements.
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data1);
    free(h_data2);

    checkCuda(cudaFree(d_data1), "cudaFree d_data1");
    checkCuda(cudaFree(d_data2), "cudaFree d_data2");

    return 0;
}
119 changes: 119 additions & 0 deletions
119
content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_2.cu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <algorithm> | ||
#include <time.h> | ||
|
||
#define BLOCK_SIZE 256 | ||
|
||
static constexpr int numIterations = 100; | ||
static constexpr int numValuesToPrint = 10; | ||
|
||
// Kernel: each thread takes one element, applies sinf() to it numIterations
// times and writes the result to the same index of the output array.
// Only the x components of the thread/block indices are used, so a
// one-dimensional launch covering at least numElements threads is expected;
// surplus threads exit without touching memory.
//   in          - device array of input values
//   out         - device array of results; may be the same array as `in`,
//                 since each thread reads and writes only its own index
//   numElements - number of elements to process
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    const int element = blockDim.x*blockIdx.x + threadIdx.x;
    if (element >= numElements)
    {
        return;
    }

    float result = in[element];
    int applied = 0;
    while (applied < numIterations)
    {
        result = sinf(result);
        ++applied;
    }
    out[element] = result;
}
|
||
// Kernel: each thread takes one element, applies the map x -> -sinf(x) to it
// numIterations times and writes the result to the same index of the output
// array.
// Only the x components of the thread/block indices are used, so a
// one-dimensional launch covering at least numElements threads is expected;
// surplus threads exit without touching memory.
//   in          - device array of input values
//   out         - device array of results; may be the same array as `in`,
//                 since each thread reads and writes only its own index
//   numElements - number of elements to process
__global__ void func2_kernel(const float* in, float* out, int numElements)
{
    const int element = blockDim.x*blockIdx.x + threadIdx.x;
    if (element >= numElements)
    {
        return;
    }

    float result = in[element];
    int applied = 0;
    while (applied < numIterations)
    {
        result = -sinf(result);
        ++applied;
    }
    out[element] = result;
}
|
||
// Entry point of the asynchronous GPU version.
// Fills two host arrays (allocated with cudaMallocHost) with pseudo-random
// values between 0 and 1. Each array then gets its own stream, into which a
// host-to-device copy, the in-place transform (func1_kernel / func2_kernel)
// and the device-to-host copy are queued, so the work on the two arrays is
// free to overlap. After a device-wide synchronization the program prints
// the first and last few results, the sum of each array and the CPU time
// measured around the queued work.
// Usage: <program> [numElements]   (defaults to 1000000)
// Returns 0 on success; exits with a failure status on invalid input or any
// CUDA error.
int main(int argc, char* argv[])
{
    // Stops the program with a readable message when a CUDA runtime call
    // fails, instead of silently continuing with invalid data.
    auto checkCuda = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    // atoi yields 0 for non-numeric input and can yield a negative value;
    // neither is usable as an array size or launch size.
    if (numElements <= 0)
    {
        fprintf(stderr, "The number of elements must be a positive integer.\n");
        return EXIT_FAILURE;
    }

    printf("Transforming %d values.\n", numElements);

    const size_t numBytes = (size_t)numElements*sizeof(float);

    float* h_data1 = NULL;
    float* h_data2 = NULL;

    checkCuda(cudaMallocHost((void**)&h_data1, numBytes), "cudaMallocHost h_data1");
    checkCuda(cudaMallocHost((void**)&h_data2, numBytes), "cudaMallocHost h_data2");

    // Fixed seed so that every run works on the same input.
    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    // Ceiling division written so that it cannot overflow for large inputs
    // and does not launch an extra, idle block when numElements is a
    // multiple of BLOCK_SIZE.
    int numBlocks = (numElements - 1)/BLOCK_SIZE + 1;

    float* d_data1 = NULL;
    float* d_data2 = NULL;

    checkCuda(cudaMalloc((void**)&d_data1, numBytes), "cudaMalloc d_data1");
    checkCuda(cudaMalloc((void**)&d_data2, numBytes), "cudaMalloc d_data2");

    cudaStream_t stream1;
    cudaStream_t stream2;

    checkCuda(cudaStreamCreate(&stream1), "cudaStreamCreate stream1");
    checkCuda(cudaStreamCreate(&stream2), "cudaStreamCreate stream2");

    // Timing
    clock_t start = clock();

    checkCuda(cudaMemcpyAsync(d_data1, h_data1, numBytes, cudaMemcpyHostToDevice, stream1), "queue copy of data1 to device");
    func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
    // A kernel launch returns no status itself; query it explicitly.
    checkCuda(cudaGetLastError(), "func1_kernel launch");
    checkCuda(cudaMemcpyAsync(h_data1, d_data1, numBytes, cudaMemcpyDeviceToHost, stream1), "queue copy of data1 to host");

    checkCuda(cudaMemcpyAsync(d_data2, h_data2, numBytes, cudaMemcpyHostToDevice, stream2), "queue copy of data2 to device");
    func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data2, d_data2, numElements);
    checkCuda(cudaGetLastError(), "func2_kernel launch");
    checkCuda(cudaMemcpyAsync(h_data2, d_data2, numBytes, cudaMemcpyDeviceToHost, stream2), "queue copy of data2 to host");

    // Wait for both streams before stopping the clock and reading results;
    // errors raised while the queued work executed are reported here.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Timing
    clock_t finish = clock();

    // Clamp both print ranges so that arrays shorter than numValuesToPrint
    // are never read out of bounds.
    const int numHead   = std::min(numValuesToPrint, numElements);
    const int tailStart = std::max(0, numElements - numValuesToPrint);

    printf("The results are:\n");
    for (int i = 0; i < numHead; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = tailStart; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }

    // Accumulate in double to limit rounding error over many elements.
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the streams (the original never destroyed them)
    checkCuda(cudaStreamDestroy(stream1), "cudaStreamDestroy stream1");
    checkCuda(cudaStreamDestroy(stream2), "cudaStreamDestroy stream2");

    // Release the memory
    checkCuda(cudaFreeHost(h_data1), "cudaFreeHost h_data1");
    checkCuda(cudaFreeHost(h_data2), "cudaFreeHost h_data2");

    checkCuda(cudaFree(d_data1), "cudaFree d_data1");
    checkCuda(cudaFree(d_data2), "cudaFree d_data2");

    return 0;
}
Oops, something went wrong.