Skip to content

Commit

Permalink
Add draft for async execution lesson
Browse files Browse the repository at this point in the history
The text is a copy of an ENCCS lesson, in English. It can be used
as a sample in Markdown for function API signatures and type-alongs.
  • Loading branch information
zhmurov committed Aug 13, 2024
1 parent c7234fc commit 723b1f3
Show file tree
Hide file tree
Showing 23 changed files with 1,493 additions and 0 deletions.
414 changes: 414 additions & 0 deletions content/10_AsynchronousExecution/AsynchronousExecution.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <math.h>
#include <time.h>

// Number of times the sine transformation is applied to each value in func1/func2.
static constexpr int numIterations = 100;
// Number of values printed from the beginning and from the end of each result array.
static constexpr int numValuesToPrint = 10;

// Applies std::sin() numIterations times to every element of the input array.
//
// in          - array of numElements input values
// out         - array of numElements output values; each element is read from
//               `in` before the matching element of `out` is written, so the
//               two pointers may refer to the same buffer
// numElements - number of values to process
void func1(const float* in, float* out, int numElements)
{
    for (int idx = 0; idx < numElements; ++idx)
    {
        float acc = in[idx];
        int remaining = numIterations;
        while (remaining-- > 0)
        {
            acc = std::sin(acc);
        }
        out[idx] = acc;
    }
}

// Applies the map x -> -std::sin(x) numIterations times to every element of
// the input array.
//
// in          - array of numElements input values
// out         - array of numElements output values; each element is read from
//               `in` before the matching element of `out` is written, so the
//               two pointers may refer to the same buffer
// numElements - number of values to process
void func2(const float* in, float* out, int numElements)
{
    for (int idx = 0; idx < numElements; ++idx)
    {
        float acc = in[idx];
        int remaining = numIterations;
        while (remaining-- > 0)
        {
            acc = -std::sin(acc);
        }
        out[idx] = acc;
    }
}

// CPU reference program: fills two arrays with pseudo-random values in [0, 1),
// transforms them in place with func1() and func2(), prints a sample of the
// results together with the sums of both arrays, and reports the processor
// time spent in the two transformations.
//
// argv[1] (optional) - number of values per array; defaults to 1000000 and
//                      must be a positive integer.
//
// Returns 0 on success, EXIT_FAILURE on invalid input or allocation failure.
int main(int argc, char* argv[])
{
    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    // atoi() yields 0 for non-numeric input and may yield a negative value;
    // neither is a usable array size.
    if (numElements <= 0)
    {
        fprintf(stderr, "The number of elements must be a positive integer.\n");
        return EXIT_FAILURE;
    }

    printf("Transforming %d values.\n", numElements);

    float* data1 = (float*)calloc(numElements, sizeof(float));
    float* data2 = (float*)calloc(numElements, sizeof(float));
    if (data1 == nullptr || data2 == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory for %d values.\n", numElements);
        free(data1);
        free(data2);
        return EXIT_FAILURE;
    }

    // Fixed seed keeps the input (and therefore the output) reproducible.
    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        data1[i] = float(rand())/float(RAND_MAX + 1.0);
        data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    // Timing
    clock_t start = clock();

    func1(data1, data1, numElements);
    func2(data2, data2, numElements);

    // Timing
    clock_t finish = clock();

    // Clamp the printed ranges so that arrays shorter than numValuesToPrint
    // are never indexed out of bounds.
    const int headEnd   = std::min(numValuesToPrint, numElements);
    const int tailStart = std::max(numElements - numValuesToPrint, 0);

    printf("The results are:\n");
    for (int i = 0; i < headEnd; i++)
    {
        printf("%f, %f\n", data1[i], data2[i]);
    }
    printf("...\n");
    for (int i = tailStart; i < numElements; i++)
    {
        printf("%f, %f\n", data1[i], data2[i]);
    }

    // Accumulate in double to limit round-off when summing many floats.
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += data1[i];
        sum2 += data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(data1);
    free(data2);

    return 0;
}
108 changes: 108 additions & 0 deletions content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_1.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

// Number of threads per block used when launching the kernels.
#define BLOCK_SIZE 256

// Number of times the sine transformation is applied to each value in the kernels.
static constexpr int numIterations = 100;
// Number of values printed from the beginning and from the end of each result array.
static constexpr int numValuesToPrint = 10;

// Applies sinf() numIterations times to each element of the input array.
//
// Each thread handles the single element whose index equals its global thread
// index in a one-dimensional launch; threads with an index at or beyond
// numElements do nothing.
//
// in          - device array of numElements input values
// out         - device array of numElements output values; a thread reads its
//               element from `in` before writing it to `out`, so both pointers
//               may refer to the same buffer
// numElements - number of values to process
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    const int globalIdx = blockIdx.x*blockDim.x + threadIdx.x;
    if (globalIdx >= numElements)
    {
        return;
    }

    float acc = in[globalIdx];
    int remaining = numIterations;
    while (remaining-- > 0)
    {
        acc = sinf(acc);
    }
    out[globalIdx] = acc;
}

// Applies the map x -> -sinf(x) numIterations times to each element of the
// input array.
//
// Each thread handles the single element whose index equals its global thread
// index in a one-dimensional launch; threads with an index at or beyond
// numElements do nothing.
//
// in          - device array of numElements input values
// out         - device array of numElements output values; a thread reads its
//               element from `in` before writing it to `out`, so both pointers
//               may refer to the same buffer
// numElements - number of values to process
__global__ void func2_kernel(const float* in, float* out, int numElements)
{
    const int globalIdx = blockIdx.x*blockDim.x + threadIdx.x;
    if (globalIdx >= numElements)
    {
        return;
    }

    float acc = in[globalIdx];
    int remaining = numIterations;
    while (remaining-- > 0)
    {
        acc = -sinf(acc);
    }
    out[globalIdx] = acc;
}

// Synchronous GPU version: fills two host arrays with pseudo-random values in
// [0, 1), and for each array copies it to the device, transforms it in place
// with func1_kernel / func2_kernel, and copies it back using blocking
// cudaMemcpy calls. Prints a sample of the results, the sums of both arrays,
// and the processor time spent in the copy/compute section.
//
// argv[1] (optional) - number of values per array; defaults to 1000000 and
//                      must be a positive integer.
//
// Returns 0 on success; exits with EXIT_FAILURE on invalid input, host
// allocation failure, or any CUDA runtime error.
int main(int argc, char* argv[])
{
    // Stop with a readable message as soon as a CUDA runtime call fails, so a
    // failure is not silently carried into later calls and printed results.
    auto checkCuda = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    // atoi() yields 0 for non-numeric input and may yield a negative value;
    // neither is a usable array size or launch configuration.
    if (numElements <= 0)
    {
        fprintf(stderr, "The number of elements must be a positive integer.\n");
        return EXIT_FAILURE;
    }

    printf("Transforming %d values.\n", numElements);

    const size_t numBytes = numElements*sizeof(float);

    float* h_data1 = (float*)calloc(numElements, sizeof(float));
    float* h_data2 = (float*)calloc(numElements, sizeof(float));
    if (h_data1 == nullptr || h_data2 == nullptr)
    {
        fprintf(stderr, "Failed to allocate host memory for %d values.\n", numElements);
        free(h_data1);
        free(h_data2);
        return EXIT_FAILURE;
    }

    // Fixed seed keeps the input (and therefore the output) reproducible.
    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    // Ceiling division without overflow (numElements >= 1 here): no spare
    // block is launched when numElements is a multiple of BLOCK_SIZE.
    int numBlocks = (numElements - 1)/BLOCK_SIZE + 1;

    float* d_data1 = nullptr;
    float* d_data2 = nullptr;

    checkCuda(cudaMalloc((void**)&d_data1, numBytes), "cudaMalloc(d_data1)");
    checkCuda(cudaMalloc((void**)&d_data2, numBytes), "cudaMalloc(d_data2)");

    // Timing
    clock_t start = clock();

    checkCuda(cudaMemcpy(d_data1, h_data1, numBytes, cudaMemcpyHostToDevice), "copy of data1 to device");
    func1_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data1, numElements);
    // A kernel launch returns no status itself; launch errors are read here.
    checkCuda(cudaGetLastError(), "func1_kernel launch");
    checkCuda(cudaMemcpy(h_data1, d_data1, numBytes, cudaMemcpyDeviceToHost), "copy of data1 to host");

    checkCuda(cudaMemcpy(d_data2, h_data2, numBytes, cudaMemcpyHostToDevice), "copy of data2 to device");
    func2_kernel<<<numBlocks, threadsPerBlock>>>(d_data2, d_data2, numElements);
    checkCuda(cudaGetLastError(), "func2_kernel launch");
    checkCuda(cudaMemcpy(h_data2, d_data2, numBytes, cudaMemcpyDeviceToHost), "copy of data2 to host");

    // Timing
    clock_t finish = clock();

    // Clamp the printed ranges so that arrays shorter than numValuesToPrint
    // are never indexed out of bounds.
    const int headEnd   = std::min(numValuesToPrint, numElements);
    const int tailStart = std::max(numElements - numValuesToPrint, 0);

    printf("The results are:\n");
    for (int i = 0; i < headEnd; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = tailStart; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }

    // Accumulate in double to limit round-off when summing many floats.
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the memory
    free(h_data1);
    free(h_data2);

    checkCuda(cudaFree(d_data1), "cudaFree(d_data1)");
    checkCuda(cudaFree(d_data2), "cudaFree(d_data2)");

    return 0;
}
119 changes: 119 additions & 0 deletions content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_2.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <time.h>

// Number of threads per block used when launching the kernels.
#define BLOCK_SIZE 256

// Number of times the sine transformation is applied to each value in the kernels.
static constexpr int numIterations = 100;
// Number of values printed from the beginning and from the end of each result array.
static constexpr int numValuesToPrint = 10;

// Repeatedly applies sinf() (numIterations times) to one array element per thread.
//
// The element index is the thread's global index in a one-dimensional launch;
// threads whose index is not below numElements return without touching memory.
//
// in          - device array of numElements input values
// out         - device array of numElements output values; the element is
//               read from `in` before it is written to `out`, so the two
//               pointers may refer to the same buffer
// numElements - number of values to process
__global__ void func1_kernel(const float* in, float* out, int numElements)
{
    const int element = blockIdx.x*blockDim.x + threadIdx.x;
    if (element >= numElements)
    {
        return;
    }

    float result = in[element];
    for (int step = numIterations; step > 0; --step)
    {
        result = sinf(result);
    }
    out[element] = result;
}

// Repeatedly applies the map x -> -sinf(x) (numIterations times) to one array
// element per thread.
//
// The element index is the thread's global index in a one-dimensional launch;
// threads whose index is not below numElements return without touching memory.
//
// in          - device array of numElements input values
// out         - device array of numElements output values; the element is
//               read from `in` before it is written to `out`, so the two
//               pointers may refer to the same buffer
// numElements - number of values to process
__global__ void func2_kernel(const float* in, float* out, int numElements)
{
    const int element = blockIdx.x*blockDim.x + threadIdx.x;
    if (element >= numElements)
    {
        return;
    }

    float result = in[element];
    for (int step = numIterations; step > 0; --step)
    {
        result = -sinf(result);
    }
    out[element] = result;
}

// Asynchronous GPU version: fills two pinned host arrays with pseudo-random
// values in [0, 1), then enqueues, for each array, a host-to-device copy, an
// in-place kernel (func1_kernel / func2_kernel) and a device-to-host copy in
// its own CUDA stream, so the work on the two arrays is free to overlap.
// After a device-wide synchronization it prints a sample of the results, the
// sums of both arrays, and the processor time spent in the timed section.
//
// argv[1] (optional) - number of values per array; defaults to 1000000 and
//                      must be a positive integer.
//
// Returns 0 on success; exits with EXIT_FAILURE on invalid input or any CUDA
// runtime error.
int main(int argc, char* argv[])
{
    // Stop with a readable message as soon as a CUDA runtime call fails, so a
    // failure is not silently carried into later calls and printed results.
    auto checkCuda = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;

    // atoi() yields 0 for non-numeric input and may yield a negative value;
    // neither is a usable array size or launch configuration.
    if (numElements <= 0)
    {
        fprintf(stderr, "The number of elements must be a positive integer.\n");
        return EXIT_FAILURE;
    }

    printf("Transforming %d values.\n", numElements);

    const size_t numBytes = numElements*sizeof(float);

    float* h_data1 = nullptr;
    float* h_data2 = nullptr;

    // Host buffers are allocated with cudaMallocHost (page-locked memory),
    // which is what the cudaMemcpyAsync calls below are meant to work with.
    checkCuda(cudaMallocHost((void**)&h_data1, numBytes), "cudaMallocHost(h_data1)");
    checkCuda(cudaMallocHost((void**)&h_data2, numBytes), "cudaMallocHost(h_data2)");

    // Fixed seed keeps the input (and therefore the output) reproducible.
    srand(1214134);
    for (int i = 0; i < numElements; i++)
    {
        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
    }

    int threadsPerBlock = BLOCK_SIZE;
    // Ceiling division without overflow (numElements >= 1 here): no spare
    // block is launched when numElements is a multiple of BLOCK_SIZE.
    int numBlocks = (numElements - 1)/BLOCK_SIZE + 1;

    float* d_data1 = nullptr;
    float* d_data2 = nullptr;

    checkCuda(cudaMalloc((void**)&d_data1, numBytes), "cudaMalloc(d_data1)");
    checkCuda(cudaMalloc((void**)&d_data2, numBytes), "cudaMalloc(d_data2)");

    cudaStream_t stream1;
    cudaStream_t stream2;

    checkCuda(cudaStreamCreate(&stream1), "cudaStreamCreate(stream1)");
    checkCuda(cudaStreamCreate(&stream2), "cudaStreamCreate(stream2)");

    // Timing
    clock_t start = clock();

    checkCuda(cudaMemcpyAsync(d_data1, h_data1, numBytes, cudaMemcpyHostToDevice, stream1), "copy of data1 to device");
    func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
    // A kernel launch returns no status itself; launch errors are read here.
    checkCuda(cudaGetLastError(), "func1_kernel launch");
    checkCuda(cudaMemcpyAsync(h_data1, d_data1, numBytes, cudaMemcpyDeviceToHost, stream1), "copy of data1 to host");

    checkCuda(cudaMemcpyAsync(d_data2, h_data2, numBytes, cudaMemcpyHostToDevice, stream2), "copy of data2 to device");
    func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data2, d_data2, numElements);
    checkCuda(cudaGetLastError(), "func2_kernel launch");
    checkCuda(cudaMemcpyAsync(h_data2, d_data2, numBytes, cudaMemcpyDeviceToHost, stream2), "copy of data2 to host");

    // Wait for both streams to finish before the host reads the results;
    // errors raised while the enqueued work executes are reported here.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Timing
    clock_t finish = clock();

    // Clamp the printed ranges so that arrays shorter than numValuesToPrint
    // are never indexed out of bounds.
    const int headEnd   = std::min(numValuesToPrint, numElements);
    const int tailStart = std::max(numElements - numValuesToPrint, 0);

    printf("The results are:\n");
    for (int i = 0; i < headEnd; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }
    printf("...\n");
    for (int i = tailStart; i < numElements; i++)
    {
        printf("%f, %f\n", h_data1[i], h_data2[i]);
    }

    // Accumulate in double to limit round-off when summing many floats.
    double sum1 = 0.0;
    double sum2 = 0.0;
    for (int i = 0; i < numElements; i++)
    {
        sum1 += h_data1[i];
        sum2 += h_data2[i];
    }
    printf("The summs are: %f and %f\n", sum1, sum2);

    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);

    // Release the streams (previously created but never destroyed)
    checkCuda(cudaStreamDestroy(stream1), "cudaStreamDestroy(stream1)");
    checkCuda(cudaStreamDestroy(stream2), "cudaStreamDestroy(stream2)");

    // Release the memory
    checkCuda(cudaFreeHost(h_data1), "cudaFreeHost(h_data1)");
    checkCuda(cudaFreeHost(h_data2), "cudaFreeHost(h_data2)");

    checkCuda(cudaFree(d_data1), "cudaFree(d_data1)");
    checkCuda(cudaFree(d_data2), "cudaFree(d_data2)");

    return 0;
}
Loading

0 comments on commit 723b1f3

Please sign in to comment.