Add draft for async execution lesson

The text is a copy of an ENCCS lesson, in English. It can be used as a sample of Markdown for function API signatures and type-alongs.
yandexdataschool · Aug 13, 2024 · 723b1f3 · 723b1f3
1 parent c7234fc
commit 723b1f3
Show file tree

Hide file tree

Showing 23 changed files with 1,493 additions and 0 deletions.
diff --git a/content/10_AsynchronousExecution/AsynchronousExecution.md b/content/10_AsynchronousExecution/AsynchronousExecution.md
diff --git a/content/10_AsynchronousExecution/Code/Async1/Solution/async_cpu.cpp b/content/10_AsynchronousExecution/Code/Async1/Solution/async_cpu.cpp
@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <math.h>
+#include <time.h>
+
+static constexpr int numIterations = 100;
+static constexpr int numValuesToPrint = 10;
+
+// Applies sin() numIterations times in a row to each of the first
+// numElements values of `in` and stores the result at the same index
+// of `out`.
+// `in` and `out` may be the same buffer: every element is read once,
+// before its slot is written. A non-positive numElements does nothing.
+void func1(const float* in, float* out, int numElements)
+{
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        float acc = in[idx];
+        int remaining = numIterations;
+        while (remaining-- > 0)
+        {
+            acc = std::sin(acc);
+        }
+        out[idx] = acc;
+    }
+}
+
+// Applies the map x -> -sin(x) numIterations times in a row to each of
+// the first numElements values of `in` and stores the result at the same
+// index of `out`.
+// `in` and `out` may be the same buffer: every element is read once,
+// before its slot is written. A non-positive numElements does nothing.
+void func2(const float* in, float* out, int numElements)
+{
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        float acc = in[idx];
+        int remaining = numIterations;
+        while (remaining-- > 0)
+        {
+            acc = -std::sin(acc);
+        }
+        out[idx] = acc;
+    }
+}
+
+// CPU reference version of the sample.
+// Usage: <program> [numElements]; the element count defaults to 1000000.
+// Fills two arrays with pseudo-random values from a fixed seed, transforms
+// the first with func1 and the second with func2 (both in place), then
+// prints a few values from each end, the sums and the measured clock() time.
+// Returns 0 on success and EXIT_FAILURE for a non-positive element count
+// or a failed allocation.
+int main(int argc, char* argv[])
+{
+
+    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
+    if (numElements <= 0)
+    {
+        fprintf(stderr, "The number of values must be positive, got %d.\n", numElements);
+        return EXIT_FAILURE;
+    }
+
+    printf("Transforming %d values.\n", numElements);
+
+    float* data1   = (float*)calloc(numElements, sizeof(float));
+    float* data2   = (float*)calloc(numElements, sizeof(float));
+    if (data1 == nullptr || data2 == nullptr)
+    {
+        fprintf(stderr, "Failed to allocate memory for %d values.\n", numElements);
+        free(data1);
+        free(data2);
+        return EXIT_FAILURE;
+    }
+
+    // Fixed seed, so every run works on the same input values
+    srand(1214134);
+    for (int i = 0; i < numElements; i++)
+    {
+        data1[i] = float(rand())/float(RAND_MAX + 1.0);
+        data2[i] = float(rand())/float(RAND_MAX + 1.0);
+    }
+
+    // Timing
+    clock_t start = clock();
+
+    func1(data1, data1, numElements);
+    func2(data2, data2, numElements);
+
+    // Timing
+    clock_t finish = clock();
+
+    // Never print more values than exist, so short inputs stay in bounds
+    const int numToPrint = std::min(numValuesToPrint, numElements);
+
+    printf("The results are:\n");
+    for (int i = 0; i < numToPrint; i++)
+    {
+        printf("%f, %f\n", data1[i], data2[i]);
+    }
+    printf("...\n");
+    for (int i = numElements - numToPrint; i < numElements; i++)
+    {
+        printf("%f, %f\n", data1[i], data2[i]);
+    }
+    // Accumulate in double to limit rounding error over many floats
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+    for (int i = 0; i < numElements; i++)
+    {
+        sum1 += data1[i];
+        sum2 += data2[i];
+    }
+    printf("The summs are: %f and %f\n", sum1, sum2);
+
+    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
+
+    // Release the memory
+    free(data1);
+    free(data2);
+
+    return 0;
+}
diff --git a/content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_1.cu b/content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_1.cu
@@ -0,0 +1,108 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <time.h>
+
+#define BLOCK_SIZE 256
+
+static constexpr int numIterations = 100;
+static constexpr int numValuesToPrint = 10;
+
+// Applies sinf() numIterations times in a row to in[i] and stores the
+// result in out[i], one element per thread.
+// The element index is built from the x components of the thread and
+// block indices only. Threads whose index is not below numElements do
+// nothing, so the grid may be larger than the data.
+// `in` and `out` may be the same buffer: a thread reads and writes only
+// its own index.
+__global__ void func1_kernel(const float* in, float* out, int numElements)
+{
+    const int gid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (gid >= numElements)
+    {
+        return;
+    }
+
+    float acc = in[gid];
+    for (int step = numIterations; step > 0; --step)
+    {
+        acc = sinf(acc);
+    }
+    out[gid] = acc;
+}
+
+// Applies the map x -> -sinf(x) numIterations times in a row to in[i] and
+// stores the result in out[i], one element per thread.
+// The element index is built from the x components of the thread and
+// block indices only. Threads whose index is not below numElements do
+// nothing, so the grid may be larger than the data.
+// `in` and `out` may be the same buffer: a thread reads and writes only
+// its own index.
+__global__ void func2_kernel(const float* in, float* out, int numElements)
+{
+    const int gid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (gid >= numElements)
+    {
+        return;
+    }
+
+    float acc = in[gid];
+    for (int step = numIterations; step > 0; --step)
+    {
+        acc = -sinf(acc);
+    }
+    out[gid] = acc;
+}
+
+// Stops the program with a readable message when a CUDA runtime call fails.
+// `what` names the failing step in the message.
+static void checkCuda(cudaError_t status, const char* what)
+{
+    if (status != cudaSuccess)
+    {
+        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
+        exit(EXIT_FAILURE);
+    }
+}
+
+// GPU version using blocking copies and kernel launches without a stream argument.
+// Usage: <program> [numElements]; the element count defaults to 1000000.
+// Fills two host arrays with pseudo-random values from a fixed seed. For
+// each array in turn it copies the data to the device, runs the matching
+// kernel in place and copies the result back. It then prints a few values
+// from each end, the sums and the measured clock() time.
+// Returns 0 on success; exits with EXIT_FAILURE on a non-positive element
+// count, a failed host allocation or any CUDA error.
+int main(int argc, char* argv[])
+{
+
+    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
+    if (numElements <= 0)
+    {
+        fprintf(stderr, "The number of values must be positive, got %d.\n", numElements);
+        return EXIT_FAILURE;
+    }
+
+    printf("Transforming %d values.\n", numElements);
+
+    float* h_data1   = (float*)calloc(numElements, sizeof(float));
+    float* h_data2   = (float*)calloc(numElements, sizeof(float));
+    if (h_data1 == nullptr || h_data2 == nullptr)
+    {
+        fprintf(stderr, "Failed to allocate host memory for %d values.\n", numElements);
+        free(h_data1);
+        free(h_data2);
+        return EXIT_FAILURE;
+    }
+
+    // Fixed seed, so every run works on the same input values
+    srand(1214134);
+    for (int i = 0; i < numElements; i++)
+    {
+        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
+        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
+    }
+
+    const size_t numBytes = numElements*sizeof(float);
+
+    int threadsPerBlock = BLOCK_SIZE;
+    // Round up so a partially filled last block covers the tail of the data;
+    // written without "numElements + BLOCK_SIZE - 1" to avoid int overflow
+    int numBlocks = numElements/BLOCK_SIZE + ((numElements % BLOCK_SIZE != 0) ? 1 : 0);
+
+    float* d_data1;
+    float* d_data2;
+
+    checkCuda(cudaMalloc((void**)&d_data1, numBytes), "cudaMalloc(d_data1)");
+    checkCuda(cudaMalloc((void**)&d_data2, numBytes), "cudaMalloc(d_data2)");
+
+    // Timing
+    clock_t start = clock();
+
+    checkCuda(cudaMemcpy(d_data1, h_data1, numBytes, cudaMemcpyHostToDevice), "copy of data1 to device");
+    func1_kernel<<<numBlocks, threadsPerBlock>>>(d_data1, d_data1, numElements);
+    // A launch does not return a status; invalid configurations show up here
+    checkCuda(cudaGetLastError(), "func1_kernel launch");
+    checkCuda(cudaMemcpy(h_data1, d_data1, numBytes, cudaMemcpyDeviceToHost), "copy of data1 to host");
+
+    checkCuda(cudaMemcpy(d_data2, h_data2, numBytes, cudaMemcpyHostToDevice), "copy of data2 to device");
+    func2_kernel<<<numBlocks, threadsPerBlock>>>(d_data2, d_data2, numElements);
+    checkCuda(cudaGetLastError(), "func2_kernel launch");
+    checkCuda(cudaMemcpy(h_data2, d_data2, numBytes, cudaMemcpyDeviceToHost), "copy of data2 to host");
+
+    // Timing
+    clock_t finish = clock();
+
+    // Never print more values than exist, so short inputs stay in bounds
+    const int numToPrint = std::min(numValuesToPrint, numElements);
+
+    printf("The results are:\n");
+    for (int i = 0; i < numToPrint; i++)
+    {
+        printf("%f, %f\n", h_data1[i], h_data2[i]);
+    }
+    printf("...\n");
+    for (int i = numElements - numToPrint; i < numElements; i++)
+    {
+        printf("%f, %f\n", h_data1[i], h_data2[i]);
+    }
+    // Accumulate in double to limit rounding error over many floats
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+    for (int i = 0; i < numElements; i++)
+    {
+        sum1 += h_data1[i];
+        sum2 += h_data2[i];
+    }
+    printf("The summs are: %f and %f\n", sum1, sum2);
+
+    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
+
+    // Release the memory
+    free(h_data1);
+    free(h_data2);
+
+    checkCuda(cudaFree(d_data1), "cudaFree(d_data1)");
+    checkCuda(cudaFree(d_data2), "cudaFree(d_data2)");
+
+    return 0;
+}
diff --git a/content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_2.cu b/content/10_AsynchronousExecution/Code/Async1/Solution/async_gpu_2.cu
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <time.h>
+
+#define BLOCK_SIZE 256
+
+static constexpr int numIterations = 100;
+static constexpr int numValuesToPrint = 10;
+
+// Applies sinf() numIterations times in a row to in[i] and stores the
+// result in out[i], one element per thread.
+// The element index is built from the x components of the thread and
+// block indices only. Threads whose index is not below numElements do
+// nothing, so the grid may be larger than the data.
+// `in` and `out` may be the same buffer: a thread reads and writes only
+// its own index.
+__global__ void func1_kernel(const float* in, float* out, int numElements)
+{
+    const int gid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (gid >= numElements)
+    {
+        return;
+    }
+
+    float acc = in[gid];
+    for (int step = numIterations; step > 0; --step)
+    {
+        acc = sinf(acc);
+    }
+    out[gid] = acc;
+}
+
+// Applies the map x -> -sinf(x) numIterations times in a row to in[i] and
+// stores the result in out[i], one element per thread.
+// The element index is built from the x components of the thread and
+// block indices only. Threads whose index is not below numElements do
+// nothing, so the grid may be larger than the data.
+// `in` and `out` may be the same buffer: a thread reads and writes only
+// its own index.
+__global__ void func2_kernel(const float* in, float* out, int numElements)
+{
+    const int gid = blockIdx.x*blockDim.x + threadIdx.x;
+    if (gid >= numElements)
+    {
+        return;
+    }
+
+    float acc = in[gid];
+    for (int step = numIterations; step > 0; --step)
+    {
+        acc = -sinf(acc);
+    }
+    out[gid] = acc;
+}
+
+// Stops the program with a readable message when a CUDA runtime call fails.
+// `what` names the failing step in the message.
+static void checkCuda(cudaError_t status, const char* what)
+{
+    if (status != cudaSuccess)
+    {
+        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
+        exit(EXIT_FAILURE);
+    }
+}
+
+// GPU version that issues the work for the two arrays into two streams.
+// Usage: <program> [numElements]; the element count defaults to 1000000.
+// Host buffers come from cudaMallocHost and are filled with pseudo-random
+// values from a fixed seed. Copy-in, kernel and copy-out for the first
+// array are enqueued in stream1, those for the second array in stream2,
+// and the host waits for the device once before reading the results.
+// It then prints a few values from each end, the sums and the measured
+// clock() time.
+// Returns 0 on success; exits with EXIT_FAILURE on a non-positive element
+// count or any CUDA error.
+int main(int argc, char* argv[])
+{
+
+    int numElements = (argc > 1) ? atoi(argv[1]) : 1000000;
+    if (numElements <= 0)
+    {
+        fprintf(stderr, "The number of values must be positive, got %d.\n", numElements);
+        return EXIT_FAILURE;
+    }
+
+    printf("Transforming %d values.\n", numElements);
+
+    const size_t numBytes = numElements*sizeof(float);
+
+    float* h_data1;
+    float* h_data2;
+
+    checkCuda(cudaMallocHost((void**)&h_data1, numBytes), "cudaMallocHost(h_data1)");
+    checkCuda(cudaMallocHost((void**)&h_data2, numBytes), "cudaMallocHost(h_data2)");
+
+    // Fixed seed, so every run works on the same input values
+    srand(1214134);
+    for (int i = 0; i < numElements; i++)
+    {
+        h_data1[i] = float(rand())/float(RAND_MAX + 1.0);
+        h_data2[i] = float(rand())/float(RAND_MAX + 1.0);
+    }
+
+    int threadsPerBlock = BLOCK_SIZE;
+    // Round up so a partially filled last block covers the tail of the data;
+    // written without "numElements + BLOCK_SIZE - 1" to avoid int overflow
+    int numBlocks = numElements/BLOCK_SIZE + ((numElements % BLOCK_SIZE != 0) ? 1 : 0);
+
+    float* d_data1;
+    float* d_data2;
+
+    checkCuda(cudaMalloc((void**)&d_data1, numBytes), "cudaMalloc(d_data1)");
+    checkCuda(cudaMalloc((void**)&d_data2, numBytes), "cudaMalloc(d_data2)");
+
+    cudaStream_t stream1;
+    cudaStream_t stream2;
+
+    checkCuda(cudaStreamCreate(&stream1), "cudaStreamCreate(stream1)");
+    checkCuda(cudaStreamCreate(&stream2), "cudaStreamCreate(stream2)");
+
+    // Timing
+    clock_t start = clock();
+
+    checkCuda(cudaMemcpyAsync(d_data1, h_data1, numBytes, cudaMemcpyHostToDevice, stream1), "copy of data1 to device");
+    func1_kernel<<<numBlocks, threadsPerBlock, 0, stream1>>>(d_data1, d_data1, numElements);
+    // A launch does not return a status; invalid configurations show up here
+    checkCuda(cudaGetLastError(), "func1_kernel launch");
+    checkCuda(cudaMemcpyAsync(h_data1, d_data1, numBytes, cudaMemcpyDeviceToHost, stream1), "copy of data1 to host");
+
+    checkCuda(cudaMemcpyAsync(d_data2, h_data2, numBytes, cudaMemcpyHostToDevice, stream2), "copy of data2 to device");
+    func2_kernel<<<numBlocks, threadsPerBlock, 0, stream2>>>(d_data2, d_data2, numElements);
+    checkCuda(cudaGetLastError(), "func2_kernel launch");
+    checkCuda(cudaMemcpyAsync(h_data2, d_data2, numBytes, cudaMemcpyDeviceToHost, stream2), "copy of data2 to host");
+
+    // Wait for both streams before the host reads the results; failures of
+    // the asynchronous work are reported by this call
+    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
+
+    // Timing
+    clock_t finish = clock();
+
+    // Never print more values than exist, so short inputs stay in bounds
+    const int numToPrint = std::min(numValuesToPrint, numElements);
+
+    printf("The results are:\n");
+    for (int i = 0; i < numToPrint; i++)
+    {
+        printf("%f, %f\n", h_data1[i], h_data2[i]);
+    }
+    printf("...\n");
+    for (int i = numElements - numToPrint; i < numElements; i++)
+    {
+        printf("%f, %f\n", h_data1[i], h_data2[i]);
+    }
+    // Accumulate in double to limit rounding error over many floats
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+    for (int i = 0; i < numElements; i++)
+    {
+        sum1 += h_data1[i];
+        sum2 += h_data2[i];
+    }
+    printf("The summs are: %f and %f\n", sum1, sum2);
+
+    printf("It took %f seconds\n", (double)(finish - start) / CLOCKS_PER_SEC);
+
+    // Release the streams
+    checkCuda(cudaStreamDestroy(stream1), "cudaStreamDestroy(stream1)");
+    checkCuda(cudaStreamDestroy(stream2), "cudaStreamDestroy(stream2)");
+
+    // Release the memory
+    checkCuda(cudaFreeHost(h_data1), "cudaFreeHost(h_data1)");
+    checkCuda(cudaFreeHost(h_data2), "cudaFreeHost(h_data2)");
+
+    checkCuda(cudaFree(d_data1), "cudaFree(d_data1)");
+    checkCuda(cudaFree(d_data2), "cudaFree(d_data2)");
+
+    return 0;
+}