-
Notifications
You must be signed in to change notification settings - Fork 0
/
vec.cu
104 lines (92 loc) · 2.31 KB
/
vec.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include"stdio.h"
#include"time.h"
// Serial element-wise product: dc1[k] = db1[k] * da1[k] for 0 <= k < n.
//
// No thread or block index is consulted, so every thread that runs this
// kernel walks the whole range by itself. It is meant as the single-thread
// baseline (see the commented-out <<<1,1>>> launch in main).
//
// Parameters:
//   da1, db1 - input arrays, read at indices 0..n-1
//   dc1      - output array, written at indices 0..n-1
//   n        - number of elements; nothing is written when n <= 0
__global__ void gpu_1(float *da1,float *db1,float *dc1,int n)
{
    int pos=0;
    while(pos<n)
    {
        // Addition variant kept for reference:
        //dc1[pos]=db1[pos]+da1[pos];
        const float lhs=db1[pos];
        const float rhs=da1[pos];
        dc1[pos]=lhs*rhs;
        ++pos;
    }
}
// Element-wise product dc1[i] = db1[i] * da1[i] for 0 <= i < n, split across
// the threads of one block.
//
// Thread t handles elements t, t + blockDim.x, t + 2*blockDim.x, ... until
// the index reaches n. blockIdx is not consulted, so if more than one block
// is launched every block repeats the same work.
//
// Parameters:
//   da1, db1 - input arrays, read at indices 0..n-1
//   dc1      - output array, written at indices 0..n-1
//   n        - number of elements; nothing is written when n <= 0
__global__ void gpu_2(float *da1,float *db1,float *dc1,int n)
{
    const int t_n=blockDim.x;
    // Debug aid, disabled the same way as in gpu_3: device printf is
    // serialized and every launched thread would print the same value, which
    // floods stdout and distorts any timing taken around the kernel.
    //printf("%d\n",t_n);

    // 64-bit index so that tid + t_n cannot overflow when n is close to
    // INT_MAX.
    for(long long tid=threadIdx.x;tid<n;tid+=t_n)
    {
        //dc1[tid]=db1[tid]+da1[tid];
        dc1[tid]=db1[tid]*da1[tid];
    }
}
// Element-wise product dc1[i] = db1[i] * da1[i] for 0 <= i < n, split across
// every thread of a 1-D launch.
//
// Each thread starts at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x) and then advances by the total
// number of launched threads (gridDim.x * blockDim.x) until the index reaches
// n, so any 1-D grid/block configuration covers the whole range. Only the x
// dimension of the grid and block is used.
//
// Parameters:
//   da1, db1 - input arrays, read at indices 0..n-1
//   dc1      - output array, written at indices 0..n-1
//   n        - number of elements; nothing is written when n <= 0
__global__ void gpu_3(float *da1,float *db1,float *dc1,int n)
{
    // The index math is done in 64 bits. gridDim/blockDim/blockIdx are
    // unsigned 32-bit, so in 32-bit arithmetic the products below can wrap or
    // turn negative when narrowed to int on very large launches. A zero
    // stride would loop forever, and a negative start index would pass the
    // "< n" test and write out of bounds.
    const long long stride=(long long)gridDim.x*blockDim.x;// total threads in the grid
    const long long first=(long long)blockIdx.x*blockDim.x+threadIdx.x;// this thread's global index
    //printf("%lld\n",stride);
    for(long long tid=first;tid<n;tid+=stride)
    {
        //dc1[tid]=db1[tid]+da1[tid];
        dc1[tid]=db1[tid]*da1[tid];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names
// the call for the diagnostic.
static bool cudaOk(cudaError_t status,const char *what)
{
    if(status==cudaSuccess)
    {
        return true;
    }
    fprintf(stderr,"CUDA error (%s): %s\n",what,cudaGetErrorString(status));
    return false;
}

// Multiplies two 99999-element float arrays element by element, first on the
// CPU and then on the GPU, and prints the elapsed time and the first result
// of each run.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int arrsize=99999;
    const int ARRAY_BYTES = arrsize * sizeof(float);
    // Static storage: each array is ~400 KB, and keeping several of them on
    // the stack can exceed the default stack size on some platforms.
    static float a[arrsize];
    static float b[arrsize];
    static float c[arrsize];
    static float gc1[arrsize];// GPU result copied back to the host
    for(int i=0;i<arrsize;i++)
    {
        b[i]=(float)(i+1);
        a[i]=(float)(i+1);
    }

    /*********************************** CPU reference **************************************/
    clock_t start,end;
    start=clock();
    for(int i=0;i<arrsize;i++)
    {
        //c[i]=b[i]+a[i];
        c[i]=b[i]*a[i];
    }
    end=clock();
    double during=(double)(end-start)/CLOCKS_PER_SEC;
    printf("耗时%f秒\n",during);
    printf("%f\n",c[0]);

    /*********************************** GPU run **************************************/
    // The device inputs are uploaded straight from a and b; the values are
    // the same ones the CPU run used.
    float *da=NULL;
    float *db=NULL;
    float *dc=NULL;
    bool ok=cudaOk(cudaMalloc((void**) &da,ARRAY_BYTES),"cudaMalloc da")
        &&cudaOk(cudaMalloc((void**) &db,ARRAY_BYTES),"cudaMalloc db")
        &&cudaOk(cudaMalloc((void**) &dc,ARRAY_BYTES),"cudaMalloc dc")
        &&cudaOk(cudaMemcpy(da,a,ARRAY_BYTES,cudaMemcpyHostToDevice),"cudaMemcpy a -> da")
        &&cudaOk(cudaMemcpy(db,b,ARRAY_BYTES,cudaMemcpyHostToDevice),"cudaMemcpy b -> db");

    clock_t start_gpu1=0,end_gpu1=0;
    if(ok)
    {
        start_gpu1=clock();
        /*********************************** GPU: single block, single thread **************************************/
        //gpu_1<<<1,1>>>(da,db,dc,arrsize);
        /*********************************** GPU: single block, many threads **************************************/
        gpu_2<<<1,1024>>>(da,db,dc,arrsize);
        /*********************************** GPU: many blocks, many threads **************************************/
        //gpu_3<<<5000,1024>>>(da,db,dc,arrsize);

        // A kernel launch returns before the kernel has run. Check the launch
        // itself, then wait for completion so that the interval below covers
        // the kernel's execution and not just the launch call.
        ok=cudaOk(cudaGetLastError(),"kernel launch")
            &&cudaOk(cudaDeviceSynchronize(),"kernel execution");
        end_gpu1=clock();
    }
    ok=ok&&cudaOk(cudaMemcpy(gc1,dc,ARRAY_BYTES,cudaMemcpyDeviceToHost),"cudaMemcpy dc -> gc1");
    if(ok)
    {
        double during1=(double)(end_gpu1-start_gpu1)/CLOCKS_PER_SEC;
        printf("耗时%f秒\n",during1);
        printf("%f\n",gc1[0]);
    }

    // cudaFree accepts a null pointer, so this is safe on every exit path.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return ok?0:1;
}