-
Notifications
You must be signed in to change notification settings - Fork 0
/
neon_array.hpp
155 lines (119 loc) · 3.11 KB
/
neon_array.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#ifndef NEON_ARRAY_HPP_INCLUDED
#define NEON_ARRAY_HPP_INCLUDED
#ifdef _WIN32
#include <malloc.h>
#endif
#include <cassert>
#include <cstddef>
#include <arm_neon.h>
namespace expression_template_simd
{
// Primary template for the NEON-backed array storage. It is only declared
// here, never defined; the usable implementation in this file is the explicit
// specialization for float further down.
template <typename Real>
class valarray_rep_neon;
/// Lane-wise sum of two 4 x float vectors.
/// @param lhs left operand
/// @param rhs right operand
/// @return vector whose lane k is lhs[k] + rhs[k]
INLINE float32x4_t add(const float32x4_t& lhs, const float32x4_t& rhs)
{
    const float32x4_t sum = vaddq_f32(lhs, rhs);
    return sum;
}
/// Lane-wise product of two 4 x float vectors.
/// @param lhs left operand
/// @param rhs right operand
/// @return vector whose lane k is lhs[k] * rhs[k]
INLINE float32x4_t mul(const float32x4_t& lhs, const float32x4_t& rhs)
{
    const float32x4_t product = vmulq_f32(lhs, rhs);
    return product;
}
/// Lane-wise multiply-accumulate, forwarded to vmlaq_f32.
/// @param a accumulator (the addend)
/// @param b first factor
/// @param c second factor
/// @return vector whose lane k is a[k] + b[k] * c[k]
INLINE float32x4_t madd(const float32x4_t& a, const float32x4_t& b, const float32x4_t& c)
{
    // Note the operand order: the FIRST argument is the value being added to.
    const float32x4_t accumulated = vmlaq_f32(a, b, c);
    return accumulated;
}
/// Approximate lane-wise square root, computed as v * rsqrt_estimate(v).
///
/// The reciprocal square root comes from vrsqrteq_f32, which is only an
/// estimate, so the result is approximate rather than correctly rounded.
/// No refinement step is applied, matching the original cost of one estimate
/// plus one multiply.
///
/// Lanes holding 0 or +infinity are passed through unchanged: for those inputs
/// the estimate is infinity or 0 respectively, and the product would otherwise
/// be NaN even though the square root is the input itself.
///
/// @param v input vector; negative lanes yield NaN
/// @return vector of approximate square roots
INLINE float32x4_t square_root(const float32x4_t& v)
{
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t inv_root = vrsqrteq_f32(v);
    const float32x4_t root = vmulq_f32(v, inv_root);

    // v == 0      -> inv_root is infinite, 0 * inf = NaN
    // v == +inf   -> inv_root is 0,        inf * 0 = NaN
    const uint32x4_t degenerate = vorrq_u32(vceqq_f32(v, zero), vceqq_f32(inv_root, zero));

    // Select v for the degenerate lanes, the computed root everywhere else.
    return vbslq_f32(degenerate, v, root);
}
/// Reads a single lane out of a 4 x float vector using a run-time index.
///
/// @param value vector to read from
/// @param i     lane index, must be in [0, 4)
/// @return the float stored in lane i
INLINE float get(const float32x4_t& value, std::size_t i)
{
    assert(i < 4);
#ifdef _WIN32
    // The MSVC vector type exposes its lanes as a plain array member.
    return value.n128_f32[i];
#else
    // vgetq_lane_f32 needs a compile-time lane number, so spill the vector to
    // a small array and index that instead. (Previously this branch returned
    // 0.0f unconditionally, making every element read back as zero.)
    float lanes[4];
    vst1q_f32(lanes, value);
    return lanes[i];
#endif
}
/// Owning storage for an array of floats packed four-per-vector into
/// float32x4_t elements.
///
/// `size()` is the number of scalar floats requested; `elements()` is the
/// number of float32x4_t vectors allocated, i.e. size rounded up to a whole
/// number of vectors. The buffer is obtained from _mm_malloc with
/// `alignment()` bytes of alignment and released with _mm_free in the
/// destructor.
template <>
class valarray_rep_neon<float>
{
public:
    typedef float value_type;
    typedef float32x4_t element_type;

    /// Allocates room for `size` floats. The contents are left uninitialized.
    INLINE valarray_rep_neon(std::size_t size)
        : _size(size)
        , _elements(packed_count(size))
        , _values(allocate(_elements))
    {
    }

    /// Allocates room for `size` floats and sets every lane of every vector
    /// (including any padding lanes in the last vector) to `value`.
    INLINE valarray_rep_neon(std::size_t size, value_type value)
        : _size(size)
        , _elements(packed_count(size))
        , _values(allocate(_elements))
    {
        const float32x4_t splat = vdupq_n_f32(value);
        for (std::size_t i = 0; i < _elements; ++i)
            _values[i] = splat;
    }

    /// Releases the vector buffer.
    INLINE ~valarray_rep_neon()
    {
        _mm_free(_values);
    }

    /// Deep copy: allocates a buffer of the same vector count and copies every
    /// vector from `copy`.
    INLINE valarray_rep_neon(const valarray_rep_neon& copy)
        : _size(copy._size)
        , _elements(copy._elements)
        // The buffer must hold _elements *vectors*; the byte count is
        // _elements * sizeof(element_type). Allocating only _elements bytes
        // (as before) made the copy below overrun the buffer.
        , _values(allocate(_elements))
    {
        swap(copy);
    }

    /// Copies the vectors of `copy` into this object's existing buffer.
    /// Both objects must have the same size (asserted in swap()).
    INLINE valarray_rep_neon& operator= (const valarray_rep_neon& copy)
    {
        swap(copy);
        return *this;
    }

    /// Returns vector number `i` by value; `i` must be below elements().
    INLINE element_type operator() (std::size_t i) const
    {
        assert(i < _elements);
        return _values[i];
    }

    /// Returns a mutable reference to vector number `i`; `i` must be below
    /// elements().
    INLINE element_type& operator() (std::size_t i)
    {
        assert(i < _elements);
        return _values[i];
    }

    /// Returns scalar number `i`; `i` must be below size(). The scalar is
    /// located as lane (i % element_size()) of vector (i / element_size()).
    INLINE float operator[] (std::size_t i) const
    {
        assert(i < _size);
        const std::size_t vector_index = i / element_size();
        const std::size_t lane = i % element_size();
        return get(_values[vector_index], lane);
    }

    /// Number of scalar floats.
    INLINE std::size_t size() const
    {
        return _size;
    }

    /// Number of float32x4_t vectors in the buffer.
    INLINE std::size_t elements() const
    {
        return _elements;
    }

    /// Alignment, in bytes, requested for the buffer.
    INLINE static std::size_t alignment()
    {
        return sizeof(element_type);
    }

    /// Number of scalars held by one vector.
    INLINE static std::size_t element_size()
    {
        return sizeof(element_type) / sizeof(value_type);
    }

    /// Copies every vector of `copy` into this object. Despite the name this
    /// is a one-way copy: `copy` is not modified. Sizes must match.
    INLINE void swap(const valarray_rep_neon& copy)
    {
        assert(_size == copy._size);
        for (std::size_t i = 0; i < _elements; ++i)
            _values[i] = copy._values[i];
    }

private:
    /// Number of vectors needed to hold `size` scalars (rounded up).
    INLINE static std::size_t packed_count(std::size_t size)
    {
        const std::size_t per_vector = element_size();
        return (size / per_vector) + ((size % per_vector == 0) ? 0 : 1);
    }

    /// Allocates an aligned buffer of `count` vectors.
    INLINE static element_type* allocate(std::size_t count)
    {
        return static_cast<element_type*>(_mm_malloc(count * sizeof(element_type), alignment()));
    }

    // Declaration order matters: _values is initialized from _elements.
    std::size_t _size;
    std::size_t _elements;
    element_type* _values;
} ; // end class valarray_rep_neon<float>
} // end namespace expression_template_simd
#endif // end NEON_ARRAY_HPP_INCLUDED