-
Notifications
You must be signed in to change notification settings - Fork 360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
OptiX testrender overhaul #1829
Changes from 105 commits
8deb471
4a85d18
5475f08
79d39d6
e7adbe5
c97b0c0
418e916
4cff2d8
20d452b
4b16643
c6a2240
8f4bb85
4c63724
9e54203
a34ccc9
a3f7e85
0c92f24
b0229ed
4f4ccce
7dcc2b2
e17b590
fd0d6cc
d976332
d391c73
77485d3
14445e4
ccba6f7
48adb13
a0b9477
ddb401d
fc439e6
77de801
2bd8f7e
0a1f025
eae4ef8
06bea21
a3707d6
0bf4c56
5c3a786
1ebcd09
107e1ef
80e3043
6ed6442
a81b9b2
d80bc21
fa5c7d0
d2d519b
0cf56ea
f465410
595663a
ba60cd0
8cb6d3c
49233be
2b5d5c6
91f6495
ab3008a
c54fb95
55b7b76
705c8ca
a388b50
dca249c
0066ab1
7e44a37
1a96921
dd1b3b6
a13146d
8ad387c
31abe77
159c1f2
0f6b710
99a92b6
4d591ce
e59210d
f7d79cd
a3445dd
e937b66
648ebfc
ec7a026
221a150
b75b9eb
6d75456
7938f5b
8a69ba9
a70fe49
07e874e
64b405b
169b071
d0626af
a6dc8d3
281074e
efa8c2c
7d389df
3c70d81
500a289
064921d
88cd20c
f329ba1
2c60ac9
f6d65d5
9b7c31c
7958fe1
1a71f24
33d3576
e73c4c3
6bb82fe
41ffaf9
ddcfd22
02bb427
db46f09
eb5a865
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,17 +10,51 @@ | |
|
||
OSL_NAMESPACE_ENTER | ||
|
||
|
||
#ifdef __CUDACC__ | ||
// std::upper_bound is not supported in device code, so define a version of it here. | ||
// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information. | ||
template<typename T> | ||
inline OSL_HOSTDEVICE const T* | ||
upper_bound_cuda(const T* data, int count, const T value) | ||
{ | ||
const T* first = data; | ||
const T value_ = value; | ||
int len = count; | ||
while (len != 0) { | ||
int l2 = len / 2; | ||
const T* m = first; | ||
m += l2; | ||
if (value_ < *m) | ||
len = l2; | ||
else { | ||
first = ++m; | ||
len -= l2 + 1; | ||
} | ||
} | ||
return first; | ||
} | ||
#endif | ||
|
||
|
||
struct Background { | ||
// Creates a Background with no tables attached (all three table pointers null).
OSL_HOSTDEVICE
Background() : values(nullptr), rows(nullptr), cols(nullptr) {}
|
||
// Releases the three tables with delete[] when not compiled by the CUDA
// compiler. Under __CUDACC__ nothing is freed here.
// NOTE(review): presumably the CUDA-side tables handed to set_variables() are
// owned by the caller -- confirm.
OSL_HOSTDEVICE
~Background()
{
#if !defined(__CUDACC__)
    delete[] cols;
    delete[] rows;
    delete[] values;
#endif
}
|
||
template<typename F, typename T> void prepare(int resolution, F cb, T* data) | ||
template<typename F, typename T> | ||
void prepare(int resolution, F cb, T* data) | ||
{ | ||
// These values are set via set_variables() in CUDA | ||
res = resolution; | ||
if (res < 32) | ||
res = 32; // validate | ||
|
@@ -29,6 +63,7 @@ struct Background { | |
values = new Vec3[res * res]; | ||
rows = new float[res]; | ||
cols = new float[res * res]; | ||
|
||
for (int y = 0, i = 0; y < res; y++) { | ||
for (int x = 0; x < res; x++, i++) { | ||
values[i] = cb(map(x + 0.5f, y + 0.5f), data); | ||
|
@@ -43,8 +78,9 @@ struct Background { | |
cols[i - res + x] /= cols[i - 1]; | ||
} | ||
// normalize the pdf across all scanlines | ||
for (int y = 0; y < res; y++) | ||
for (int y = 0; y < res; y++) { | ||
rows[y] /= rows[res - 1]; | ||
} | ||
|
||
// both eval and sample below return a "weight" that is | ||
// value[i] / row*col_pdf, so might as well bake it into the table | ||
|
@@ -65,6 +101,7 @@ struct Background { | |
#endif | ||
} | ||
|
||
OSL_HOSTDEVICE | ||
Vec3 eval(const Vec3& dir, float& pdf) const | ||
{ | ||
// map from sphere to unit-square | ||
|
@@ -90,6 +127,7 @@ struct Background { | |
return values[i]; | ||
} | ||
|
||
OSL_HOSTDEVICE | ||
Vec3 sample(float rx, float ry, Dual2<Vec3>& dir, float& pdf) const | ||
{ | ||
float row_pdf, col_pdf; | ||
|
@@ -101,8 +139,96 @@ struct Background { | |
return values[y * res + x]; | ||
} | ||
|
||
#ifdef __CUDACC__ | ||
// Attaches caller-provided tables and derives the cached constants from the
// resolution. The pointers are stored as given; nothing is copied.
//   values_in : value table, indexed as values[y * res + x]
//   rows_in   : per-row table, indexed as rows[y]
//   cols_in   : per-entry column table, indexed as cols[y * res + x]
//   res_in    : table resolution; asserted to be at least 32
OSL_HOSTDEVICE
void set_variables(Vec3* values_in, float* rows_in, float* cols_in,
                   int res_in)
{
    res    = res_in;
    values = values_in;
    rows   = rows_in;
    cols   = cols_in;

    // invres = 1 / res, invjacobian = res^2 / (4 * pi), both rounded to nearest
    invres      = __frcp_rn(float(res));
    invjacobian = __fdiv_rn(float(res * res), float(4 * M_PI));

    assert(res >= 32);
}
|
||
// Runs the three table-building phases back to back for one (stride, idx) | ||
// participant: prepare_cuda_01, then prepare_cuda_02 (only when idx == 0), | ||
// then prepare_cuda_03. | ||
//   stride : step between the columns/rows handled by one participant | ||
//   idx    : this participant's starting column/row | ||
//   cb     : callback forwarded to prepare_cuda_01 | ||
// NOTE(review): prepare_cuda_02 reads every values[] entry, including those | ||
// written by other idx values in phase 01, and prepare_cuda_03 reads the | ||
// rows[]/cols[] entries written by idx 0 in phase 02. Nothing in this function | ||
// orders those accesses between concurrently running participants -- confirm | ||
// that the launcher serializes the phases (or calls this with a single idx). | ||
template<typename F> | ||
OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb) | ||
{ | ||
prepare_cuda_01(stride, idx, cb); | ||
if (idx == 0) | ||
prepare_cuda_02(); | ||
prepare_cuda_03(stride, idx); | ||
} | ||
|
||
// Pre-compute the 'values' table in parallel | ||
template<typename F> | ||
OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb) | ||
{ | ||
for (int y = 0; y < res; y++) { | ||
const int row_start = y * res; | ||
const int row_end = row_start + res; | ||
int i = row_start + idx; | ||
for (int x = idx; x < res; x += stride, i += stride) { | ||
if (i >= row_end) | ||
continue; | ||
values[i] = cb(map(x + 0.5f, y + 0.5f)); | ||
} | ||
} | ||
} | ||
|
||
// Compute 'cols' and 'rows' using a single thread.
// For each row, cols[] receives the running sum of the largest channel of each
// values[] entry; rows[y] receives that row's total plus rows[y - 1]; the row
// of cols[] is then divided by its total when the total is positive.
OSL_HOSTDEVICE void prepare_cuda_02()
{
    for (int y = 0; y < res; y++) {
        float* cdf_row        = cols + y * res;
        const Vec3* value_row = values + y * res;

        // running sum of the per-entry maximum channel along this row
        for (int x = 0; x < res; x++) {
            const Vec3& v    = value_row[x];
            const float peak = std::max(std::max(v.x, v.y), v.z);
            const float prev = (x > 0) ? cdf_row[x - 1] : 0.0f;
            cdf_row[x]       = peak + prev;
        }

        // accumulate the (still unnormalized) row totals
        const float row_total  = cdf_row[res - 1];
        const float rows_above = (y > 0) ? rows[y - 1] : 0.0f;
        rows[y]                = row_total + rows_above;

        // normalize the pdf for this scanline (if it was non-zero)
        if (row_total > 0) {
            for (int x = 0; x < res; x++)
                cdf_row[x] = __fdiv_rn(cdf_row[x], row_total);
        }
    }
}
|
||
// Normalize the row PDFs and finalize the 'values' table.
//   stride : step between the rows (first loop) and columns (second loop)
//            handled by one participant (assumed > 0)
//   idx    : first row / column handled by this participant
// NOTE(review): the first loop divides by rows[res - 1], which is itself
// overwritten by whichever participant handles y == res - 1, and the second
// loop reads rows[y] / rows[y - 1] entries normalized by other idx values.
// With a single participant (stride == 1, idx == 0) the order below is
// well defined; with concurrent participants it relies on ordering that is
// not established inside this function -- confirm against the launcher.
OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx)
{
    // normalize the pdf across all scanlines
    for (int y = idx; y < res; y += stride) {
        rows[y] = __fdiv_rn(rows[y], rows[res - 1]);
    }

    // both eval() and sample() return a "weight" that is
    // value[i] / row*col_pdf, so might as well bake it into the table
    for (int y = 0; y < res; y++) {
        const float row_pdf = rows[y] - (y > 0 ? rows[y - 1] : 0.0f);
        const int row_start = y * res;
        // x < res already keeps row_start + x inside this row, so no separate
        // end-of-row check is needed.
        for (int x = idx; x < res; x += stride) {
            const int i         = row_start + x;
            const float col_pdf = cols[i] - (x > 0 ? cols[i - 1] : 0.0f);
            const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf),
                                            invjacobian);
            values[i].x = __fdiv_rn(values[i].x, divisor);
            values[i].y = __fdiv_rn(values[i].y, divisor);
            values[i].z = __fdiv_rn(values[i].z, divisor);
        }
    }
}
#endif | ||
|
||
private: | ||
Dual2<Vec3> map(float x, float y) const | ||
OSL_HOSTDEVICE Dual2<Vec3> map(float x, float y) const | ||
{ | ||
// pixel coordinates of entry (x,y) | ||
Dual2<float> u = Dual2<float>(x, 1, 0) * invres; | ||
|
@@ -115,14 +241,20 @@ struct Background { | |
return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi); | ||
} | ||
|
||
static float sample_cdf(const float* data, unsigned int n, float x, | ||
unsigned int* idx, float* pdf) | ||
static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n, | ||
float x, unsigned int* idx, | ||
float* pdf) | ||
{ | ||
OSL_DASSERT(x >= 0); | ||
OSL_DASSERT(x < 1); | ||
OSL_DASSERT(x >= 0.0f); | ||
OSL_DASSERT(x < 1.0f); | ||
#ifndef __CUDACC__ | ||
*idx = std::upper_bound(data, data + n, x) - data; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't see any reason to keep using the [inline code reference lost in extraction; from context, the `std::upper_bound` host path]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll leave it as CUDA-only for now to possibly do some linear search experiments, to see if that is a win on the GPU. |
||
#else | ||
*idx = upper_bound_cuda(data, n, x) - data; | ||
#endif | ||
OSL_DASSERT(*idx < n); | ||
OSL_DASSERT(x < data[*idx]); | ||
|
||
float scaled_sample; | ||
if (*idx == 0) { | ||
*pdf = data[0]; | ||
|
@@ -137,12 +269,12 @@ struct Background { | |
return std::min(scaled_sample, 0.99999994f); | ||
} | ||
|
||
Vec3* values; // actual map | ||
float* rows; // probability of choosing a given row 'y' | ||
float* cols; // probability of choosing a given column 'x', given that we've chosen row 'y' | ||
int res; // resolution in pixels of the precomputed table | ||
float invres; // 1 / resolution | ||
float invjacobian; | ||
Vec3* values = nullptr; // actual map | ||
float* rows = nullptr; // probability of choosing a given row 'y' | ||
float* cols = nullptr; // probability of choosing a given column 'x', given that we've chosen row 'y' | ||
int res = -1; // resolution in pixels of the precomputed table | ||
float invres = 0.0f; // 1 / resolution | ||
float invjacobian = 0.0f; | ||
}; | ||
|
||
OSL_NAMESPACE_EXIT |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Linear search is often faster on the GPU :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In my experience, not really, though it would be worth benchmarking. Also, I believe this is used for the background PDF, which can be a few thousand elements long.