Skip to content

Commit

Permalink
Fix AVX512 issue, add opt param
Browse files Browse the repository at this point in the history
  • Loading branch information
msg7086 committed Apr 20, 2020
1 parent a1302a6 commit a7fb01f
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 7 deletions.
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Neo FFT3D Copyright(C) 2020 Xinyue Lu, and previous developers

FFT3DFilter is a 3D Frequency Domain filter - a strong denoiser and moderate sharpener. It was originally written by Alexander G. Balakhnin aka Fizick, later modified by martin53 for AviSynth 2.6, and then further modified by Ferenc Pintér aka pinterf for further improvements, high bit depth support, and more. Kudos to them for creating and improving this fantastic tool.

In this project, legacy format like YUY2 has been removed, legacy parameter like multiplane has been removed, and SIMD code has been completely re-written for all core parts of the code. Due to API change, the project has been renamed from FFT3DFilter to Neo_FFT3D to avoid confusion. SSE is required to run optimized routine. AVX and AVX512 routine is also available. AVX512 is untested due to lack of hardware.
In this project, legacy formats like YUY2 have been removed, legacy parameters like multiplane have been removed, and the SIMD code has been completely rewritten for all core parts of the code. Due to API changes, the project has been renamed from FFT3DFilter to Neo_FFT3D to avoid confusion. SSE is required to run the optimized routines. AVX and AVX512 routines are also available.

## Usage

Expand Down Expand Up @@ -71,6 +71,20 @@ Parameters:

Default: 0.

- *opt*

Sets which CPU optimizations to use.

<0 - Auto detect with AVX512 enabled
0 - Auto detect with AVX512 disabled
1 - Use C
2 - Use up to SSE
3 - Use up to AVX
4 - Use up to AVX512

Default: 0.


### Note on MT (multi-threading)

Neo_FFT3D is MT-compatible. Do not invoke more than 6 threads; otherwise it will start using lots of memory. The sweet spot is likely to be 2-4 threads.
Expand Down
5 changes: 4 additions & 1 deletion src/fft3d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ struct FFT3D final : Filter {
Param {"l", Integer},
Param {"t", Integer},
Param {"r", Integer},
Param {"b", Integer}
Param {"b", Integer},
Param {"opt", Integer}
};
}
void Initialize(InDelegator* in, DSVideoInfo in_vi, FetchFrameFunctor* fetch_frame) override
Expand Down Expand Up @@ -100,6 +101,7 @@ struct FFT3D final : Filter {
0.0f,
2.0f, 50.0f,
0, 0, 0, 0,
0,
in_vi
};
in->Read("beta", ep->beta);
Expand Down Expand Up @@ -134,6 +136,7 @@ struct FFT3D final : Filter {
in->Read("t", ep->t); ep->t = MAX(ep->t, 0);
in->Read("r", ep->r); ep->r = MAX(ep->r, 0);
in->Read("b", ep->b); ep->b = MAX(ep->b, 0);
in->Read("opt", ep->opt);

this->crop = ep->l > 0 || ep->r > 0 || ep->t > 0 || ep->b > 0;

Expand Down
1 change: 1 addition & 0 deletions src/fft3d_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct EngineParams {
float hr; // halo radius - v1.9
float ht; // halo threshold - v1.9
int l, t, r, b; // cropping
int opt;

DSVideoInfo vi;
bool IsChroma;
Expand Down
2 changes: 1 addition & 1 deletion src/fft3d_engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ class FFT3DEngine {
}

CPUFlags = GetCPUFlags(); //re-enabled in v.1.9
ffp.set_ffp(CPUFlags, ep->degrid, ep->pfactor, ep->bt);
ffp.set_ffp(CPUFlags, ep->degrid, ep->pfactor, ep->bt, ep->opt);

pwin = new float[ep->bh*outpitch]; // pattern window array

Expand Down
23 changes: 19 additions & 4 deletions src/functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,15 @@ struct FilterFunctionPointers {
void (*Sharpen)(fftwf_complex *, SharedFunctionParams);
void (*Kalman)(fftwf_complex *, fftwf_complex *, SharedFunctionParams);

void set_ffp(int CPUFlags, float degrid, float pfactor, int bt)
void set_ffp(int CPUFlags, float degrid, float pfactor, int bt, int opt)
{
// opt
// 1: C
// 2: SSE
// 3: AVX
// 4: AVX512
// 0: auto-detect without AVX512
// -1: auto-detect with AVX512
if (degrid != 0 && pfactor == 0) {
// Default Dispatcher
Apply2D_C_Dispatch = Apply2D_C<false, true>;;
Expand Down Expand Up @@ -162,44 +169,52 @@ struct FilterFunctionPointers {
Sharpen_C_Dispatch = Sharpen_C<true>;
Sharpen_SSE2_Dispatch = Sharpen_SSE2<true>;
Sharpen_AVX_Dispatch = Sharpen_AVX<true>;
Sharpen_AVX512_Dispatch = Sharpen_AVX512<true>;
}
else {
Sharpen_C_Dispatch = Sharpen_C<false>;
Sharpen_SSE2_Dispatch = Sharpen_SSE2<false>;
Sharpen_AVX_Dispatch = Sharpen_AVX<false>;
Sharpen_AVX512_Dispatch = Sharpen_AVX512<false>;
}

if (pfactor != 0) {
Kalman_C_Dispatch = Kalman_C<true>;
Kalman_SSE2_Dispatch = Kalman_SSE2<true>;
Kalman_AVX_Dispatch = Kalman_AVX<true>;
Kalman_AVX512_Dispatch = Kalman_AVX512<true>;
}
else {
Kalman_C_Dispatch = Kalman_C<false>;
Kalman_SSE2_Dispatch = Kalman_SSE2<false>;
Kalman_AVX_Dispatch = Kalman_AVX<false>;
Kalman_AVX512_Dispatch = Kalman_AVX512<false>;
}

switch(bt) {
case 2:
Apply3D_C_Dispatch = Apply3D2_C_Dispatch;
Apply3D_SSE2_Dispatch = Apply3D2_SSE2_Dispatch;
Apply3D_AVX_Dispatch = Apply3D2_AVX_Dispatch;
Apply3D_AVX512_Dispatch = Apply3D2_AVX512_Dispatch;
break;
case 3:
Apply3D_C_Dispatch = Apply3D3_C_Dispatch;
Apply3D_SSE2_Dispatch = Apply3D3_SSE2_Dispatch;
Apply3D_AVX_Dispatch = Apply3D3_AVX_Dispatch;
Apply3D_AVX512_Dispatch = Apply3D3_AVX512_Dispatch;
break;
case 4:
Apply3D_C_Dispatch = Apply3D4_C_Dispatch;
Apply3D_SSE2_Dispatch = Apply3D4_SSE2_Dispatch;
Apply3D_AVX_Dispatch = Apply3D4_AVX_Dispatch;
Apply3D_AVX512_Dispatch = Apply3D4_AVX512_Dispatch;
break;
case 5:
Apply3D_C_Dispatch = Apply3D5_C_Dispatch;
Apply3D_SSE2_Dispatch = Apply3D5_SSE2_Dispatch;
Apply3D_AVX_Dispatch = Apply3D5_AVX_Dispatch;
Apply3D_AVX512_Dispatch = Apply3D5_AVX512_Dispatch;
break;
}

Expand All @@ -210,19 +225,19 @@ struct FilterFunctionPointers {

// We actually only used SSE code.
// Let's try SSE and if it breaks on pure SSE we'll change it to SSE2.
if (CPUFlags & CPUF_SSE) {
if ((CPUFlags & CPUF_SSE) && (opt <= 0 || opt > 1)) {
Apply2D = Apply2D_SSE2_Dispatch;
Apply3D = Apply3D_SSE2_Dispatch;
Sharpen = Sharpen_SSE2_Dispatch;
Kalman = Kalman_SSE2_Dispatch;
}
if (CPUFlags & CPUF_AVX) {
if ((CPUFlags & CPUF_AVX) && (opt <= 0 || opt > 2)) {
Apply2D = Apply2D_AVX_Dispatch;
Apply3D = Apply3D_AVX_Dispatch;
Sharpen = Sharpen_AVX_Dispatch;
Kalman = Kalman_AVX_Dispatch;
}
if (CPUFlags & CPUF_AVX512F) {
if ((CPUFlags & CPUF_AVX512F) && (opt < 0 || opt > 3)) {
Apply2D = Apply2D_AVX512_Dispatch;
Apply3D = Apply3D_AVX512_Dispatch;
Sharpen = Sharpen_AVX512_Dispatch;
Expand Down

0 comments on commit a7fb01f

Please sign in to comment.