Skip to content

Commit

Permalink
Merge pull request #1626 from gnudatalanguage/master
Browse files Browse the repository at this point in the history
align 1.0.3 on latest changes
  • Loading branch information
GillesDuvert authored Aug 30, 2023
2 parents 907e95f + 759e0ae commit ab2b22c
Show file tree
Hide file tree
Showing 9 changed files with 71 additions and 74 deletions.
5 changes: 5 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
coverage:
status:
project:
default:
threshold: 1%
38 changes: 26 additions & 12 deletions src/basegdl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "basegdl.hpp"
#include "nullgdl.hpp"
#include "objects.hpp"

using namespace std;

Expand Down Expand Up @@ -843,16 +844,29 @@ void GDLDelete( BaseGDL* toDelete)
}
int GDL_NTHREADS=1;

int parallelize(SizeT n, int modifier) {
//below, please modify if you find a way to persuade behaviour of those different cases to be better if they return different number of threads.
switch(modifier)
{
case TP_DEFAULT: //the same as IDL, reserved for routines that use the thread pool, ideally check the special thread pool keywords.
case TP_ARRAY_INITIALISATION: // used by GDL array initialisation (new, convert, gdlarray): probably needs som special tuning
case TP_MEMORY_ACCESS: // concurrent memory access, probably needs to be capped to preserve bandwidth
case TP_CPU_INTENSIVE: // benefit from max number of threads
return (n >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS >= n))?CpuTPOOL_NTHREADS:1;
default:
return 1;
}
// Decide how many threads to use for an operation on nEl elements.
// 'modifier' (TP_DEFAULT, TP_ARRAY_INITIALISATION, TP_MEMORY_ACCESS, TP_CPU_INTENSIVE)
// describes the kind of workload so the thread count can be tuned per case.
// Returns a value in [1, CpuTPOOL_NTHREADS].
int parallelize(SizeT nEl, int modifier) {
  // Baseline: use the full pool only when nEl lies inside the user's
  // [CpuTPOOL_MIN_ELTS, CpuTPOOL_MAX_ELTS] window (MAX == 0 means "no upper limit").
  int nThreads = (nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS >= nEl)) ? CpuTPOOL_NTHREADS : 1;
  if (!useSmartTpool) return nThreads; // legacy behaviour unless --smart-tpool was given
  //below, please modify if you find a way to persuade behaviour of those different cases to be better if they return different number of threads.
  switch (modifier) {
  case TP_DEFAULT: //the same as IDL, reserved for routines that use the thread pool, ideally check the special thread pool keywords.
  case TP_ARRAY_INITIALISATION: // used by GDL array initialisation (new, convert, gdlarray): need to concern only 1 thread/code which is not possible AFAIK.
  case TP_MEMORY_ACCESS: // concurrent memory access, probably needs to be capped to preserve bandwidth
  {
    if (nThreads == 1) return nThreads;
    // More than 1 thread available: nEl operations will be divided between the threads.
    // Starting many threads for few elements has diminishing returns, so enable only
    // as many threads as needed for each one to process at least CpuTPOOL_MIN_ELTS.
    if (CpuTPOOL_MIN_ELTS < 1) return CpuTPOOL_NTHREADS; // the user did not understand IDL's doc about threadpools?.
    // Ceiling division, computed in SizeT to avoid int overflow for very large nEl.
    // (The previous floor-plus-one form allowed one thread too many on exact multiples,
    // letting threads fall below CpuTPOOL_MIN_ELTS elements each.)
    SizeT nchunk = (nEl + CpuTPOOL_MIN_ELTS - 1) / CpuTPOOL_MIN_ELTS;
    if (static_cast<SizeT>(nThreads) > nchunk) nThreads = static_cast<int>(nchunk);
    // std::cerr << nThreads;
    return nThreads;
  }
  case TP_CPU_INTENSIVE: // benefit from max number of threads if possible given MIN and MAX elts etc
    return nThreads;
  default:
    return 1;
  }
}
44 changes: 0 additions & 44 deletions src/basic_fun.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4091,50 +4091,6 @@ namespace lib {
}


// BaseGDL* matrix_multiply( EnvT* e)
// {
// SizeT nParam=e->NParam( 2);
//
// BaseGDL* a = e->GetNumericArrayParDefined( 0);
// BaseGDL* b = e->GetNumericArrayParDefined( 1);
//
// static int aTIx = e->KeywordIx("ATRANSPOSE");
// bool aT = e->KeywordPresent(aTIx);
// static int bTIx = e->KeywordIx("BTRANSPOSE");
// bool bT = e->KeywordPresent(bTIx);
//
// static int strassenIx = e->KeywordIx("STRASSEN_ALGORITHM");
// bool strassen = e->KeywordPresent(strassenIx);
//
//
// if( p1->N_Elements() != rank)
// e->Throw("Incorrect number of elements in permutation.");
//
// DUInt* perm = new DUInt[rank];
// Guard<DUInt> perm_guard( perm);
//
// DUIntGDL* p1L = static_cast<DUIntGDL*>
// (p1->Convert2( GDL_UINT, BaseGDL::COPY));
// for( SizeT i=0; i<rank; ++i) perm[i] = (*p1L)[ i];
// delete p1L;
//
// // check permutaion vector
// for( SizeT i=0; i<rank; ++i)
// {
// DUInt j;
// for( j=0; j<rank; ++j) if( perm[j] == i) break;
// if (j == rank)
// e->Throw( "Incorrect permutation vector.");
// }
// return p0->Transpose( perm);
// }
//
// return a->Transpose( NULL);
// }

// helper function for sort_fun, recursive
// optimized version

template< typename IndexT>
void MergeSortOpt(BaseGDL* p0, IndexT* hhS, IndexT* h1, IndexT* h2,
SizeT len) {
Expand Down
22 changes: 14 additions & 8 deletions src/datatypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1320,13 +1320,16 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
for (SizeT d = 0; d < rank; ++d) {
resDim[ d] = this->dim[ perm[ d]];
}

Data_* res = new Data_(dimension(resDim, rank), BaseGDL::NOZERO);

// src stride
SizeT srcStride[ MAXRANK + 1];
this->dim.Stride(srcStride, rank);


// GD: Tests show that we are way faster than eigen (below) with our 'parallell' method in ALL CASES on my intel I7.
// But this may not be true on other platforms, so keep the possibility via a -- switch.
if (useEigenForTransposeOps) {
#ifdef USE_EIGEN
//for some reason, this simple eigen::code dos not like dimensions == 1, so cannot be used if this is the case.
bool try_eigen=true;
Expand All @@ -1344,6 +1347,7 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
return res;
}
#endif

#ifdef EIGEN_HAS_TENSOR
else if (try_eigen && rank == 3) // special case: eigen x 3
{
Expand Down Expand Up @@ -1391,11 +1395,13 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__

#endif

} //will have returned if eigen ops exist.

SizeT nElem = dd.size();
long chunksize = nElem;
long nchunk = 1;
bool do_parallel = false;
GDL_NTHREADS=parallelize( nElem, TP_MEMORY_ACCESS);
GDL_NTHREADS=parallelize( nElem, TP_CPU_INTENSIVE);
if (GDL_NTHREADS > 1) { //no use start parallel threading for small numbers.
chunksize = nElem / GDL_NTHREADS;
nchunk = nElem / chunksize;
Expand Down Expand Up @@ -1485,7 +1491,7 @@ void Data_<Sp>::Reverse(DLong dim) { TRACE_ROUTINE(__FUNCTION__,__FILE__,__LINE_
if (this->dim[dim]%2) halfDim++;
SizeT outerStride = this->dim.Stride(dim + 1);
SizeT span=outerStride - revStride;
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS))==1) { //most frequent
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE))==1) { //most frequent
for (SizeT o = 0; o < nEl; o += outerStride) {
for (SizeT i = o; i < o+revStride; ++i) {
for (SizeT s = i, opp=span+i; s < halfDim+i ; s += revStride, opp-=revStride) {
Expand Down Expand Up @@ -1523,7 +1529,7 @@ BaseGDL* Data_<Sp>::DupReverse(DLong dim) { TRACE_ROUTINE(__FUNCTION__,__FILE__,
if (this->dim[dim]%2) halfDim++;
SizeT outerStride = this->dim.Stride(dim + 1);
SizeT span=outerStride - revStride;
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS))==1) { //most frequent
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE))==1) { //most frequent
for (SizeT o = 0; o < nEl; o += outerStride) {
for (SizeT i = o; i < o+revStride; ++i) {
for (SizeT s = i, opp=span+i; s < halfDim+i ; s += revStride, opp-=revStride) {
Expand Down Expand Up @@ -1563,7 +1569,7 @@ BaseGDL* Data_<SpDPtr>::DupReverse(DLong dim) {
if (this->dim[dim] % 2) halfDim++;
SizeT outerStride = this->dim.Stride(dim + 1);
SizeT span = outerStride - revStride;
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS)) == 1) { //most frequent
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE)) == 1) { //most frequent
for (SizeT o = 0; o < nEl; o += outerStride) {
for (SizeT i = o; i < o + revStride; ++i) {
for (SizeT s = i, opp = span + i; s < halfDim + i; s += revStride, opp -= revStride) {
Expand Down Expand Up @@ -1605,7 +1611,7 @@ BaseGDL* Data_<SpDObj>::DupReverse(DLong dim) {
if (this->dim[dim] % 2) halfDim++;
SizeT outerStride = this->dim.Stride(dim + 1);
SizeT span = outerStride - revStride;
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS)) == 1) { //most frequent
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE)) == 1) { //most frequent
for (SizeT o = 0; o < nEl; o += outerStride) {
for (SizeT i = o; i < o + revStride; ++i) {
for (SizeT s = i, opp = span + i; s < halfDim + i; s += revStride, opp -= revStride) {
Expand Down Expand Up @@ -3817,7 +3823,7 @@ void Data_<Sp>::CatInsert (const Data_* srcArr, const SizeT atDim, SizeT& at)
SizeT gap = this->dim.Stride (atDim + 1); // dest array

//GD: speed up by using indexing that permit parallel and collapse.
if ((GDL_NTHREADS=parallelize( len*nCp, TP_MEMORY_ACCESS))==1) { //most frequent
if ((GDL_NTHREADS=parallelize( len*nCp, TP_CPU_INTENSIVE))==1) { //most frequent
for (OMPInt c = 0; c < nCp; ++c) {
for (SizeT destIx = 0; destIx < len; destIx++) (*this)[destIx + destStart + c * gap] = (*srcArr)[ destIx + c * len];
}
Expand Down
16 changes: 13 additions & 3 deletions src/gdl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,14 +368,16 @@ int main(int argc, char *argv[])
cerr << " --sloppy Sets the traditional (default) compiling option where \"()\" can be used both with functions and arrays." << endl;
cerr << " Needed to counteract temporarily the effect of the enviromnment variable \"GDL_IS_FUSSY\"." << endl;
cerr << " --MAC Graphic device will be called 'MAC' on MacOSX. (default: 'X')" << endl;
cerr << " --no-use-wx Tells GDL not to use WxWidgets graphics." << endl;
cerr << " [--no-use-wx | -X] Tells GDL not to use WxWidgets graphics and resort to X11 (if available)." << endl;
cerr << " Also enabled by setting the environment variable GDL_DISABLE_WX_PLOTS to a non-null value." << endl;
cerr << " --notebook Force SVG-only device, used only when GDL is a Python Notebook Kernel." << endl;
cerr << " --widget-compat Tells GDL to use a default (rather ugly) fixed pitch font for compatiblity with IDL widgets." << endl;
cerr << " Also enabled by setting the environment variable GDL_WIDGET_COMPAT to a non-null value." << endl;
cerr << " Using this option may render some historical widgets unworkable (as they are based on fixed sizes)." << endl;
cerr << " Using this option may render some historical widgets more readable (as they are based on fixed sizes)." << endl;
cerr << " --no-dSFMT Tells GDL not to use double precision SIMD oriented Fast Mersenne Twister(dSFMT) for random doubles." << endl;
cerr << " Also disable by setting the environment variable GDL_NO_DSFMT to a non-null value." << endl;
cerr << " --with-eigen-transpose lets GDL use Eigen::transpose and related functions instead of our accelerated transpose function. Normally slower." <<endl;
cerr << " --smart-tpool switch to a mode where the number of threads is adaptive (experimental). Should enable better perfs on many core machines." <<endl;
#ifdef _WIN32
cerr << " --posix (Windows only): paths will be posix paths (experimental)." << endl;
#endif
Expand Down Expand Up @@ -483,10 +485,18 @@ int main(int argc, char *argv[])
{
usePlatformDeviceName = true;
}
else if (string(argv[a]) == "--no-use-wx")
else if (string(argv[a]) == "--no-use-wx" | string(argv[a]) == "-X")
{
force_no_wxgraphics = true;
}
else if (string(argv[a]) == "--with-eigen-transpose")
{
useEigenForTransposeOps = true;
}
else if (string(argv[a]) == "--smart-tpool")
{
useSmartTpool = true;
}
else if (string(argv[a]) == "--notebook")
{
iAmANotebook = true;
Expand Down
12 changes: 6 additions & 6 deletions src/math_fun_jmg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -927,7 +927,7 @@ namespace lib {
}

/* Double loop on the output image */
if ((GDL_NTHREADS=parallelize( nEl))==1) {
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
for (OMPInt j = 0; j < nRows; ++j) {
for (OMPInt i = 0; i < nCols; ++i) {
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
Expand Down Expand Up @@ -1027,7 +1027,7 @@ namespace lib {
}

/* Double loop on the output image */
if ((GDL_NTHREADS=parallelize( nEl))==1) {
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
for (OMPInt j = 0; j < nRows; ++j) {
for (OMPInt i = 0; i < nCols; ++i) {
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
Expand Down Expand Up @@ -1225,7 +1225,7 @@ namespace lib {
}

/* Double loop on the output image */
if ((GDL_NTHREADS=parallelize( nEl))==1) {
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
for (OMPInt j = 0; j < nRows; ++j) {
for (OMPInt i = 0; i < nCols; ++i) {
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
Expand Down Expand Up @@ -1373,7 +1373,7 @@ namespace lib {
}

/* Double loop on the output image */
if ((GDL_NTHREADS=parallelize( nEl))==1) {
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
for (OMPInt j = 0; j < nRows; ++j) {
for (OMPInt i = 0; i < nCols; ++i) {
// Compute the original source for this pixel, note order of j and i.
Expand Down Expand Up @@ -1485,7 +1485,7 @@ namespace lib {
}

/* Double loop on the output image */
if ((GDL_NTHREADS=parallelize( nEl))==1) {
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
for (OMPInt j = 0; j < nRows; ++j) {
for (OMPInt i = 0; i < nCols; ++i) {
// Compute the original source for this pixel, note order of j and i.
Expand Down Expand Up @@ -1691,7 +1691,7 @@ namespace lib {
}

/* Double loop on the output image */
if ((GDL_NTHREADS=parallelize( nEl))==1) {
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
for (OMPInt j = 0; j < nRows; ++j) {
for (OMPInt i = 0; i < nCols; ++i) {
// Compute the original source for this pixel, note order of j and i.
Expand Down
2 changes: 1 addition & 1 deletion src/minmax_include.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@


SizeT nElem = (stop - start) / step;
GDL_NTHREADS=parallelize( nElem);
GDL_NTHREADS=parallelize( nElem, TP_CPU_INTENSIVE);
//trap existence of ABSFUNC and create something that stands cppchekck useage (needed by contiunous integration scripts!)
#ifndef ABSFUNC
#define FUNCABS
Expand Down
4 changes: 4 additions & 0 deletions src/objects.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ volatile bool tryToMimicOriginalWidgets;
volatile bool useLocalDrivers;
//do we favor SIMD-accelerated random number generation?
volatile bool useDSFMTAcceleration;
//Transpose() operations are faster with our method, but setting this may test if this is still true for future Eigen:: versions or platforms.
volatile bool useEigenForTransposeOps=false;
//experimental TPOOL use adaptive number of threads.
volatile bool useSmartTpool=false;

void ResetObjects()
{
Expand Down
2 changes: 2 additions & 0 deletions src/objects.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ extern volatile bool useDSFMTAcceleration;
//do we use our own copy of (better?) drivers?
extern volatile bool useLocalDrivers;
extern volatile bool usePlatformDeviceName;
extern volatile bool useEigenForTransposeOps;
extern volatile bool useSmartTpool;
extern int debugMode;

enum DebugCode {
Expand Down

0 comments on commit ab2b22c

Please sign in to comment.