Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

supoort gpu expressions and further optimize ntt and merkle tree #869

Merged
merged 5 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "src/goldilocks"]
path = src/goldilocks
url = https://github.com/0xPolygonHermez/goldilocks.git
branch = "develop"
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#INFO := $(shell cd src/goldilocks && ./configure.sh && cd ../.. && sleep 2)
#include src/goldilocks/CudaArch.mk
include src/goldilocks/CudaArch.mk
NVCC := /usr/local/cuda/bin/nvcc

TARGET_ZKP := zkProver
Expand Down Expand Up @@ -70,7 +70,7 @@ GRPC_CPP_PLUGIN = grpc_cpp_plugin
GRPC_CPP_PLUGIN_PATH ?= `which $(GRPC_CPP_PLUGIN)`

INC_DIRS := $(shell find $(SRC_DIRS) -type d) $(sort $(dir))
INC_FLAGS := $(addprefix -I,$(INC_DIRS))
INC_FLAGS := $(addprefix -I,$(INC_DIRS)) -I/usr/local/cuda/include

SRCS_ZKP := $(shell find $(SRC_DIRS) ! -path "./src/fflonk_setup/fflonk_setup*" ! -path "./tools/starkpil/bctree/*" ! -path "./test/examples/*" ! -path "./test/expressions/*" ! -path "./test/prover/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/tests/*" ! -path "./src/main_generator/*" ! -path "./src/pols_generator/*" ! -path "./src/pols_diff/*" ! -path "./src/witness2db/*" \( -name *.cpp -or -name *.c -or -name *.asm -or -name *.cc \))
SRCS_ZKP_GPU := $(shell find $(SRC_DIRS) ! -path "./src/fflonk_setup/fflonk_setup*" ! -path "./tools/starkpil/bctree/*" ! -path "./test/examples/*" ! -path "./test/expressions/*" ! -path "./test/prover/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/tests/*" ! -path "./src/main_generator/*" ! -path "./src/pols_generator/*" ! -path "./src/pols_diff/*" ! -path "./src/witness2db/*" ! -path "./src/goldilocks/utils/deviceQuery.cu" \( -name *.cpp -or -name *.c -or -name *.asm -or -name *.cc -or -name *.cu \))
Expand Down Expand Up @@ -157,16 +157,16 @@ $(BUILD_DIR_GPU)/%.asm.o: %.asm
# c++ source
$(BUILD_DIR_GPU)/%.cpp.o: %.cpp
$(MKDIR_P) $(dir $@)
$(CXX) -D__USE_CUDA__ $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
$(CXX) -D__USE_CUDA__ -DENABLE_EXPERIMENTAL_CODE $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

$(BUILD_DIR_GPU)/%.cc.o: %.cc
$(MKDIR_P) $(dir $@)
$(CXX) -D__USE_CUDA__ $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
$(CXX) -D__USE_CUDA__ -DENABLE_EXPERIMENTAL_CODE $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

# cuda source
$(BUILD_DIR_GPU)/%.cu.o: %.cu
$(MKDIR_P) $(dir $@)
$(NVCC) -D__USE_CUDA__ $(INC_FLAGS) -Isrc/goldilocks/utils -Xcompiler -fopenmp -Xcompiler -fPIC -Xcompiler -mavx2 -Xcompiler -O3 -O3 -arch=$(CUDA_ARCH) -O3 $< -dc --output-file $@
$(NVCC) -D__USE_CUDA__ -DENABLE_EXPERIMENTAL_CODE $(INC_FLAGS) -Isrc/goldilocks/utils -Xcompiler -fopenmp -Xcompiler -fPIC -Xcompiler -mavx2 -Xcompiler -O3 -O3 -arch=$(CUDA_ARCH) -O3 $< -dc --output-file $@

main_generator: $(BUILD_DIR)/$(TARGET_MNG)

Expand Down
10 changes: 5 additions & 5 deletions src/config/definitions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,15 @@

#define NROWS_PACK 4

#ifdef __USE_CUDA__
//#define MULTI_ROM_TEST

//#define ENABLE_EXPERIMENTAL_CODE

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
#define TRANSPOSE_TMP_POLS false
#else
#define TRANSPOSE_TMP_POLS true
#endif

//#define MULTI_ROM_TEST

//#define ENABLE_EXPERIMENTAL_CODE


#endif
1 change: 1 addition & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ void runFileGenBatchProof(Goldilocks fr, Prover &prover, Config &config)

// Call the prover
prover.genBatchProof(&proverRequest);
exit(0);
}

void runFileGenAggregatedProof(Goldilocks fr, Prover &prover, Config &config)
Expand Down
27 changes: 17 additions & 10 deletions src/prover/prover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
#include "commit_pols_starks.hpp"
#include "chelpers_steps.hpp"
#include "chelpers_steps_pack.hpp"
#include "chelpers_steps_gpu.hpp"
#ifdef __AVX512__
#include "chelpers_steps_avx512.hpp"
#endif
Expand All @@ -54,6 +53,7 @@
#include "cuda_utils.hpp"
#include "ntt_goldilocks.hpp"
#include <pthread.h>
#include "chelpers_steps_gpu.cuh"

int asynctask(void* (*task)(void* args), void* arg)
{
Expand Down Expand Up @@ -113,6 +113,11 @@ Prover::Prover(Goldilocks &fr,

StarkInfo _starkInfo(config.zkevmStarkInfo, reduceMemoryZkevm);

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
warmup_gpu();
alloc_pinned_mem_per_device((1 << _starkInfo.starkStruct.nBitsExt) * 32);
#endif

// Allocate an area of memory, mapped to file, to store all the committed polynomials,
// and create them using the allocated address

Expand All @@ -136,11 +141,6 @@ Prover::Prover(Goldilocks &fr,
}
zklog.info("Prover::genBatchProof() successfully allocated " + to_string(polsSize) + " bytes");
}

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
alloc_pinned_mem(uint64_t(1<<24) * _starkInfo.mapSectionsN.section[eSection::cm1_n]);
warmup_gpu();
#endif

json finalVerkeyJson;
file2json(config.finalVerkey, finalVerkeyJson);
Expand Down Expand Up @@ -519,8 +519,11 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
json recursive2Verkey;
file2json(config.recursive2Verkey, recursive2Verkey);

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
Goldilocks::Element *publics = (Goldilocks::Element *)malloc_zkevm(starksRecursive1->starkInfo.nPublics);
#else
Goldilocks::Element publics[starksRecursive1->starkInfo.nPublics];

#endif
// oldStateRoot
publics[0] = cmPols.Main.B0[0];
publics[1] = cmPols.Main.B1[0];
Expand Down Expand Up @@ -601,7 +604,7 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
CHelpersStepsGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
#elif defined(__PACK__)
CHelpersStepsPack cHelpersSteps;
cHelpersSteps.nrowsPack = NROWS_PACK;
#else
Expand Down Expand Up @@ -727,6 +730,10 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
jProofRecursive1["publics"] = publicStarkJson;
json2file(jProofRecursive1, pProverRequest->filePrefix + "batch_proof.proof.json");
}

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
free_zkevm(publics);
#endif
TimerStopAndLog(SAVE_PROOF);
}

Expand Down Expand Up @@ -846,7 +853,7 @@ void Prover::genAggregatedProof(ProverRequest *pProverRequest)

if(USE_GENERIC_PARSER) {
#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsGPU cHelpersSteps;
CHelpersStepsGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
Expand Down Expand Up @@ -955,7 +962,7 @@ void Prover::genFinalProof(ProverRequest *pProverRequest)
FRIProofC12 fproofRecursiveF((1 << polBitsRecursiveF), FIELD_EXTENSION, starksRecursiveF->starkInfo.starkStruct.steps.size(), starksRecursiveF->starkInfo.evMap.size(), starksRecursiveF->starkInfo.nPublics);
if(USE_GENERIC_PARSER) {
#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsGPU cHelpersSteps;
CHelpersStepsGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
Expand Down
2 changes: 1 addition & 1 deletion src/starkpil/chelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,4 @@ void CHelpers::loadCHelpers(BinFileUtils::BinFile *cHelpersBin) {
}

cHelpersBin->endReadSection();
};
};
22 changes: 14 additions & 8 deletions src/starkpil/chelpers_steps.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#ifndef CHELPERS_STEPS_HPP
#define CHELPERS_STEPS_HPP
#include "chelpers.hpp"

#include "steps.hpp"
#include "definitions.hpp"

class CHelpersSteps {
public:
Expand Down Expand Up @@ -75,14 +75,19 @@ class CHelpersSteps {
bool isTmpPol = !domainExtended && s == 4;
for(uint64_t k = 0; k < nColsStages[s]; ++k) {
uint64_t dim = storePol[nColsStagesAcc[s] + k];
if(storePol[nColsStagesAcc[s] + k]) {
if(!TRANSPOSE_TMP_POLS) {
__m256i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
} else {
if(storePol[nColsStagesAcc[s] + k]) {
__m256i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
}
} else {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
} else {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
}
}
Expand Down Expand Up @@ -165,6 +170,7 @@ class CHelpersSteps {
uint8_t *storePol = &parserArgs.storePols[parserParams.storePolsOffset];

setBufferTInfo(starkInfo, parserParams.stage);

Goldilocks3::Element_avx challenges[params.challenges.degree()];
Goldilocks3::Element_avx challenges_ops[params.challenges.degree()];
for(uint64_t i = 0; i < params.challenges.degree(); ++i) {
Expand Down Expand Up @@ -703,4 +709,4 @@ class CHelpersSteps {
}
};

#endif
#endif
21 changes: 14 additions & 7 deletions src/starkpil/chelpers_steps_avx512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "chelpers.hpp"
#include "chelpers_steps.hpp"
#include "steps.hpp"
#include "definitions.hpp"

class CHelpersStepsAvx512 : public CHelpersSteps {
public:
Expand Down Expand Up @@ -75,14 +76,19 @@ class CHelpersStepsAvx512 : public CHelpersSteps {
bool isTmpPol = !domainExtended && s == 4;
for(uint64_t k = 0; k < nColsStages[s]; ++k) {
uint64_t dim = storePol[nColsStagesAcc[s] + k];
if(storePol[nColsStagesAcc[s] + k]) {
if(!TRANSPOSE_TMP_POLS) {
__m512i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
} else {
if(storePol[nColsStagesAcc[s] + k]) {
__m512i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
}
} else {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
} else {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
}
}
Expand Down Expand Up @@ -701,6 +707,7 @@ class CHelpersStepsAvx512 : public CHelpersSteps {
assert(i_args == parserParams.nArgs);
}
}

};

#endif
#endif
Loading
Loading