From 42b0d582b6f2fabbd233b0742952bb6e1ba7e85d Mon Sep 17 00:00:00 2001 From: fmoletta <99273364+fmoletta@users.noreply.github.com> Date: Fri, 14 Jul 2023 16:14:26 +0300 Subject: [PATCH 1/7] bugfix: Fix `UnsignedInteger::bits_le` + make it pub (#505) * bugfix: Fix UnsignedInteger::bits_le + make it pub * fmt --- math/src/unsigned_integer/element.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/math/src/unsigned_integer/element.rs b/math/src/unsigned_integer/element.rs index 471e4f88f..66631f2e3 100644 --- a/math/src/unsigned_integer/element.rs +++ b/math/src/unsigned_integer/element.rs @@ -790,14 +790,16 @@ impl UnsignedInteger { #[inline(always)] /// Returns the number of bits needed to represent the number as little endian - const fn bits_le(&self) -> usize { + pub const fn bits_le(&self) -> usize { let mut i = 0; - while i < NUM_LIMBS && self.limbs[i] == 0 { + while i < NUM_LIMBS { + if self.limbs[i] != 0 { + return u64::BITS as usize * (NUM_LIMBS - i) + - self.limbs[i].leading_zeros() as usize; + } i += 1; } - - let limb = self.limbs[i]; - u64::BITS as usize * (NUM_LIMBS - i) - limb.leading_zeros() as usize + 0 } /// Computes self / rhs, returns the quotient, remainder. From a3edf1e991d54eb3f658de8b8dbe368a9fbf7e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Est=C3=A9fano=20Bargas?= Date: Fri, 14 Jul 2023 18:08:40 -0300 Subject: [PATCH 2/7] Fix CUDA and Metal FFT reaching block/threadgroup limit (#504) * Change metal FFT grid layout * Optimized div and mod ops from metal FFT * Change CUDA FFT grid layout * Added metal FFT with large input test * Added CUDA FFT with large input test * Add comment in Metal FFT kernel * Fix CUDA FFT * Fix CUDA test copy-paste errors * Avoid spawning an extra CUDA block * Fix CUDA fft template instantiation * Fix &usize not impl DeviceRepr * Added more params * Fix param types * Fix clippy * Fix types * Fix types * Fix typo * Fix typo * Pass by value some cuda fft params * Remove errors from launch * Fix cuda fft op * Fixed CUDA operations * Refactor warpsize constant --- math/src/fft/gpu/cuda/ops.rs | 29 +- math/src/fft/gpu/cuda/state.rs | 70 +- math/src/fft/gpu/metal/ops.rs | 29 +- .../cuda/shaders/fft/bitrev_permutation.cuh | 9 +- math/src/gpu/cuda/shaders/fft/fft.cuh | 31 +- math/src/gpu/cuda/shaders/fft/twiddles.cuh | 17 +- math/src/gpu/cuda/shaders/field/stark256.cu | 22 +- math/src/gpu/cuda/shaders/field/stark256.ptx | 5082 +++++++++++++---- math/src/gpu/metal/shaders/fft/fft.h.metal | 30 +- .../gpu/metal/shaders/field/stark256.h.metal | 8 +- 10 files changed, 4152 insertions(+), 1175 deletions(-) diff --git a/math/src/fft/gpu/cuda/ops.rs b/math/src/fft/gpu/cuda/ops.rs index d9b75ae52..fb2645ab7 100644 --- a/math/src/fft/gpu/cuda/ops.rs +++ b/math/src/fft/gpu/cuda/ops.rs @@ -25,12 +25,16 @@ where { let mut function = state.get_radix2_dit_butterfly(input, twiddles)?; + const WARP_SIZE: usize = 32; + + let block_size = WARP_SIZE; + let butterfly_count = input.len() / 2; + let block_count = (butterfly_count + block_size - 1) / block_size; + let order = input.len().trailing_zeros(); - for stage in 0..order { - let group_count = 1 << stage; - let group_size = input.len() / group_count; - function.launch(group_count, group_size)?; + for stage in 0..order { + function.launch(block_count, block_size, stage, butterfly_count as u32)?; } let output = function.retrieve_result()?; @@ -67,7 +71,7 @@ pub fn bitrev_permutation( ) -> Result>, CudaError> { let mut function = state.get_bitrev_permutation(&input, &input)?; - 
function.launch(input.len())?; + function.launch()?; function.retrieve_result() } @@ -116,6 +120,21 @@ mod tests { } } + #[test] + fn test_cuda_fft_matches_sequential_large_input() { + const ORDER: usize = 20; + let input = vec![FE::one(); 1 << ORDER]; + + let state = CudaState::new().unwrap(); + let order = input.len().trailing_zeros(); + let twiddles = get_twiddles(order.into(), RootsConfig::BitReverse).unwrap(); + + let cuda_result = fft(&input, &twiddles, &state).unwrap(); + let sequential_result = crate::fft::cpu::ops::fft(&input, &twiddles).unwrap(); + + assert_eq!(&cuda_result, &sequential_result); + } + #[test] fn gen_twiddles_with_order_greater_than_63_should_fail() { let state = CudaState::new().unwrap(); diff --git a/math/src/fft/gpu/cuda/state.rs b/math/src/fft/gpu/cuda/state.rs index 17dad9ff1..c3c53180b 100644 --- a/math/src/fft/gpu/cuda/state.rs +++ b/math/src/fft/gpu/cuda/state.rs @@ -16,6 +16,7 @@ use lambdaworks_gpu::cuda::abstractions::errors::CudaError; use std::sync::Arc; const STARK256_PTX: &str = include_str!("../../../gpu/cuda/shaders/field/stark256.ptx"); +const WARP_SIZE: usize = 32; // the implementation will spawn threadblocks of this size. /// Structure for abstracting basic calls to a CUDA device and saving the state. Used for /// implementing GPU parallel computations in CUDA. @@ -166,23 +167,13 @@ impl Radix2DitButterflyFunction { pub(crate) fn launch( &mut self, - group_count: usize, - group_size: usize, + block_count: usize, + block_size: usize, + stage: u32, + butterfly_count: u32, ) -> Result<(), CudaError> { - let grid_dim = (group_count as u32, 1, 1); // in blocks - let block_dim = ((group_size / 2) as u32, 1, 1); - - if block_dim.0 as usize > DeviceSlice::len(&self.twiddles) { - return Err(CudaError::IndexOutOfBounds( - block_dim.0 as usize, - self.twiddles.len(), - )); - } else if (grid_dim.0 * block_dim.0) as usize > DeviceSlice::len(&self.input) { - return Err(CudaError::IndexOutOfBounds( - (grid_dim.0 * block_dim.0) as usize, - self.input.len(), - )); - } + let grid_dim = (block_count as u32, 1, 1); // in blocks + let block_dim = (block_size as u32, 1, 1); let config = LaunchConfig { grid_dim, @@ -193,9 +184,10 @@ impl Radix2DitButterflyFunction { // Calling a kernel is similar to calling a foreign-language function, // as the kernel itself could be written in C or unsafe Rust. 
unsafe { - self.function - .clone() - .launch(config, (&mut self.input, &self.twiddles)) + self.function.clone().launch( + config, + (&mut self.input, &self.twiddles, stage, butterfly_count), + ) } .map_err(|err| CudaError::Launch(err.to_string())) } @@ -235,16 +227,12 @@ impl CalcTwiddlesFunction { } } - pub(crate) fn launch(&mut self, group_size: usize) -> Result<(), CudaError> { - let grid_dim = (1, 1, 1); // in blocks - let block_dim = (group_size as u32, 1, 1); + pub(crate) fn launch(&mut self, count: usize) -> Result<(), CudaError> { + let block_size = WARP_SIZE; + let block_count = (count + block_size - 1) / block_size; - if block_dim.0 as usize > DeviceSlice::len(&self.twiddles) { - return Err(CudaError::IndexOutOfBounds( - block_dim.0 as usize, - self.twiddles.len(), - )); - } + let grid_dim = (block_count as u32, 1, 1); // in blocks + let block_dim = (block_size as u32, 1, 1); let config = LaunchConfig { grid_dim, @@ -257,7 +245,7 @@ impl CalcTwiddlesFunction { unsafe { self.function .clone() - .launch(config, (&mut self.twiddles, &self.omega)) + .launch(config, (&mut self.twiddles, &self.omega, count as u32)) } .map_err(|err| CudaError::Launch(err.to_string())) } @@ -299,21 +287,13 @@ impl BitrevPermutationFunction { } } - pub(crate) fn launch(&mut self, group_size: usize) -> Result<(), CudaError> { - let grid_dim = (1, 1, 1); // in blocks - let block_dim = (group_size as u32, 1, 1); - - if block_dim.0 as usize > DeviceSlice::len(&self.input) { - return Err(CudaError::IndexOutOfBounds( - block_dim.0 as usize, - self.input.len(), - )); - } else if block_dim.0 as usize > DeviceSlice::len(&self.result) { - return Err(CudaError::IndexOutOfBounds( - block_dim.0 as usize, - self.result.len(), - )); - } + pub(crate) fn launch(&mut self) -> Result<(), CudaError> { + let len = self.input.len(); + let block_size = WARP_SIZE; + let block_count = (len + block_size - 1) / block_size; + + let grid_dim = (block_count as u32, 1, 1); // in blocks + let block_dim = (block_size as u32, 1, 1); let config = LaunchConfig { grid_dim, @@ -326,7 +306,7 @@ impl BitrevPermutationFunction { unsafe { self.function .clone() - .launch(config, (&mut self.input, &self.result)) + .launch(config, (&mut self.input, &self.result, len)) } .map_err(|err| CudaError::Launch(err.to_string())) } diff --git a/math/src/fft/gpu/metal/ops.rs b/math/src/fft/gpu/metal/ops.rs index 288656fcc..c9d7131f2 100644 --- a/math/src/fft/gpu/metal/ops.rs +++ b/math/src/fft/gpu/metal/ops.rs @@ -34,17 +34,18 @@ pub fn fft( objc::rc::autoreleasepool(|| { let (command_buffer, command_encoder) = state.setup_command( &pipeline, - Some(&[(0, &input_buffer), (1, &twiddles_buffer)]), + Some(&[(0, &input_buffer), (1, &twiddles_buffer)]), // index 2 is stage ); let order = input.len().trailing_zeros(); for stage in 0..order { - let group_count = 1 << stage; - let group_size = input.len() as u64 / group_count; + command_encoder.set_bytes(2, mem::size_of_val(&stage) as u64, void_ptr(&stage)); - let threadgroup_size = MTLSize::new(group_size / 2, 1, 1); - let threadgroup_count = MTLSize::new(group_count, 1, 1); - command_encoder.dispatch_thread_groups(threadgroup_count, threadgroup_size); + let grid_size = MTLSize::new(input.len() as u64 / 2, 1, 1); // one thread per butterfly + let threadgroup_size = MTLSize::new(pipeline.thread_execution_width(), 1, 1); + + // WARN: Device should support non-uniform threadgroups (Metal3 and Apple4 or latter). 
+ command_encoder.dispatch_threads(grid_size, threadgroup_size); } command_encoder.end_encoding(); @@ -181,6 +182,22 @@ mod tests { } } + // May want to modify the order constant, takes ~5s to run on a M1. + #[test] + fn test_metal_fft_matches_sequential_large_input() { + const ORDER: usize = 20; + let input = vec![FE::one(); 1 << ORDER]; + + let metal_state = MetalState::new(None).unwrap(); + let order = input.len().trailing_zeros(); + let twiddles = get_twiddles(order.into(), RootsConfig::BitReverse).unwrap(); + + let metal_result = super::fft(&input, &twiddles, &metal_state).unwrap(); + let sequential_result = crate::fft::cpu::ops::fft(&input, &twiddles).unwrap(); + + assert_eq!(&metal_result, &sequential_result); + } + #[test] fn gen_twiddles_with_order_greater_than_63_should_fail() { let metal_state = MetalState::new(None).unwrap(); diff --git a/math/src/gpu/cuda/shaders/fft/bitrev_permutation.cuh b/math/src/gpu/cuda/shaders/fft/bitrev_permutation.cuh index 142f75955..4c6626248 100644 --- a/math/src/gpu/cuda/shaders/fft/bitrev_permutation.cuh +++ b/math/src/gpu/cuda/shaders/fft/bitrev_permutation.cuh @@ -3,10 +3,11 @@ #include "../utils.h" template -inline __device__ void _bitrev_permutation(const Fp *input, Fp *result) +inline __device__ void _bitrev_permutation(const Fp *input, Fp *result, const int len) { - unsigned index = threadIdx.x; - unsigned size = blockDim.x; + unsigned thread_pos = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_pos >= len) return; + // TODO: guard is not needed for inputs of len >=block_size * 2, if len is pow of two - result[index] = input[reverse_index(index, size)]; + result[thread_pos] = input[reverse_index(thread_pos, len)]; }; diff --git a/math/src/gpu/cuda/shaders/fft/fft.cuh b/math/src/gpu/cuda/shaders/fft/fft.cuh index 09e9ac660..6768756e9 100644 --- a/math/src/gpu/cuda/shaders/fft/fft.cuh +++ b/math/src/gpu/cuda/shaders/fft/fft.cuh @@ -2,21 +2,28 @@ template inline __device__ void _radix2_dit_butterfly(Fp *input, - const Fp *twiddles) + const Fp *twiddles, + const int stage, + const int butterfly_count) { - int group = blockIdx.x; - int pos_in_group = threadIdx.x; - int half_group_size = blockDim.x; + int thread_pos = blockDim.x * blockIdx.x + threadIdx.x; - int i = group * half_group_size * 2 + pos_in_group; + if (thread_pos >= butterfly_count) return; + // TODO: guard is not needed for inputs of len >=block_size * 2, only if len is pow of two - Fp w = twiddles[group]; - Fp a = input[i]; - Fp b = input[i + half_group_size]; + int half_group_size = butterfly_count >> stage; + int group = thread_pos / half_group_size; - Fp res_1 = a + w * b; - Fp res_2 = a - w * b; + int pos_in_group = thread_pos & (half_group_size - 1); + int i = thread_pos * 2 - pos_in_group; // multiply quotient by 2 - input[i] = res_1; // --\/-- - input[i + half_group_size] = res_2; // --/\-- + Fp w = twiddles[group]; + Fp a = input[i]; + Fp b = input[i + half_group_size]; + + Fp res_1 = a + w * b; + Fp res_2 = a - w * b; + + input[i] = res_1; // --\/-- + input[i + half_group_size] = res_2; // --/\-- }; diff --git a/math/src/gpu/cuda/shaders/fft/twiddles.cuh b/math/src/gpu/cuda/shaders/fft/twiddles.cuh index 04071f7f1..925bf71ac 100644 --- a/math/src/gpu/cuda/shaders/fft/twiddles.cuh +++ b/math/src/gpu/cuda/shaders/fft/twiddles.cuh @@ -4,23 +4,26 @@ // NOTE: In order to calculate the inverse twiddles, call with _omega = _omega.inverse() template -inline __device__ void _calc_twiddles(Fp *result, const Fp &_omega) +inline __device__ void _calc_twiddles(Fp *result, const Fp 
&_omega, const int count) { - int index = threadIdx.x; + unsigned thread_pos = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_pos >= count) return; + // TODO: guard is not needed for count >=block_size * 2, if count is pow of two Fp omega = _omega; - result[index] = omega.pow((unsigned)index); + result[thread_pos] = omega.pow(thread_pos); }; // NOTE: In order to calculate the inverse twiddles, call with _omega = _omega.inverse() template -inline __device__ void _calc_twiddles_bitrev(Fp *result, const Fp &_omega) +inline __device__ void _calc_twiddles_bitrev(Fp *result, const Fp &_omega, const int count) { - int index = threadIdx.x; - int size = blockDim.x; + unsigned thread_pos = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_pos >= count) return; + // TODO: guard is not needed for count >=block_size * 2, if count is pow of two Fp omega = _omega; - result[index] = omega.pow(reverse_index((unsigned)index, (unsigned)size)); + result[thread_pos] = omega.pow(reverse_index(thread_pos, count)); }; diff --git a/math/src/gpu/cuda/shaders/field/stark256.cu b/math/src/gpu/cuda/shaders/field/stark256.cu index 078865d47..dfbae8388 100644 --- a/math/src/gpu/cuda/shaders/field/stark256.cu +++ b/math/src/gpu/cuda/shaders/field/stark256.cu @@ -20,26 +20,32 @@ namespace p256 extern "C" { - __global__ void radix2_dit_butterfly(p256::Fp *input, const p256::Fp *twiddles) + __global__ void radix2_dit_butterfly( p256::Fp *input, + const p256::Fp *twiddles, + const int stage, + const int butterfly_count) { - _radix2_dit_butterfly(input, twiddles); + _radix2_dit_butterfly(input, twiddles, stage, butterfly_count); } // NOTE: In order to calculate the inverse twiddles, call with _omega = _omega.inverse() - __global__ void calc_twiddles(p256::Fp *result, const p256::Fp &_omega) + __global__ void calc_twiddles(p256::Fp *result, const p256::Fp &_omega, const int count) { - _calc_twiddles(result, _omega); + _calc_twiddles(result, _omega, count); }; // NOTE: In order to calculate the inverse twiddles, call with _omega = _omega.inverse() - __global__ void calc_twiddles_bitrev(p256::Fp *result, const p256::Fp &_omega) + __global__ void calc_twiddles_bitrev(p256::Fp *result, + const p256::Fp &_omega, + const int count) { - _calc_twiddles_bitrev(result, _omega); + _calc_twiddles_bitrev(result, _omega, count); }; __global__ void bitrev_permutation( const p256::Fp *input, - p256::Fp *result + p256::Fp *result, + const int len ) { - _bitrev_permutation(input, result); + _bitrev_permutation(input, result, len); }; } diff --git a/math/src/gpu/cuda/shaders/field/stark256.ptx b/math/src/gpu/cuda/shaders/field/stark256.ptx index 45309b87c..30227ef35 100644 --- a/math/src/gpu/cuda/shaders/field/stark256.ptx +++ b/math/src/gpu/cuda/shaders/field/stark256.ptx @@ -1,12 +1,12 @@ // // Generated by NVIDIA NVVM Compiler // -// Compiler Build ID: CL-30521435 -// Cuda compilation tools, release 11.4, V11.4.152 +// Compiler Build ID: CL-32965470 +// Cuda compilation tools, release 12.2, V12.2.91 // Based on NVVM 7.0.1 // -.version 7.4 +.version 8.2 .target sm_52 .address_size 64 @@ -14,1090 +14,4030 @@ .visible .entry radix2_dit_butterfly( .param .u64 radix2_dit_butterfly_param_0, - .param .u64 radix2_dit_butterfly_param_1 + .param .u64 radix2_dit_butterfly_param_1, + .param .u32 radix2_dit_butterfly_param_2, + .param .u32 radix2_dit_butterfly_param_3 ) { - .reg .pred %p<295>; - .reg .b32 %r<13>; - .reg .b64 %rd<770>; + .reg .pred %p<312>; + .reg .b32 %r<26>; + .reg .b64 %rd<705>; - ld.param.u64 %rd101, 
[radix2_dit_butterfly_param_0]; - ld.param.u64 %rd102, [radix2_dit_butterfly_param_1]; - cvta.to.global.u64 %rd103, %rd101; - mov.u32 %r1, %ntid.x; - shl.b32 %r2, %r1, 1; - mov.u32 %r3, %ctaid.x; - mov.u32 %r4, %tid.x; - mad.lo.s32 %r5, %r2, %r3, %r4; - cvta.to.global.u64 %rd104, %rd102; - mul.wide.s32 %rd105, %r3, 32; - add.s64 %rd106, %rd104, %rd105; - mul.wide.s32 %rd107, %r5, 32; - add.s64 %rd1, %rd103, %rd107; + ld.param.u64 %rd109, [radix2_dit_butterfly_param_0]; + ld.param.u64 %rd110, [radix2_dit_butterfly_param_1]; + ld.param.u32 %r2, [radix2_dit_butterfly_param_2]; + ld.param.u32 %r3, [radix2_dit_butterfly_param_3]; + mov.u32 %r4, %ctaid.x; + mov.u32 %r5, %ntid.x; + mov.u32 %r6, %tid.x; + mad.lo.s32 %r1, %r5, %r4, %r6; + setp.ge.s32 %p13, %r1, %r3; + @%p13 bra $L__BB0_16; + + cvta.to.global.u64 %rd112, %rd109; + shr.s32 %r7, %r3, %r2; + div.s32 %r8, %r1, %r7; + add.s32 %r9, %r7, -1; + and.b32 %r10, %r9, %r1; + shl.b32 %r11, %r1, 1; + sub.s32 %r12, %r11, %r10; + cvta.to.global.u64 %rd113, %rd110; + mul.wide.s32 %rd114, %r8, 32; + add.s64 %rd115, %rd113, %rd114; + mul.wide.s32 %rd116, %r12, 32; + add.s64 %rd1, %rd112, %rd116; ld.global.u64 %rd2, [%rd1]; - mov.u64 %rd108, 0; + mov.u64 %rd117, 0; ld.global.u64 %rd3, [%rd1+8]; ld.global.u64 %rd4, [%rd1+16]; ld.global.u64 %rd5, [%rd1+24]; - shl.b32 %r6, %r1, 2; - mul.wide.s32 %rd109, %r6, 8; - add.s64 %rd6, %rd1, %rd109; - ld.global.u64 %rd110, [%rd6+8]; - mul.hi.u64 %rd7, %rd108, %rd110; - ld.global.u64 %rd111, [%rd106+8]; - mul.hi.u64 %rd112, %rd111, %rd108; - mul.hi.u64 %rd113, %rd111, %rd110; - mul.lo.s64 %rd114, %rd110, %rd111; - ld.global.u64 %rd115, [%rd106]; - mul.hi.u64 %rd116, %rd115, %rd108; - mul.hi.u64 %rd117, %rd115, %rd110; - mul.lo.s64 %rd118, %rd110, %rd115; - ld.global.u64 %rd119, [%rd6]; - mul.hi.u64 %rd120, %rd111, %rd119; - mul.hi.u64 %rd8, %rd108, %rd119; - mul.lo.s64 %rd121, %rd119, %rd111; - add.s64 %rd122, %rd120, %rd112; - add.s64 %rd123, %rd122, %rd8; - add.s64 %rd124, %rd7, %rd121; - add.s64 %rd125, %rd124, %rd112; - add.s64 %rd126, %rd125, %rd113; - setp.lt.u64 %p10, %rd126, %rd121; - selp.u64 %rd127, 1, 0, %p10; - mul.hi.u64 %rd9, %rd115, %rd119; - mul.lo.s64 %rd128, %rd119, %rd115; - mul.hi.u64 %rd10, %rd108, %rd108; - add.s64 %rd129, %rd112, %rd10; - add.s64 %rd130, %rd129, %rd112; - add.s64 %rd131, %rd126, %rd118; - setp.lt.u64 %p11, %rd131, %rd126; - selp.u64 %rd132, 1, 0, %p11; - add.s64 %rd133, %rd132, %rd127; - add.s64 %rd134, %rd7, %rd128; - add.s64 %rd135, %rd134, %rd116; - add.s64 %rd136, %rd135, %rd117; - setp.lt.u64 %p12, %rd136, %rd128; - selp.u64 %rd11, 1, 0, %p12; - add.s64 %rd137, %rd123, %rd136; - setp.lt.u64 %p13, %rd137, %rd123; - selp.u64 %rd12, 1, 0, %p13; - add.s64 %rd138, %rd133, %rd137; - setp.lt.u64 %p14, %rd138, %rd133; - selp.u64 %rd13, 1, 0, %p14; - ld.global.u64 %rd139, [%rd6+24]; - mul.hi.u64 %rd140, %rd108, %rd139; - add.s64 %rd141, %rd140, %rd10; - add.s64 %rd142, %rd141, %rd140; - mul.hi.u64 %rd143, %rd111, %rd139; - mul.lo.s64 %rd144, %rd139, %rd111; - mul.hi.u64 %rd145, %rd115, %rd139; - mul.lo.s64 %rd146, %rd139, %rd115; - ld.global.u64 %rd147, [%rd6+16]; - mul.hi.u64 %rd148, %rd111, %rd147; - mul.hi.u64 %rd149, %rd108, %rd147; - mul.lo.s64 %rd150, %rd147, %rd111; - add.s64 %rd151, %rd148, %rd112; - add.s64 %rd152, %rd151, %rd149; - add.s64 %rd153, %rd112, %rd150; - add.s64 %rd154, %rd153, %rd140; - add.s64 %rd155, %rd154, %rd143; - setp.lt.u64 %p15, %rd155, %rd150; - selp.u64 %rd156, 1, 0, %p15; - mul.hi.u64 %rd157, %rd115, %rd147; - mul.lo.s64 %rd158, %rd147, %rd115; - 
add.s64 %rd159, %rd155, %rd146; - setp.lt.u64 %p16, %rd159, %rd155; - selp.u64 %rd160, 1, 0, %p16; - add.s64 %rd161, %rd160, %rd156; - add.s64 %rd162, %rd116, %rd158; - add.s64 %rd163, %rd162, %rd140; - add.s64 %rd164, %rd163, %rd145; - setp.lt.u64 %p17, %rd164, %rd158; - selp.u64 %rd165, 1, 0, %p17; - add.s64 %rd166, %rd152, %rd164; - setp.lt.u64 %p18, %rd166, %rd152; - selp.u64 %rd167, 1, 0, %p18; - add.s64 %rd168, %rd161, %rd166; - setp.lt.u64 %p19, %rd168, %rd161; - selp.u64 %rd169, 1, 0, %p19; - ld.global.u64 %rd170, [%rd106+24]; - mul.hi.u64 %rd171, %rd108, %rd170; - mul.hi.u64 %rd172, %rd110, %rd108; - mul.hi.u64 %rd173, %rd110, %rd170; - mul.lo.s64 %rd174, %rd110, %rd170; - mul.hi.u64 %rd175, %rd119, %rd108; - mul.hi.u64 %rd176, %rd119, %rd170; - mul.lo.s64 %rd177, %rd119, %rd170; - ld.global.u64 %rd178, [%rd106+16]; - mul.hi.u64 %rd179, %rd110, %rd178; - mul.hi.u64 %rd180, %rd108, %rd178; - mul.lo.s64 %rd181, %rd110, %rd178; - add.s64 %rd182, %rd179, %rd172; - add.s64 %rd183, %rd182, %rd180; - add.s64 %rd184, %rd171, %rd181; - add.s64 %rd185, %rd184, %rd172; - add.s64 %rd186, %rd185, %rd173; - setp.lt.u64 %p20, %rd186, %rd181; - selp.u64 %rd187, 1, 0, %p20; - mul.hi.u64 %rd188, %rd119, %rd178; - mul.lo.s64 %rd189, %rd119, %rd178; - add.s64 %rd190, %rd186, %rd177; - setp.lt.u64 %p21, %rd190, %rd186; - selp.u64 %rd191, 1, 0, %p21; - add.s64 %rd192, %rd191, %rd187; - add.s64 %rd193, %rd171, %rd189; - add.s64 %rd194, %rd193, %rd175; - add.s64 %rd195, %rd194, %rd176; - setp.lt.u64 %p22, %rd195, %rd189; - selp.u64 %rd196, 1, 0, %p22; - add.s64 %rd197, %rd183, %rd195; - setp.lt.u64 %p23, %rd197, %rd183; - selp.u64 %rd198, 1, 0, %p23; - add.s64 %rd199, %rd192, %rd197; - setp.lt.u64 %p24, %rd199, %rd192; - selp.u64 %rd200, 1, 0, %p24; - mul.hi.u64 %rd201, %rd170, %rd108; - mul.hi.u64 %rd202, %rd170, %rd139; - mul.lo.s64 %rd203, %rd139, %rd170; - mul.hi.u64 %rd204, %rd178, %rd108; - mul.hi.u64 %rd205, %rd178, %rd139; - mul.lo.s64 %rd206, %rd139, %rd178; - mul.hi.u64 %rd207, %rd170, %rd147; - mul.lo.s64 %rd208, %rd147, %rd170; - add.s64 %rd209, %rd201, %rd149; - add.s64 %rd210, %rd209, %rd207; - add.s64 %rd211, %rd140, %rd208; - add.s64 %rd212, %rd211, %rd201; - add.s64 %rd213, %rd212, %rd202; - setp.lt.u64 %p25, %rd213, %rd208; - selp.u64 %rd214, 1, 0, %p25; - mul.hi.u64 %rd215, %rd178, %rd147; - mul.lo.s64 %rd216, %rd147, %rd178; - add.s64 %rd217, %rd213, %rd206; - setp.lt.u64 %p26, %rd217, %rd213; - neg.s64 %rd218, %rd217; - selp.u64 %rd219, 1, 0, %p26; - add.s64 %rd220, %rd219, %rd214; - add.s64 %rd221, %rd140, %rd216; - add.s64 %rd222, %rd221, %rd204; - add.s64 %rd223, %rd222, %rd205; - setp.lt.u64 %p27, %rd223, %rd216; - selp.u64 %rd224, 1, 0, %p27; - add.s64 %rd225, %rd210, %rd223; - setp.lt.u64 %p28, %rd225, %rd210; - selp.u64 %rd226, 1, 0, %p28; - add.s64 %rd227, %rd220, %rd225; - setp.lt.u64 %p29, %rd227, %rd220; - selp.u64 %rd228, 1, 0, %p29; - add.s64 %rd229, %rd144, %rd174; - setp.lt.u64 %p30, %rd229, %rd144; - selp.u64 %rd230, 1, 0, %p30; - add.s64 %rd231, %rd159, %rd230; - add.s64 %rd232, %rd231, %rd190; - setp.eq.s64 %p31, %rd232, %rd159; - and.pred %p32, %p30, %p31; - setp.lt.u64 %p33, %rd232, %rd159; - or.pred %p34, %p33, %p32; - selp.u64 %rd233, 1, 0, %p34; - add.s64 %rd234, %rd227, %rd229; - setp.lt.u64 %p35, %rd234, %rd227; - selp.u64 %rd235, 1, 0, %p35; - add.s64 %rd236, %rd142, %rd10; - add.s64 %rd237, %rd236, %rd149; - add.s64 %rd238, %rd237, %rd232; - add.s64 %rd239, %rd238, %rd201; - add.s64 %rd240, %rd239, %rd201; - add.s64 %rd241, %rd240, %rd204; - add.s64 %rd242, 
%rd241, %rd215; - add.s64 %rd243, %rd242, %rd224; - add.s64 %rd244, %rd243, %rd226; - add.s64 %rd245, %rd244, %rd228; - add.s64 %rd246, %rd245, %rd235; - setp.eq.s64 %p36, %rd246, %rd232; - and.pred %p37, %p35, %p36; - setp.lt.u64 %p38, %rd246, %rd232; - or.pred %p39, %p38, %p37; - selp.u64 %rd247, 1, 0, %p39; - add.s64 %rd248, %rd168, %rd114; - setp.lt.u64 %p40, %rd248, %rd168; - selp.u64 %rd249, 1, 0, %p40; - add.s64 %rd14, %rd130, %rd116; - add.s64 %rd250, %rd14, %rd131; - add.s64 %rd251, %rd250, %rd142; - add.s64 %rd252, %rd251, %rd149; - add.s64 %rd253, %rd252, %rd157; - add.s64 %rd254, %rd253, %rd165; - add.s64 %rd255, %rd254, %rd167; - add.s64 %rd256, %rd255, %rd169; - add.s64 %rd257, %rd256, %rd249; - setp.eq.s64 %p41, %rd257, %rd131; - and.pred %p42, %p40, %p41; - setp.lt.u64 %p43, %rd257, %rd131; - or.pred %p44, %p43, %p42; - selp.u64 %rd258, 1, 0, %p44; - add.s64 %rd259, %rd138, %rd258; - setp.lt.u64 %p45, %rd259, %rd138; - selp.u64 %rd15, 1, 0, %p45; - add.s64 %rd18, %rd199, %rd248; - setp.lt.u64 %p46, %rd18, %rd199; - selp.u64 %rd260, 1, 0, %p46; - shl.b64 %rd261, %rd10, 1; - mov.u64 %rd764, 1; - add.s64 %rd262, %rd171, %rd261; - add.s64 %rd263, %rd262, %rd171; - add.s64 %rd264, %rd263, %rd172; - add.s64 %rd265, %rd264, %rd172; - add.s64 %rd266, %rd265, %rd175; - add.s64 %rd267, %rd257, %rd266; - add.s64 %rd268, %rd267, %rd180; - add.s64 %rd269, %rd268, %rd188; - add.s64 %rd270, %rd269, %rd196; - add.s64 %rd271, %rd270, %rd198; - add.s64 %rd272, %rd271, %rd200; - add.s64 %rd16, %rd272, %rd260; - setp.eq.s64 %p47, %rd16, %rd257; - and.pred %p48, %p46, %p47; - setp.lt.u64 %p49, %rd16, %rd257; - or.pred %p50, %p49, %p48; - selp.u64 %rd273, 1, 0, %p50; - add.s64 %rd17, %rd259, %rd273; - add.s64 %rd274, %rd18, %rd233; - add.s64 %rd19, %rd274, %rd247; - mul.lo.s64 %rd275, %rd203, 576460752303423504; - mul.hi.u64 %rd276, %rd234, %rd108; - mov.u64 %rd277, -1; - mul.hi.u64 %rd278, %rd234, %rd277; - mul.hi.u64 %rd279, %rd203, %rd108; - mul.hi.u64 %rd280, %rd203, %rd277; - neg.s64 %rd281, %rd203; - mul.hi.u64 %rd20, %rd108, %rd277; - add.s64 %rd282, %rd279, %rd20; - add.s64 %rd283, %rd282, %rd280; - mul.hi.u64 %rd284, %rd217, %rd108; - mul.hi.u64 %rd285, %rd217, %rd277; - sub.s64 %rd286, %rd283, %rd203; - setp.lt.u64 %p51, %rd286, %rd281; - selp.u64 %rd287, 1, 0, %p51; - sub.s64 %rd288, %rd20, %rd217; - add.s64 %rd289, %rd288, %rd284; - add.s64 %rd290, %rd289, %rd285; - sub.s64 %rd291, %rd286, %rd217; - setp.lt.u64 %p52, %rd291, %rd286; - selp.u64 %rd292, 1, 0, %p52; - add.s64 %rd293, %rd292, %rd287; - setp.lt.u64 %p53, %rd290, %rd218; - selp.u64 %rd294, 1, 0, %p53; - add.s64 %rd295, %rd290, %rd286; - setp.lt.u64 %p54, %rd295, %rd290; - selp.u64 %rd296, 1, 0, %p54; - sub.s64 %rd297, %rd295, %rd234; - setp.lt.u64 %p55, %rd297, %rd295; - selp.u64 %rd298, 1, 0, %p55; - add.s64 %rd23, %rd293, %rd297; - setp.lt.u64 %p56, %rd23, %rd293; - selp.u64 %rd299, 1, 0, %p56; - add.s64 %rd300, %rd276, %rd275; - add.s64 %rd301, %rd300, %rd278; - add.s64 %rd302, %rd301, %rd20; - sub.s64 %rd303, %rd302, %rd234; - add.s64 %rd304, %rd303, %rd283; - sub.s64 %rd305, %rd304, %rd246; - add.s64 %rd306, %rd305, %rd290; - add.s64 %rd307, %rd306, %rd287; - add.s64 %rd308, %rd307, %rd294; - add.s64 %rd309, %rd308, %rd296; - add.s64 %rd310, %rd309, %rd298; - add.s64 %rd21, %rd310, %rd299; - mul.lo.s64 %rd22, %rd10, 3; - mul.hi.u64 %rd24, %rd23, %rd108; - add.s64 %rd311, %rd24, %rd10; - add.s64 %rd312, %rd311, %rd24; - mul.hi.u64 %rd313, %rd21, %rd108; - add.s64 %rd314, %rd313, %rd10; - add.s64 %rd25, %rd314, %rd313; 
- mul.lo.s64 %rd315, %rd23, 576460752303423505; - mov.u64 %rd316, 576460752303423505; - add.s64 %rd26, %rd315, %rd312; - setp.lt.u64 %p2, %rd26, %rd315; - mul.hi.u64 %rd27, %rd108, %rd764; - add.s64 %rd317, %rd27, %rd10; - add.s64 %rd28, %rd317, %rd27; - mul.hi.u64 %rd318, %rd23, %rd764; - add.s64 %rd319, %rd27, %rd24; - add.s64 %rd320, %rd319, %rd318; - mul.hi.u64 %rd321, %rd21, %rd764; - add.s64 %rd322, %rd320, %rd21; - setp.lt.u64 %p57, %rd322, %rd320; - selp.u64 %rd323, 1, 0, %p57; - add.s64 %rd29, %rd312, %rd313; - add.s64 %rd324, %rd29, %rd27; - add.s64 %rd325, %rd324, %rd321; - setp.lt.u64 %p58, %rd325, %rd312; - selp.u64 %rd326, 1, 0, %p58; - add.s64 %rd35, %rd325, %rd323; - setp.lt.u64 %p59, %rd35, %rd325; - selp.u64 %rd327, 1, 0, %p59; - mul.hi.u64 %rd328, %rd108, %rd281; - add.s64 %rd329, %rd328, %rd10; - add.s64 %rd330, %rd329, %rd328; - mul.hi.u64 %rd331, %rd316, %rd281; - mul.lo.s64 %rd332, %rd203, -576460752303423505; - mul.hi.u64 %rd333, %rd108, %rd291; - add.s64 %rd334, %rd333, %rd10; - add.s64 %rd335, %rd334, %rd333; - mul.hi.u64 %rd336, %rd316, %rd291; - mul.lo.s64 %rd337, %rd291, 576460752303423505; - add.s64 %rd338, %rd330, %rd332; - setp.lt.u64 %p60, %rd338, %rd330; - selp.u64 %rd339, 1, 0, %p60; - add.s64 %rd340, %rd328, %rd337; - mul.hi.u64 %rd341, %rd316, %rd108; + add.s32 %r13, %r12, %r7; + mul.wide.s32 %rd118, %r13, 32; + add.s64 %rd6, %rd112, %rd118; + ld.global.u64 %rd119, [%rd6+8]; + mul.hi.u64 %rd7, %rd117, %rd119; + ld.global.u64 %rd120, [%rd115+8]; + mul.hi.u64 %rd121, %rd120, %rd119; + mul.lo.s64 %rd122, %rd119, %rd120; + ld.global.u64 %rd123, [%rd115]; + mul.hi.u64 %rd124, %rd123, %rd119; + mul.lo.s64 %rd125, %rd119, %rd123; + ld.global.u64 %rd126, [%rd6]; + mul.hi.u64 %rd127, %rd120, %rd126; + mul.lo.s64 %rd128, %rd126, %rd120; + add.s64 %rd129, %rd121, %rd128; + setp.lt.u64 %p14, %rd129, %rd121; + selp.u64 %rd130, 1, 0, %p14; + mul.hi.u64 %rd8, %rd123, %rd126; + mul.lo.s64 %rd131, %rd126, %rd123; + mul.hi.u64 %rd9, %rd120, %rd117; + add.s64 %rd132, %rd129, %rd125; + setp.lt.u64 %p15, %rd132, %rd129; + selp.u64 %rd133, 1, 0, %p15; + add.s64 %rd134, %rd124, %rd131; + setp.lt.u64 %p16, %rd134, %rd124; + selp.u64 %rd10, 1, 0, %p16; + add.s64 %rd135, %rd127, %rd134; + setp.lt.u64 %p17, %rd135, %rd127; + selp.u64 %rd11, 1, 0, %p17; + add.s64 %rd136, %rd135, %rd130; + add.s64 %rd137, %rd136, %rd133; + setp.lt.u64 %p18, %rd137, %rd135; + selp.u64 %rd12, 1, 0, %p18; + ld.global.u64 %rd138, [%rd6+24]; + mul.hi.u64 %rd139, %rd117, %rd138; + mul.hi.u64 %rd140, %rd120, %rd138; + mul.lo.s64 %rd141, %rd138, %rd120; + mul.hi.u64 %rd142, %rd123, %rd138; + mul.lo.s64 %rd143, %rd138, %rd123; + ld.global.u64 %rd144, [%rd6+16]; + mul.hi.u64 %rd145, %rd120, %rd144; + mul.lo.s64 %rd146, %rd144, %rd120; + add.s64 %rd147, %rd140, %rd146; + setp.lt.u64 %p19, %rd147, %rd140; + selp.u64 %rd148, 1, 0, %p19; + mul.hi.u64 %rd149, %rd123, %rd144; + mul.lo.s64 %rd150, %rd144, %rd123; + add.s64 %rd151, %rd147, %rd143; + setp.lt.u64 %p20, %rd151, %rd147; + selp.u64 %rd152, 1, 0, %p20; + add.s64 %rd153, %rd142, %rd150; + setp.lt.u64 %p21, %rd153, %rd142; + selp.u64 %rd154, 1, 0, %p21; + add.s64 %rd155, %rd145, %rd153; + setp.lt.u64 %p22, %rd155, %rd145; + selp.u64 %rd156, 1, 0, %p22; + add.s64 %rd157, %rd155, %rd148; + add.s64 %rd158, %rd157, %rd152; + setp.lt.u64 %p23, %rd158, %rd155; + selp.u64 %rd159, 1, 0, %p23; + ld.global.u64 %rd160, [%rd115+24]; + mul.hi.u64 %rd161, %rd117, %rd160; + mul.hi.u64 %rd162, %rd119, %rd160; + mul.lo.s64 %rd163, %rd119, %rd160; + mul.hi.u64 %rd164, 
%rd126, %rd160; + mul.lo.s64 %rd165, %rd126, %rd160; + ld.global.u64 %rd166, [%rd115+16]; + mul.hi.u64 %rd167, %rd119, %rd166; + mul.lo.s64 %rd168, %rd119, %rd166; + add.s64 %rd169, %rd162, %rd168; + setp.lt.u64 %p24, %rd169, %rd162; + selp.u64 %rd170, 1, 0, %p24; + mul.hi.u64 %rd171, %rd126, %rd166; + mul.lo.s64 %rd172, %rd126, %rd166; + mul.hi.u64 %rd173, %rd119, %rd117; + add.s64 %rd174, %rd169, %rd165; + setp.lt.u64 %p25, %rd174, %rd169; + selp.u64 %rd175, 1, 0, %p25; + add.s64 %rd176, %rd164, %rd172; + setp.lt.u64 %p26, %rd176, %rd164; + selp.u64 %rd177, 1, 0, %p26; + add.s64 %rd178, %rd167, %rd176; + setp.lt.u64 %p27, %rd178, %rd167; + selp.u64 %rd179, 1, 0, %p27; + add.s64 %rd180, %rd178, %rd170; + add.s64 %rd181, %rd180, %rd175; + setp.lt.u64 %p28, %rd181, %rd178; + selp.u64 %rd182, 1, 0, %p28; + mul.hi.u64 %rd183, %rd160, %rd138; + mul.lo.s64 %rd184, %rd138, %rd160; + mul.hi.u64 %rd185, %rd166, %rd138; + mul.lo.s64 %rd186, %rd138, %rd166; + mul.hi.u64 %rd187, %rd160, %rd144; + mul.lo.s64 %rd188, %rd144, %rd160; + add.s64 %rd189, %rd183, %rd188; + setp.lt.u64 %p29, %rd189, %rd183; + selp.u64 %rd190, 1, 0, %p29; + mul.hi.u64 %rd191, %rd166, %rd144; + mul.lo.s64 %rd192, %rd144, %rd166; + mul.hi.u64 %rd193, %rd160, %rd117; + add.s64 %rd194, %rd189, %rd186; + setp.lt.u64 %p30, %rd194, %rd189; + selp.u64 %rd195, 1, 0, %p30; + add.s64 %rd196, %rd185, %rd192; + setp.lt.u64 %p31, %rd196, %rd185; + selp.u64 %rd197, 1, 0, %p31; + add.s64 %rd198, %rd187, %rd196; + setp.lt.u64 %p32, %rd198, %rd187; + selp.u64 %rd199, 1, 0, %p32; + add.s64 %rd200, %rd198, %rd190; + add.s64 %rd201, %rd200, %rd195; + setp.lt.u64 %p33, %rd201, %rd198; + selp.u64 %rd202, 1, 0, %p33; + add.s64 %rd203, %rd141, %rd163; + setp.lt.u64 %p34, %rd203, %rd141; + selp.u64 %rd204, 1, 0, %p34; + add.s64 %rd205, %rd151, %rd204; + add.s64 %rd206, %rd205, %rd174; + setp.eq.s64 %p35, %rd206, %rd151; + and.pred %p36, %p34, %p35; + setp.lt.u64 %p37, %rd206, %rd151; + or.pred %p38, %p37, %p36; + selp.u64 %rd207, 1, 0, %p38; + add.s64 %rd208, %rd201, %rd203; + setp.lt.u64 %p39, %rd208, %rd201; + selp.u64 %rd209, 1, 0, %p39; + add.s64 %rd210, %rd206, %rd139; + add.s64 %rd211, %rd210, %rd191; + add.s64 %rd212, %rd211, %rd193; + add.s64 %rd213, %rd212, %rd197; + add.s64 %rd214, %rd213, %rd199; + add.s64 %rd215, %rd214, %rd202; + add.s64 %rd13, %rd215, %rd209; + setp.eq.s64 %p40, %rd13, %rd206; + and.pred %p41, %p39, %p40; + setp.lt.u64 %p42, %rd13, %rd206; + or.pred %p43, %p42, %p41; + selp.u64 %rd216, 1, 0, %p43; + add.s64 %rd217, %rd158, %rd122; + setp.lt.u64 %p44, %rd217, %rd158; + selp.u64 %rd218, 1, 0, %p44; + add.s64 %rd219, %rd9, %rd132; + add.s64 %rd220, %rd219, %rd139; + add.s64 %rd221, %rd220, %rd149; + add.s64 %rd222, %rd221, %rd154; + add.s64 %rd223, %rd222, %rd156; + add.s64 %rd224, %rd223, %rd159; + add.s64 %rd225, %rd224, %rd218; + setp.eq.s64 %p45, %rd225, %rd132; + and.pred %p46, %p44, %p45; + setp.lt.u64 %p47, %rd225, %rd132; + or.pred %p48, %p47, %p46; + selp.u64 %rd226, 1, 0, %p48; + add.s64 %rd227, %rd137, %rd226; + setp.lt.u64 %p49, %rd227, %rd137; + selp.u64 %rd14, 1, 0, %p49; + add.s64 %rd17, %rd181, %rd217; + setp.lt.u64 %p50, %rd17, %rd181; + selp.u64 %rd228, 1, 0, %p50; + add.s64 %rd229, %rd171, %rd161; + add.s64 %rd230, %rd229, %rd173; + add.s64 %rd231, %rd230, %rd177; + add.s64 %rd232, %rd225, %rd179; + add.s64 %rd233, %rd232, %rd231; + add.s64 %rd234, %rd233, %rd182; + add.s64 %rd15, %rd234, %rd228; + setp.eq.s64 %p51, %rd15, %rd225; + and.pred %p52, %p50, %p51; + setp.lt.u64 %p53, %rd15, %rd225; + or.pred 
%p54, %p53, %p52; + selp.u64 %rd235, 1, 0, %p54; + add.s64 %rd16, %rd227, %rd235; + setp.lt.u64 %p1, %rd16, %rd227; + add.s64 %rd236, %rd17, %rd207; + add.s64 %rd18, %rd236, %rd216; + mul.lo.s64 %rd237, %rd184, 576460752303423504; + mov.u64 %rd238, -1; + mul.hi.u64 %rd239, %rd208, %rd238; + mul.hi.u64 %rd240, %rd184, %rd238; + neg.s64 %rd241, %rd184; + mul.hi.u64 %rd242, %rd194, %rd238; + sub.s64 %rd243, %rd240, %rd184; + setp.lt.u64 %p55, %rd243, %rd241; + selp.u64 %rd244, 1, 0, %p55; + neg.s64 %rd245, %rd194; + sub.s64 %rd246, %rd242, %rd194; + sub.s64 %rd247, %rd243, %rd194; + setp.lt.u64 %p56, %rd247, %rd243; + selp.u64 %rd248, 1, 0, %p56; + add.s64 %rd249, %rd248, %rd244; + setp.lt.u64 %p57, %rd246, %rd245; + selp.u64 %rd19, 1, 0, %p57; + add.s64 %rd250, %rd246, %rd243; + setp.lt.u64 %p58, %rd250, %rd246; + selp.u64 %rd20, 1, 0, %p58; + sub.s64 %rd251, %rd250, %rd208; + setp.lt.u64 %p59, %rd251, %rd250; + selp.u64 %rd21, 1, 0, %p59; + add.s64 %rd25, %rd249, %rd251; + setp.lt.u64 %p60, %rd25, %rd249; + selp.u64 %rd22, 1, 0, %p60; + add.s64 %rd252, %rd239, %rd237; + add.s64 %rd253, %rd252, %rd240; + sub.s64 %rd254, %rd253, %rd208; + add.s64 %rd255, %rd254, %rd246; + add.s64 %rd23, %rd255, %rd244; + sub.s64 %rd256, %rd19, %rd13; + add.s64 %rd257, %rd256, %rd23; + add.s64 %rd258, %rd257, %rd20; + add.s64 %rd259, %rd258, %rd21; + add.s64 %rd24, %rd259, %rd22; + mul.hi.u64 %rd26, %rd25, %rd117; + mov.u64 %rd260, 576460752303423505; + mul.hi.u64 %rd27, %rd25, %rd260; + mul.lo.s64 %rd261, %rd25, 576460752303423505; + add.s64 %rd41, %rd261, %rd26; + setp.lt.u64 %p61, %rd41, %rd261; + selp.u64 %rd28, 1, 0, %p61; + mov.u64 %rd111, 1; + mul.hi.u64 %rd29, %rd25, %rd111; + add.s64 %rd30, %rd24, %rd29; + mul.hi.u64 %rd262, %rd117, %rd241; + mul.hi.u64 %rd263, %rd260, %rd241; + mul.lo.s64 %rd264, %rd184, -576460752303423505; + mul.hi.u64 %rd265, %rd117, %rd247; + mul.hi.u64 %rd266, %rd260, %rd247; + mul.lo.s64 %rd267, %rd247, 576460752303423505; + add.s64 %rd35, %rd262, %rd264; + setp.lt.u64 %p62, %rd35, %rd262; + selp.u64 %rd268, 1, 0, %p62; + add.s64 %rd269, %rd263, %rd267; + setp.lt.u64 %p63, %rd269, %rd263; + selp.u64 %rd270, 1, 0, %p63; + add.s64 %rd271, %rd265, %rd269; + setp.lt.u64 %p64, %rd271, %rd265; + selp.u64 %rd272, 1, 0, %p64; + add.s64 %rd42, %rd271, %rd268; + setp.lt.u64 %p65, %rd42, %rd271; + selp.u64 %rd273, 1, 0, %p65; + mul.hi.u64 %rd274, %rd241, %rd111; + mul.hi.u64 %rd275, %rd247, %rd111; + mul.hi.u64 %rd31, %rd241, %rd117; + mul.hi.u64 %rd32, %rd247, %rd117; + add.s64 %rd276, %rd274, %rd247; + setp.lt.u64 %p66, %rd276, %rd274; + selp.u64 %rd277, 1, 0, %p66; + add.s64 %rd278, %rd31, %rd275; + setp.lt.u64 %p67, %rd278, %rd31; + selp.u64 %rd33, 1, 0, %p67; + add.s64 %rd279, %rd278, %rd277; + setp.lt.u64 %p68, %rd279, %rd278; + selp.u64 %rd34, 1, 0, %p68; + add.s64 %rd36, %rd35, %rd30; + setp.lt.u64 %p3, %rd36, %rd35; + add.s64 %rd45, %rd279, %rd25; + setp.lt.u64 %p4, %rd45, %rd279; + selp.u64 %rd37, 1, 0, %p4; + mul.hi.u64 %rd39, %rd117, %rd111; + add.s64 %rd280, %rd36, %rd39; + add.s64 %rd281, %rd280, %rd31; + add.s64 %rd282, %rd281, %rd32; + add.s64 %rd283, %rd282, %rd33; + add.s64 %rd284, %rd283, %rd34; + add.s64 %rd40, %rd284, %rd37; + mul.hi.u64 %rd43, %rd117, %rd117; + add.s64 %rd285, %rd262, %rd43; + add.s64 %rd286, %rd285, %rd266; + add.s64 %rd287, %rd286, %rd270; + add.s64 %rd288, %rd287, %rd272; + add.s64 %rd44, %rd288, %rd273; + add.s64 %rd289, %rd45, %rd208; + setp.lt.u64 %p69, %rd289, %rd45; + selp.u64 %rd290, 1, 0, %p69; + setp.ne.s64 %p70, %rd184, 0; + selp.u64 
%rd291, 1, 0, %p70; + add.s64 %rd292, %rd243, %rd291; + add.s64 %rd293, %rd292, %rd274; + setp.eq.s64 %p71, %rd292, %rd247; + and.pred %p72, %p70, %p71; + setp.lt.u64 %p73, %rd293, %rd276; + or.pred %p74, %p73, %p72; + selp.u64 %rd294, 1, 0, %p74; + add.s64 %rd47, %rd289, %rd294; + setp.lt.u64 %p75, %rd47, %rd289; + selp.u64 %rd295, 1, 0, %p75; + add.s64 %rd296, %rd13, %rd290; + add.s64 %rd46, %rd296, %rd295; + setp.ne.s64 %p76, %rd46, 0; + setp.ne.s64 %p77, %rd47, %rd45; + or.pred %p78, %p77, %p76; + not.pred %p79, %p74; + or.pred %p5, %p78, %p79; + not.pred %p80, %p5; + mov.u64 %rd699, %rd111; + @%p80 bra $L__BB0_3; + + setp.eq.s64 %p81, %rd46, 0; + setp.lt.u64 %p82, %rd47, %rd45; + and.pred %p83, %p82, %p81; + add.s64 %rd297, %rd46, %rd40; + setp.lt.u64 %p84, %rd297, %rd46; + or.pred %p85, %p84, %p83; + selp.u64 %rd699, 1, 0, %p85; + +$L__BB0_3: + setp.lt.u64 %p311, %rd45, %rd279; + mov.u64 %rd700, 1; + mov.u64 %rd682, 0; + mul.hi.u64 %rd681, %rd682, %rd700; + mov.u64 %rd676, 0; + add.s64 %rd675, %rd24, %rd29; + setp.lt.u64 %p310, %rd675, %rd24; + mul.hi.u64 %rd674, %rd676, %rd676; + add.s64 %rd664, %rd261, %rd26; + setp.lt.u64 %p86, %rd18, %rd17; + selp.u64 %rd299, 1, 0, %p86; + add.s64 %rd50, %rd15, %rd299; + setp.lt.u64 %p87, %rd50, %rd15; + selp.u64 %rd300, 1, 0, %p87; + add.s64 %rd51, %rd16, %rd300; + setp.lt.u64 %p88, %rd51, %rd16; + mul.hi.u64 %rd302, %rd24, %rd676; + mul.lo.s64 %rd303, %rd24, 576460752303423505; + add.s64 %rd305, %rd303, %rd302; + setp.lt.u64 %p89, %rd305, %rd303; + add.s64 %rd306, %rd305, %rd27; + setp.lt.u64 %p90, %rd306, %rd305; + add.s64 %rd307, %rd306, %rd28; + setp.lt.u64 %p91, %rd307, %rd306; + mul.hi.u64 %rd308, %rd24, %rd700; + add.s64 %rd309, %rd308, %rd26; + setp.lt.u64 %p92, %rd309, %rd308; + selp.u64 %rd310, 1, 0, %p310; + add.s64 %rd311, %rd309, %rd310; + setp.lt.u64 %p93, %rd311, %rd309; + add.s64 %rd312, %rd302, %rd26; + add.s64 %rd313, %rd312, %rd681; + add.s64 %rd314, %rd313, %rd664; + selp.u64 %rd315, 1, 0, %p92; + add.s64 %rd316, %rd314, %rd315; + selp.u64 %rd317, 1, 0, %p93; + add.s64 %rd318, %rd316, %rd317; + setp.lt.u64 %p94, %rd318, %rd664; + selp.u64 %rd319, 1, 0, %p94; + add.s64 %rd320, %rd307, %rd319; + setp.lt.u64 %p95, %rd320, %rd307; + add.s64 %rd321, %rd42, %rd311; + setp.lt.u64 %p96, %rd321, %rd42; + selp.u64 %rd322, 1, 0, %p96; + add.s64 %rd323, %rd44, %rd322; + add.s64 %rd324, %rd323, %rd318; + setp.lt.u64 %p97, %rd324, %rd323; + setp.eq.s64 %p98, %rd323, 0; + and.pred %p99, %p96, %p98; + or.pred %p100, %p97, %p99; + selp.u64 %rd325, 1, 0, %p100; + add.s64 %rd326, %rd320, %rd325; + setp.lt.u64 %p101, %rd326, %rd320; + selp.u64 %rd327, 1, 0, %p3; + add.s64 %rd328, %rd321, %rd327; + setp.lt.u64 %p102, %rd40, %rd36; + setp.eq.s64 %p103, %rd40, %rd36; + and.pred %p104, %p311, %p103; + or.pred %p105, %p102, %p104; + selp.u64 %rd329, 1, 0, %p105; + add.s64 %rd330, %rd328, %rd329; + setp.lt.u64 %p106, %rd330, %rd321; + selp.u64 %rd331, 1, 0, %p106; + add.s64 %rd332, %rd324, %rd331; + setp.lt.u64 %p107, %rd332, %rd324; + selp.u64 %rd333, 1, 0, %p107; + add.s64 %rd334, %rd326, %rd333; + setp.lt.u64 %p108, %rd334, %rd326; + add.s64 %rd335, %rd699, %rd18; + setp.lt.u64 %p109, %rd335, %rd699; + selp.u64 %rd336, 1, 0, %p109; + add.s64 %rd337, %rd50, %rd336; + setp.lt.u64 %p110, %rd337, %rd50; + selp.u64 %rd338, 1, 0, %p110; + add.s64 %rd54, %rd51, %rd338; + setp.lt.u64 %p111, %rd54, %rd51; + selp.u64 %rd339, 1, 0, %p111; + add.s64 %rd340, %rd8, %rd7; + add.s64 %rd341, %rd340, %rd9; + add.s64 %rd342, %rd341, %rd10; + add.s64 %rd343, %rd342, 
%rd11; + add.s64 %rd344, %rd343, %rd12; + add.s64 %rd345, %rd344, %rd14; + selp.u64 %rd346, 1, 0, %p1; + add.s64 %rd347, %rd345, %rd346; + selp.u64 %rd348, 1, 0, %p88; + add.s64 %rd52, %rd347, %rd348; + add.s64 %rd53, %rd52, %rd339; + add.s64 %rd349, %rd54, %rd334; + setp.lt.u64 %p112, %rd349, %rd54; + selp.u64 %rd350, 1, 0, %p112; + add.s64 %rd351, %rd337, %rd332; + add.s64 %rd55, %rd335, %rd330; + setp.lt.u64 %p113, %rd55, %rd335; + selp.u64 %rd352, 1, 0, %p113; + add.s64 %rd56, %rd351, %rd352; + setp.eq.s64 %p114, %rd56, %rd337; + and.pred %p115, %p113, %p114; + setp.lt.u64 %p116, %rd56, %rd337; + or.pred %p117, %p116, %p115; + selp.u64 %rd353, 1, 0, %p117; + add.s64 %rd59, %rd349, %rd353; + setp.lt.u64 %p118, %rd59, %rd349; + selp.u64 %rd354, 1, 0, %p118; + add.s64 %rd57, %rd26, %rd674; + mul.hi.u64 %rd355, %rd24, %rd260; + add.s64 %rd356, %rd355, %rd57; + selp.u64 %rd357, 1, 0, %p89; + add.s64 %rd358, %rd356, %rd357; + selp.u64 %rd359, 1, 0, %p90; + add.s64 %rd360, %rd358, %rd359; + selp.u64 %rd361, 1, 0, %p91; + add.s64 %rd362, %rd360, %rd361; + selp.u64 %rd363, 1, 0, %p95; + add.s64 %rd364, %rd362, %rd363; + selp.u64 %rd365, 1, 0, %p101; + add.s64 %rd366, %rd364, %rd365; + selp.u64 %rd367, 1, 0, %p108; + add.s64 %rd368, %rd366, %rd367; + add.s64 %rd369, %rd368, %rd53; + add.s64 %rd370, %rd369, %rd350; + add.s64 %rd58, %rd370, %rd354; + setp.ne.s64 %p119, %rd58, %rd53; + setp.ne.s64 %p120, %rd59, %rd54; + or.pred %p121, %p120, %p119; + not.pred %p122, %p117; + or.pred %p123, %p121, %p122; + not.pred %p124, %p123; + @%p124 bra $L__BB0_5; + + setp.eq.s64 %p125, %rd58, %rd53; + setp.lt.u64 %p126, %rd59, %rd54; + and.pred %p127, %p126, %p125; + setp.lt.u64 %p128, %rd58, %rd53; + or.pred %p129, %p128, %p127; + selp.u64 %rd700, 1, 0, %p129; + +$L__BB0_5: + mov.u64 %rd684, 0; + mul.hi.u64 %rd683, %rd684, %rd111; + mov.u64 %rd678, 0; + mul.hi.u64 %rd677, %rd678, %rd678; + or.b64 %rd372, %rd56, %rd55; + or.b64 %rd373, %rd372, %rd59; + setp.ne.s64 %p130, %rd373, 0; + mov.u64 %rd374, 0; + setp.ne.s64 %p131, %rd58, 576460752303423505; + or.pred %p132, %p130, %p131; + setp.gt.u64 %p133, %rd58, 576460752303423504; + and.pred %p134, %p133, %p132; + selp.u64 %rd375, 1, 0, %p134; + mov.u64 %rd376, -1; + mul.hi.u64 %rd377, %rd700, %rd376; + sub.s64 %rd378, %rd377, %rd700; + neg.s64 %rd379, %rd700; + setp.lt.u64 %p135, %rd378, %rd379; + selp.u64 %rd380, 1, 0, %p135; + mul.hi.u64 %rd62, %rd374, %rd376; + add.s64 %rd381, %rd378, %rd62; + setp.lt.u64 %p136, %rd381, %rd378; + selp.u64 %rd382, 1, 0, %p136; + add.s64 %rd383, %rd381, %rd380; + setp.lt.u64 %p137, %rd383, %rd381; + selp.u64 %rd384, 1, 0, %p137; + add.s64 %rd385, %rd383, %rd59; + setp.lt.u64 %p138, %rd385, %rd383; + selp.u64 %rd386, 1, 0, %p138; + add.s64 %rd387, %rd378, %rd56; + sub.s64 %rd388, %rd55, %rd700; + setp.lt.u64 %p139, %rd388, %rd55; + selp.u64 %rd389, 1, 0, %p139; + add.s64 %rd390, %rd387, %rd389; + setp.eq.s64 %p140, %rd390, %rd56; + and.pred %p141, %p139, %p140; + setp.lt.u64 %p142, %rd390, %rd56; + or.pred %p143, %p142, %p141; + selp.u64 %rd391, 1, 0, %p143; + add.s64 %rd392, %rd385, %rd391; + setp.lt.u64 %p144, %rd392, %rd385; + selp.u64 %rd393, 1, 0, %p144; + mov.u64 %rd702, 1; + mul.hi.u64 %rd394, %rd375, %rd702; + mul.hi.u64 %rd395, %rd375, %rd374; + add.s64 %rd396, %rd395, %rd683; + setp.lt.u64 %p145, %rd396, %rd395; + selp.b64 %rd397, -576460752303423505, 0, %p134; + selp.b64 %rd398, -1, 0, %p145; + setp.lt.u64 %p146, %rd392, %rd396; + selp.b64 %rd399, -1, 0, %p146; + sub.s64 %rd400, %rd392, %rd396; + sub.s64 %rd401, 
%rd390, %rd394; + setp.lt.u64 %p147, %rd388, %rd375; + selp.b64 %rd402, -1, 0, %p147; + add.s64 %rd403, %rd401, %rd402; + setp.eq.s64 %p148, %rd403, %rd390; + and.pred %p149, %p147, %p148; + setp.gt.u64 %p150, %rd403, %rd390; + or.pred %p151, %p150, %p149; + selp.u64 %rd404, 1, 0, %p151; + setp.lt.u64 %p152, %rd400, %rd404; + selp.b64 %rd405, -1, 0, %p152; + sub.s64 %rd406, %rd400, %rd404; + sub.s64 %rd407, %rd388, %rd375; + add.s64 %rd408, %rd406, %rd3; + setp.lt.u64 %p153, %rd408, %rd406; + selp.u64 %rd409, 1, 0, %p153; + add.s64 %rd65, %rd407, %rd5; + setp.lt.u64 %p154, %rd65, %rd407; + selp.u64 %rd410, 1, 0, %p154; + add.s64 %rd411, %rd4, %rd410; + add.s64 %rd63, %rd411, %rd403; + setp.eq.s64 %p155, %rd63, %rd4; + and.pred %p156, %p154, %p155; + setp.lt.u64 %p157, %rd63, %rd4; + or.pred %p158, %p157, %p156; + selp.u64 %rd412, 1, 0, %p158; + add.s64 %rd66, %rd408, %rd412; + setp.lt.u64 %p159, %rd66, %rd408; + selp.u64 %rd413, 1, 0, %p159; + sub.s64 %rd414, %rd2, %rd677; + sub.s64 %rd415, %rd414, %rd683; + add.s64 %rd416, %rd415, %rd58; + add.s64 %rd417, %rd416, %rd397; + mul.lo.s64 %rd418, %rd700, -576460752303423506; + add.s64 %rd419, %rd417, %rd418; + add.s64 %rd420, %rd419, %rd62; + add.s64 %rd421, %rd420, %rd62; + add.s64 %rd422, %rd421, %rd377; + sub.s64 %rd423, %rd422, %rd395; + add.s64 %rd424, %rd423, %rd380; + add.s64 %rd425, %rd424, %rd382; + add.s64 %rd426, %rd425, %rd398; + add.s64 %rd427, %rd426, %rd384; + add.s64 %rd428, %rd427, %rd386; + add.s64 %rd429, %rd428, %rd393; + add.s64 %rd430, %rd429, %rd399; + add.s64 %rd431, %rd430, %rd405; + add.s64 %rd432, %rd431, %rd409; + add.s64 %rd64, %rd432, %rd413; + setp.ne.s64 %p160, %rd64, 576460752303423505; + or.b64 %rd433, %rd63, %rd65; + or.b64 %rd434, %rd433, %rd66; + setp.ne.s64 %p161, %rd434, 0; + setp.gt.u64 %p162, %rd64, 576460752303423504; + or.pred %p163, %p161, %p160; + and.pred %p6, %p162, %p163; + selp.u64 %rd67, 1, 0, %p6; + mul.hi.u64 %rd68, %rd67, %rd374; + add.s64 %rd69, %rd68, %rd683; + setp.lt.u64 %p7, %rd69, %rd68; + setp.ne.s64 %p164, %rd64, %rd2; + setp.ne.s64 %p165, %rd66, %rd3; + or.pred %p166, %p165, %p164; + not.pred %p167, %p158; + or.pred %p168, %p166, %p167; + not.pred %p169, %p168; + mov.u64 %rd701, %rd702; + @%p169 bra $L__BB0_7; + + setp.eq.s64 %p170, %rd64, %rd2; + setp.lt.u64 %p171, %rd66, %rd3; + and.pred %p172, %p171, %p170; + setp.lt.u64 %p173, %rd64, %rd2; + or.pred %p174, %p173, %p172; + selp.u64 %rd701, 1, 0, %p174; + +$L__BB0_7: + mov.u64 %rd694, 0; + mul.hi.u64 %rd693, %rd694, %rd376; + mov.u64 %rd686, 0; + mul.hi.u64 %rd685, %rd686, %rd111; + mov.u64 %rd680, 0; + mul.hi.u64 %rd679, %rd680, %rd680; + selp.u64 %rd667, 1, 0, %p4; + mul.hi.u64 %rd666, %rd241, %rd680; + add.s64 %rd665, %rd261, %rd26; + sub.s64 %rd436, %rd66, %rd69; + setp.lt.u64 %p175, %rd65, %rd67; + selp.b64 %rd437, -1, 0, %p175; + mul.hi.u64 %rd438, %rd67, %rd702; + sub.s64 %rd439, %rd63, %rd438; + add.s64 %rd440, %rd439, %rd437; + setp.gt.u64 %p176, %rd440, %rd63; + setp.eq.s64 %p177, %rd440, %rd63; + and.pred %p178, %p175, %p177; + or.pred %p179, %p176, %p178; + selp.u64 %rd441, 1, 0, %p179; + sub.s64 %rd442, %rd65, %rd67; + mul.hi.u64 %rd444, %rd701, %rd376; + sub.s64 %rd445, %rd444, %rd701; + neg.s64 %rd446, %rd701; + setp.lt.u64 %p180, %rd445, %rd446; + selp.u64 %rd448, 1, 0, %p180; + add.s64 %rd449, %rd445, %rd693; + setp.lt.u64 %p181, %rd449, %rd445; + selp.u64 %rd450, 1, 0, %p181; + add.s64 %rd451, %rd449, %rd448; + setp.lt.u64 %p182, %rd451, %rd449; + selp.u64 %rd452, 1, 0, %p182; + sub.s64 %rd453, %rd436, %rd441; + 
add.s64 %rd454, %rd451, %rd453; + setp.lt.u64 %p183, %rd454, %rd451; + selp.u64 %rd455, 1, 0, %p183; + add.s64 %rd456, %rd445, %rd440; + sub.s64 %rd72, %rd442, %rd701; + setp.lt.u64 %p184, %rd72, %rd442; + selp.u64 %rd457, 1, 0, %p184; + add.s64 %rd73, %rd456, %rd457; + setp.eq.s64 %p185, %rd73, %rd440; + and.pred %p186, %p184, %p185; + setp.lt.u64 %p187, %rd73, %rd440; + or.pred %p188, %p187, %p186; + selp.u64 %rd458, 1, 0, %p188; + add.s64 %rd74, %rd454, %rd458; + setp.lt.u64 %p189, %rd74, %rd454; + selp.u64 %rd459, 1, 0, %p189; + add.s64 %rd75, %rd685, %rd679; + sub.s64 %rd460, %rd693, %rd75; + add.s64 %rd76, %rd460, %rd693; + sub.s64 %rd461, %rd76, %rd68; + selp.s64 %rd462, -1, 0, %p7; + add.s64 %rd463, %rd461, %rd462; + setp.lt.u64 %p190, %rd66, %rd69; + selp.b64 %rd464, -1, 0, %p190; + add.s64 %rd465, %rd463, %rd464; + add.s64 %rd466, %rd465, %rd64; + selp.b64 %rd467, -576460752303423505, 0, %p6; + add.s64 %rd468, %rd466, %rd467; + setp.lt.u64 %p191, %rd436, %rd441; + selp.b64 %rd469, -1, 0, %p191; + add.s64 %rd470, %rd468, %rd469; + add.s64 %rd471, %rd470, %rd444; + mul.lo.s64 %rd472, %rd701, -576460752303423506; + add.s64 %rd473, %rd471, %rd472; + add.s64 %rd474, %rd473, %rd448; + add.s64 %rd475, %rd474, %rd450; + add.s64 %rd476, %rd475, %rd452; + add.s64 %rd477, %rd476, %rd455; + add.s64 %rd77, %rd477, %rd459; + sub.s64 %rd478, %rd23, %rd13; + add.s64 %rd479, %rd478, %rd19; + add.s64 %rd480, %rd479, %rd20; + add.s64 %rd481, %rd480, %rd21; + add.s64 %rd482, %rd481, %rd22; + mul.hi.u64 %rd483, %rd482, %rd694; + mov.u64 %rd484, 576460752303423505; + mul.hi.u64 %rd78, %rd482, %rd484; + mul.lo.s64 %rd485, %rd482, 576460752303423505; + add.s64 %rd486, %rd483, %rd485; + setp.lt.u64 %p192, %rd486, %rd483; + selp.u64 %rd79, 1, 0, %p192; + add.s64 %rd487, %rd486, %rd27; + setp.lt.u64 %p193, %rd487, %rd486; + selp.u64 %rd80, 1, 0, %p193; + add.s64 %rd488, %rd487, %rd28; + setp.lt.u64 %p194, %rd488, %rd487; + selp.u64 %rd81, 1, 0, %p194; + mul.hi.u64 %rd489, %rd482, %rd702; + add.s64 %rd490, %rd482, %rd29; + setp.lt.u64 %p195, %rd490, %rd482; + selp.u64 %rd491, 1, 0, %p195; + add.s64 %rd492, %rd489, %rd26; + setp.lt.u64 %p196, %rd492, %rd489; + selp.u64 %rd493, 1, 0, %p196; + add.s64 %rd494, %rd492, %rd491; + setp.lt.u64 %p197, %rd494, %rd492; + selp.u64 %rd495, 1, 0, %p197; + add.s64 %rd496, %rd35, %rd490; + setp.lt.u64 %p198, %rd496, %rd35; + selp.u64 %rd497, 1, 0, %p198; + add.s64 %rd498, %rd496, %rd685; + add.s64 %rd499, %rd498, %rd666; + add.s64 %rd500, %rd499, %rd32; + add.s64 %rd501, %rd500, %rd33; + add.s64 %rd502, %rd501, %rd34; + add.s64 %rd82, %rd502, %rd667; + setp.eq.s64 %p199, %rd82, %rd496; + setp.lt.u64 %p200, %rd45, %rd25; + and.pred %p201, %p200, %p199; + setp.lt.u64 %p202, %rd82, %rd496; + or.pred %p203, %p202, %p201; + selp.u64 %rd503, 1, 0, %p203; + add.s64 %rd504, %rd685, %rd26; + add.s64 %rd505, %rd504, %rd665; + add.s64 %rd506, %rd505, %rd483; + add.s64 %rd507, %rd506, %rd493; + add.s64 %rd508, %rd507, %rd495; + setp.lt.u64 %p204, %rd508, %rd665; + selp.u64 %rd509, 1, 0, %p204; + add.s64 %rd510, %rd488, %rd509; + setp.lt.u64 %p205, %rd510, %rd488; + selp.u64 %rd83, 1, 0, %p205; + add.s64 %rd511, %rd494, %rd42; + setp.lt.u64 %p206, %rd511, %rd494; + selp.u64 %rd512, 1, 0, %p206; + add.s64 %rd513, %rd44, %rd512; + add.s64 %rd84, %rd513, %rd508; + setp.lt.u64 %p8, %rd84, %rd513; + setp.eq.s64 %p207, %rd513, 0; + and.pred %p208, %p206, %p207; + or.pred %p209, %p8, %p208; + selp.u64 %rd514, 1, 0, %p209; + add.s64 %rd85, %rd510, %rd514; + setp.lt.u64 %p9, %rd85, %rd510; + 
selp.u64 %rd86, 1, 0, %p9; + add.s64 %rd515, %rd503, %rd497; + add.s64 %rd87, %rd515, %rd511; + setp.lt.u64 %p10, %rd87, %rd515; + @%p80 bra $L__BB0_9; + + setp.eq.s64 %p211, %rd46, 0; + setp.lt.u64 %p212, %rd47, %rd45; + and.pred %p213, %p212, %p211; + add.s64 %rd516, %rd46, %rd82; + setp.lt.u64 %p214, %rd516, %rd46; + or.pred %p215, %p214, %p213; + selp.u64 %rd702, 1, 0, %p215; + +$L__BB0_9: + mov.u64 %rd691, 0; + mul.hi.u64 %rd690, %rd691, %rd691; + add.s64 %rd689, %rd26, %rd690; + selp.u64 %rd517, 1, 0, %p10; + add.s64 %rd518, %rd84, %rd517; + setp.lt.u64 %p216, %rd518, %rd84; + selp.u64 %rd519, 1, 0, %p216; + add.s64 %rd520, %rd85, %rd519; + setp.lt.u64 %p217, %rd520, %rd85; + add.s64 %rd93, %rd702, %rd18; + setp.lt.u64 %p218, %rd93, %rd702; + selp.u64 %rd521, 1, 0, %p218; + add.s64 %rd92, %rd50, %rd521; + setp.lt.u64 %p219, %rd92, %rd50; + selp.u64 %rd522, 1, 0, %p219; + add.s64 %rd91, %rd51, %rd522; + setp.lt.u64 %p220, %rd91, %rd51; + selp.u64 %rd523, 1, 0, %p220; + add.s64 %rd90, %rd52, %rd523; + add.s64 %rd524, %rd91, %rd520; + setp.lt.u64 %p221, %rd524, %rd91; + selp.u64 %rd525, 1, 0, %p221; + add.s64 %rd526, %rd92, %rd518; + add.s64 %rd94, %rd93, %rd87; + setp.lt.u64 %p222, %rd94, %rd93; + selp.u64 %rd527, 1, 0, %p222; + add.s64 %rd95, %rd526, %rd527; + setp.eq.s64 %p223, %rd95, %rd92; + and.pred %p224, %p222, %p223; + setp.lt.u64 %p225, %rd95, %rd92; + or.pred %p226, %p225, %p224; + selp.u64 %rd528, 1, 0, %p226; + add.s64 %rd97, %rd524, %rd528; + setp.lt.u64 %p227, %rd97, %rd524; + selp.u64 %rd529, 1, 0, %p227; + add.s64 %rd530, %rd78, %rd689; + add.s64 %rd531, %rd530, %rd79; + add.s64 %rd532, %rd531, %rd80; + add.s64 %rd533, %rd532, %rd81; + add.s64 %rd534, %rd533, %rd83; + add.s64 %rd535, %rd534, %rd86; + selp.u64 %rd536, 1, 0, %p217; + add.s64 %rd537, %rd535, %rd536; + add.s64 %rd538, %rd537, %rd90; + add.s64 %rd539, %rd538, %rd525; + add.s64 %rd96, %rd539, %rd529; + setp.ne.s64 %p228, %rd96, %rd90; + setp.ne.s64 %p229, %rd97, %rd91; + or.pred %p230, %p229, %p228; + @%p230 bra $L__BB0_11; + + mov.u64 %rd703, 1; + @%p226 bra $L__BB0_12; + +$L__BB0_11: + setp.eq.s64 %p236, %rd96, %rd90; + setp.lt.u64 %p237, %rd97, %rd91; + and.pred %p238, %p237, %p236; + setp.lt.u64 %p239, %rd96, %rd90; + or.pred %p240, %p239, %p238; + selp.u64 %rd703, 1, 0, %p240; + +$L__BB0_12: + mov.u64 %rd696, 0; + mul.hi.u64 %rd695, %rd696, %rd376; + mov.u64 %rd688, 0; + mul.hi.u64 %rd687, %rd688, %rd111; + or.b64 %rd541, %rd95, %rd94; + or.b64 %rd542, %rd541, %rd97; + setp.eq.s64 %p241, %rd542, 0; + mov.u64 %rd543, 0; + setp.eq.s64 %p242, %rd96, 576460752303423505; + and.pred %p243, %p241, %p242; + setp.gt.u64 %p244, %rd96, 576460752303423504; + xor.pred %p245, %p244, %p243; + selp.u64 %rd544, 1, 0, %p245; + mov.u64 %rd545, -1; + mul.hi.u64 %rd546, %rd703, %rd545; + sub.s64 %rd547, %rd546, %rd703; + neg.s64 %rd548, %rd703; + setp.lt.u64 %p246, %rd547, %rd548; + selp.u64 %rd549, 1, 0, %p246; + add.s64 %rd550, %rd547, %rd695; + setp.lt.u64 %p247, %rd550, %rd547; + add.s64 %rd551, %rd550, %rd549; + setp.lt.u64 %p248, %rd551, %rd550; + add.s64 %rd552, %rd551, %rd97; + setp.lt.u64 %p249, %rd552, %rd551; + add.s64 %rd553, %rd547, %rd95; + sub.s64 %rd554, %rd94, %rd703; + setp.lt.u64 %p250, %rd554, %rd94; + selp.u64 %rd555, 1, 0, %p250; + add.s64 %rd556, %rd553, %rd555; + setp.eq.s64 %p251, %rd556, %rd95; + and.pred %p252, %p250, %p251; + setp.lt.u64 %p253, %rd556, %rd95; + or.pred %p254, %p253, %p252; + selp.u64 %rd557, 1, 0, %p254; + add.s64 %rd558, %rd552, %rd557; + setp.lt.u64 %p255, %rd558, %rd552; + 
mov.u64 %rd704, 1; + mul.hi.u64 %rd560, %rd544, %rd704; + mul.hi.u64 %rd561, %rd544, %rd543; + add.s64 %rd562, %rd561, %rd687; + setp.lt.u64 %p256, %rd562, %rd561; + setp.lt.u64 %p257, %rd558, %rd562; + sub.s64 %rd563, %rd558, %rd562; + sub.s64 %rd564, %rd556, %rd560; + setp.lt.u64 %p258, %rd554, %rd544; + selp.b64 %rd565, -1, 0, %p258; + add.s64 %rd566, %rd564, %rd565; + neg.s64 %rd567, %rd566; + setp.eq.s64 %p259, %rd566, %rd556; + and.pred %p260, %p258, %p259; + setp.gt.u64 %p261, %rd566, %rd556; + or.pred %p262, %p261, %p260; + selp.u64 %rd568, 1, 0, %p262; + setp.lt.u64 %p263, %rd563, %rd568; + sub.s64 %rd569, %rd563, %rd568; + sub.s64 %rd570, %rd554, %rd544; + selp.u64 %rd571, 1, 0, %p263; + selp.u64 %rd572, 1, 0, %p257; + selp.b64 %rd573, -1, 0, %p249; + selp.b64 %rd574, -1, 0, %p248; + selp.u64 %rd575, 1, 0, %p256; + selp.b64 %rd576, -1, 0, %p247; + selp.b64 %rd577, -1, 0, %p246; + selp.b64 %rd578, 576460752303423505, 0, %p245; + selp.b64 %rd579, -1, 0, %p255; + setp.ne.s64 %p264, %rd569, 0; + selp.b64 %rd580, -1, 0, %p264; + neg.s64 %rd581, %rd569; + setp.gt.u64 %p265, %rd570, 1; + selp.u64 %rd582, 1, 0, %p265; + setp.ne.s64 %p266, %rd567, %rd582; + or.pred %p267, %p265, %p266; + selp.u64 %rd583, 1, 0, %p267; + setp.lt.u64 %p268, %rd581, %rd583; + selp.b64 %rd584, -1, 0, %p268; + sub.s64 %rd585, %rd581, %rd583; + selp.b64 %rd586, -1, 0, %p265; + sub.s64 %rd587, %rd704, %rd570; + add.s64 %rd588, %rd585, %rd3; + setp.lt.u64 %p269, %rd588, %rd585; + selp.u64 %rd589, 1, 0, %p269; + add.s64 %rd102, %rd587, %rd5; + setp.lt.u64 %p270, %rd102, %rd587; + selp.u64 %rd590, 1, 0, %p270; + add.s64 %rd591, %rd4, %rd586; + add.s64 %rd592, %rd591, %rd590; + sub.s64 %rd100, %rd592, %rd566; + setp.eq.s64 %p271, %rd100, %rd4; + and.pred %p272, %p270, %p271; + setp.lt.u64 %p273, %rd100, %rd4; + or.pred %p274, %p273, %p272; + selp.u64 %rd593, 1, 0, %p274; + add.s64 %rd103, %rd588, %rd593; + setp.lt.u64 %p275, %rd103, %rd588; + selp.u64 %rd594, 1, 0, %p275; + add.s64 %rd595, %rd75, %rd2; + add.s64 %rd596, %rd595, 576460752303423505; + sub.s64 %rd597, %rd596, %rd695; + sub.s64 %rd598, %rd597, %rd695; + sub.s64 %rd599, %rd598, %rd96; + add.s64 %rd600, %rd599, %rd578; + mul.lo.s64 %rd601, %rd703, 576460752303423506; + add.s64 %rd602, %rd600, %rd601; + sub.s64 %rd603, %rd602, %rd546; + add.s64 %rd604, %rd603, %rd561; + add.s64 %rd605, %rd604, %rd577; + add.s64 %rd606, %rd605, %rd576; + add.s64 %rd607, %rd606, %rd575; + add.s64 %rd608, %rd607, %rd574; + add.s64 %rd609, %rd608, %rd573; + add.s64 %rd610, %rd609, %rd579; + add.s64 %rd611, %rd610, %rd572; + add.s64 %rd612, %rd611, %rd571; + add.s64 %rd613, %rd612, %rd580; + add.s64 %rd614, %rd613, %rd584; + add.s64 %rd615, %rd614, %rd589; + add.s64 %rd101, %rd615, %rd594; + setp.eq.s64 %p276, %rd101, 576460752303423505; + or.b64 %rd616, %rd100, %rd102; + or.b64 %rd617, %rd616, %rd103; + setp.eq.s64 %p277, %rd617, 0; + and.pred %p278, %p277, %p276; + setp.gt.u64 %p279, %rd101, 576460752303423504; + xor.pred %p11, %p279, %p278; + selp.u64 %rd104, 1, 0, %p11; + mul.hi.u64 %rd105, %rd104, %rd543; + add.s64 %rd106, %rd105, %rd687; + setp.lt.u64 %p12, %rd106, %rd105; + setp.ne.s64 %p280, %rd101, %rd2; + setp.ne.s64 %p281, %rd103, %rd3; + or.pred %p282, %p281, %p280; + @%p282 bra $L__BB0_14; + + setp.lt.u64 %p284, %rd102, %rd5; + and.pred %p285, %p284, %p271; + or.pred %p287, %p273, %p285; + @%p287 bra $L__BB0_15; + +$L__BB0_14: + setp.eq.s64 %p288, %rd101, %rd2; + setp.lt.u64 %p289, %rd103, %rd3; + and.pred %p290, %p289, %p288; + setp.lt.u64 %p291, %rd101, %rd2; + 
or.pred %p292, %p291, %p290; + selp.u64 %rd704, 1, 0, %p292; + +$L__BB0_15: + mov.u64 %rd698, 0; + mul.hi.u64 %rd697, %rd698, %rd376; + ld.param.u32 %r25, [radix2_dit_butterfly_param_2]; + ld.param.u32 %r24, [radix2_dit_butterfly_param_3]; + shr.s32 %r23, %r24, %r25; + mov.u32 %r22, %tid.x; + mov.u32 %r21, %ctaid.x; + mov.u32 %r20, %ntid.x; + mad.lo.s32 %r19, %r20, %r21, %r22; + add.s32 %r18, %r23, -1; + and.b32 %r17, %r18, %r19; + ld.param.u64 %rd673, [radix2_dit_butterfly_param_0]; + shl.b32 %r16, %r19, 1; + sub.s32 %r15, %r16, %r17; + cvta.to.global.u64 %rd672, %rd673; + add.s32 %r14, %r15, %r23; + mul.wide.s32 %rd671, %r14, 32; + add.s64 %rd670, %rd672, %rd671; + mul.wide.s32 %rd669, %r15, 32; + add.s64 %rd668, %rd672, %rd669; + sub.s64 %rd619, %rd103, %rd106; + setp.lt.u64 %p293, %rd102, %rd104; + selp.b64 %rd620, -1, 0, %p293; + mov.u64 %rd621, 1; + mul.hi.u64 %rd622, %rd104, %rd621; + sub.s64 %rd623, %rd100, %rd622; + add.s64 %rd624, %rd623, %rd620; + setp.gt.u64 %p294, %rd624, %rd100; + setp.eq.s64 %p295, %rd624, %rd100; + and.pred %p296, %p293, %p295; + or.pred %p297, %p294, %p296; + selp.u64 %rd625, 1, 0, %p297; + sub.s64 %rd626, %rd102, %rd104; + mov.u64 %rd627, -1; + mul.hi.u64 %rd628, %rd704, %rd627; + sub.s64 %rd629, %rd628, %rd704; + neg.s64 %rd630, %rd704; + setp.lt.u64 %p298, %rd629, %rd630; + selp.u64 %rd631, 1, 0, %p298; + add.s64 %rd632, %rd629, %rd697; + setp.lt.u64 %p299, %rd632, %rd629; + selp.u64 %rd633, 1, 0, %p299; + add.s64 %rd634, %rd632, %rd631; + setp.lt.u64 %p300, %rd634, %rd632; + selp.u64 %rd635, 1, 0, %p300; + sub.s64 %rd636, %rd619, %rd625; + add.s64 %rd637, %rd634, %rd636; + setp.lt.u64 %p301, %rd637, %rd634; + selp.u64 %rd638, 1, 0, %p301; + add.s64 %rd639, %rd629, %rd624; + sub.s64 %rd640, %rd626, %rd704; + setp.lt.u64 %p302, %rd640, %rd626; + selp.u64 %rd641, 1, 0, %p302; + add.s64 %rd642, %rd639, %rd641; + setp.eq.s64 %p303, %rd642, %rd624; + and.pred %p304, %p302, %p303; + setp.lt.u64 %p305, %rd642, %rd624; + or.pred %p306, %p305, %p304; + selp.u64 %rd643, 1, 0, %p306; + add.s64 %rd644, %rd637, %rd643; + setp.lt.u64 %p307, %rd644, %rd637; + selp.u64 %rd645, 1, 0, %p307; + sub.s64 %rd646, %rd76, %rd105; + selp.s64 %rd647, -1, 0, %p12; + add.s64 %rd648, %rd646, %rd647; + setp.lt.u64 %p308, %rd103, %rd106; + selp.b64 %rd649, -1, 0, %p308; + add.s64 %rd650, %rd648, %rd649; + add.s64 %rd651, %rd650, %rd101; + selp.b64 %rd652, -576460752303423505, 0, %p11; + add.s64 %rd653, %rd651, %rd652; + setp.lt.u64 %p309, %rd619, %rd625; + selp.b64 %rd654, -1, 0, %p309; + add.s64 %rd655, %rd653, %rd654; + add.s64 %rd656, %rd655, %rd628; + mul.lo.s64 %rd657, %rd704, -576460752303423506; + add.s64 %rd658, %rd656, %rd657; + add.s64 %rd659, %rd658, %rd631; + add.s64 %rd660, %rd659, %rd633; + add.s64 %rd661, %rd660, %rd635; + add.s64 %rd662, %rd661, %rd638; + add.s64 %rd663, %rd662, %rd645; + st.global.u64 [%rd668], %rd77; + st.global.u64 [%rd668+8], %rd74; + st.global.u64 [%rd668+16], %rd73; + st.global.u64 [%rd668+24], %rd72; + st.global.u64 [%rd670], %rd663; + st.global.u64 [%rd670+8], %rd644; + st.global.u64 [%rd670+16], %rd642; + st.global.u64 [%rd670+24], %rd640; + +$L__BB0_16: + ret; + +} + // .globl calc_twiddles +.visible .entry calc_twiddles( + .param .u64 calc_twiddles_param_0, + .param .u64 calc_twiddles_param_1, + .param .u32 calc_twiddles_param_2 +) +{ + .reg .pred %p<421>; + .reg .b32 %r<14>; + .reg .b64 %rd<1037>; + + + ld.param.u64 %rd139, [calc_twiddles_param_1]; + ld.param.u32 %r4, [calc_twiddles_param_2]; + mov.u32 %r5, %ctaid.x; + mov.u32 %r6, 
%ntid.x; + mov.u32 %r7, %tid.x; + mad.lo.s32 %r13, %r6, %r5, %r7; + setp.ge.u32 %p18, %r13, %r4; + @%p18 bra $L__BB1_21; + + cvta.to.global.u64 %rd141, %rd139; + ld.global.u64 %rd1017, [%rd141]; + mov.u64 %rd142, 0; + ld.global.u64 %rd1018, [%rd141+8]; + ld.global.u64 %rd1019, [%rd141+16]; + ld.global.u64 %rd1020, [%rd141+24]; + mov.u64 %rd143, 5151653887; + mul.hi.u64 %rd144, %rd142, %rd143; + mov.u64 %rd145, -2802499714047; + mul.hi.u64 %rd5, %rd142, %rd145; + add.s64 %rd6, %rd144, %rd5; + mov.u64 %rd140, 1; + mov.u64 %rd146, -9469952; + mul.hi.u64 %rd7, %rd146, %rd140; + mul.hi.u64 %rd147, %rd140, %rd145; + setp.gt.u64 %p19, %rd147, -5151653888; + selp.u64 %rd148, 1, 0, %p19; + add.s64 %rd149, %rd147, 5151653887; + mul.hi.u64 %rd150, %rd140, %rd143; + add.s64 %rd151, %rd150, %rd5; + setp.lt.u64 %p20, %rd151, %rd150; + selp.u64 %rd152, 1, 0, %p20; + add.s64 %rd153, %rd151, %rd148; + setp.lt.u64 %p21, %rd153, %rd151; + selp.u64 %rd154, 1, 0, %p21; + add.s64 %rd155, %rd5, %rd7; + add.s64 %rd8, %rd155, 576413109808302096; + setp.gt.u64 %p2, %rd153, 9469951; + selp.u64 %rd156, 1, 0, %p2; + add.s64 %rd157, %rd6, %rd8; + mul.hi.u64 %rd158, %rd140, %rd142; + add.s64 %rd159, %rd157, %rd158; + add.s64 %rd160, %rd159, %rd152; + add.s64 %rd161, %rd160, %rd154; + add.s64 %rd10, %rd161, %rd156; + add.s64 %rd162, %rd153, -9469952; + mul.hi.u64 %rd11, %rd142, %rd142; + mul.hi.u64 %rd12, %rd142, %rd140; + mov.u64 %rd163, 9469952; + sub.s64 %rd164, %rd163, %rd153; + mov.u64 %rd165, -1; + mul.hi.u64 %rd166, %rd162, %rd165; + mul.hi.u64 %rd167, %rd149, %rd165; + mov.u64 %rd168, -5151653887; + sub.s64 %rd169, %rd168, %rd147; + mul.hi.u64 %rd170, %rd145, %rd165; + setp.gt.u64 %p22, %rd170, -2802499714048; + selp.u64 %rd171, 1, 0, %p22; + add.s64 %rd172, %rd170, 2802499714047; + mov.u64 %rd173, 2802499714047; + add.s64 %rd174, %rd167, %rd169; + setp.lt.u64 %p23, %rd174, %rd167; + add.s64 %rd16, %rd172, %rd169; + setp.lt.u64 %p24, %rd16, %rd172; + selp.u64 %rd175, 1, 0, %p24; + add.s64 %rd176, %rd175, %rd171; + selp.u64 %rd177, 1, 0, %p23; + add.s64 %rd178, %rd174, %rd172; + setp.lt.u64 %p25, %rd178, %rd174; + selp.u64 %rd179, 1, 0, %p25; + add.s64 %rd180, %rd178, %rd164; + setp.lt.u64 %p26, %rd180, %rd178; + selp.u64 %rd181, 1, 0, %p26; + add.s64 %rd14, %rd176, %rd180; + setp.lt.u64 %p27, %rd14, %rd176; + selp.u64 %rd182, 1, 0, %p27; + add.s64 %rd183, %rd166, %rd164; + add.s64 %rd184, %rd183, %rd170; + add.s64 %rd185, %rd184, %rd174; + add.s64 %rd186, %rd185, %rd171; + add.s64 %rd187, %rd186, %rd177; + add.s64 %rd188, %rd187, 576415912307998736; + sub.s64 %rd189, %rd188, %rd10; + add.s64 %rd190, %rd189, %rd179; + add.s64 %rd191, %rd190, %rd181; + add.s64 %rd13, %rd191, %rd182; + mul.hi.u64 %rd192, %rd14, %rd140; + add.s64 %rd15, %rd13, %rd192; + mul.hi.u64 %rd17, %rd142, %rd173; + add.s64 %rd193, %rd17, -576413109808284689; + mul.hi.u64 %rd194, %rd16, %rd140; + mul.hi.u64 %rd195, %rd16, %rd142; + mul.hi.u64 %rd196, %rd173, %rd140; + add.s64 %rd197, %rd196, %rd16; + setp.lt.u64 %p28, %rd197, %rd196; + selp.u64 %rd198, 1, 0, %p28; + mul.hi.u64 %rd199, %rd173, %rd142; + add.s64 %rd200, %rd199, %rd194; + setp.lt.u64 %p29, %rd200, %rd199; + selp.u64 %rd201, 1, 0, %p29; + add.s64 %rd202, %rd200, %rd198; + setp.lt.u64 %p30, %rd202, %rd200; + selp.u64 %rd203, 1, 0, %p30; + add.s64 %rd18, %rd193, %rd15; + add.s64 %rd21, %rd202, %rd14; + setp.lt.u64 %p5, %rd21, %rd202; + selp.u64 %rd204, 1, 0, %p5; + add.s64 %rd205, %rd18, %rd12; + add.s64 %rd206, %rd205, %rd199; + add.s64 %rd207, %rd206, %rd195; + add.s64 %rd208, 
%rd207, %rd201; + add.s64 %rd209, %rd208, %rd203; + add.s64 %rd20, %rd209, %rd204; + add.s64 %rd210, %rd21, %rd162; + setp.lt.u64 %p31, %rd210, %rd21; + selp.u64 %rd211, 1, 0, %p31; + add.s64 %rd212, %rd170, %rd196; + add.s64 %rd213, %rd212, 2802499714048; + setp.le.u64 %p32, %rd213, %rd197; + selp.u64 %rd214, 1, 0, %p32; + add.s64 %rd23, %rd210, %rd214; + setp.lt.u64 %p33, %rd23, %rd210; + selp.u64 %rd215, 1, 0, %p33; + add.s64 %rd216, %rd10, %rd211; + add.s64 %rd22, %rd216, %rd215; + setp.eq.s64 %p34, %rd22, 0; + setp.eq.s64 %p35, %rd23, %rd21; + and.pred %p36, %p35, %p34; + and.pred %p37, %p32, %p36; + mov.u64 %rd1015, %rd140; + @%p37 bra $L__BB1_3; + + setp.lt.u64 %p39, %rd23, %rd21; + and.pred %p40, %p39, %p34; + add.s64 %rd217, %rd22, %rd20; + setp.lt.u64 %p41, %rd217, %rd22; + or.pred %p42, %p41, %p40; + selp.u64 %rd1015, 1, 0, %p42; + +$L__BB1_3: + mul.hi.u64 %rd1012, %rd14, %rd140; + add.s64 %rd1011, %rd13, %rd1012; + add.s64 %rd1010, %rd17, -576413109808284689; + add.s64 %rd1009, %rd1010, %rd1011; + setp.lt.u64 %p419, %rd1009, %rd1010; + setp.lt.u64 %p418, %rd1011, %rd13; + mov.u64 %rd1008, 5151653887; + mul.hi.u64 %rd1007, %rd142, %rd1008; + setp.lt.u64 %p417, %rd6, %rd1007; + mul.hi.u64 %rd220, %rd142, %rd146; + mov.u64 %rd221, 576413109808302096; + mul.hi.u64 %rd222, %rd142, %rd221; + add.s64 %rd223, %rd222, %rd220; + setp.lt.u64 %p43, %rd223, %rd222; + mul.hi.u64 %rd224, %rd146, %rd142; + mul.hi.u64 %rd226, %rd221, %rd140; + add.s64 %rd227, %rd224, %rd226; + setp.lt.u64 %p44, %rd227, %rd224; + setp.gt.u64 %p45, %rd7, -576413109808302097; + selp.u64 %rd228, 1, 0, %p45; + add.s64 %rd229, %rd227, %rd228; + setp.lt.u64 %p46, %rd229, %rd227; + add.s64 %rd230, %rd11, %rd220; + add.s64 %rd231, %rd6, %rd230; + selp.u64 %rd232, 1, 0, %p417; + add.s64 %rd233, %rd231, %rd232; + setp.lt.u64 %p47, %rd233, %rd220; + selp.u64 %rd234, 1, 0, %p47; + add.s64 %rd235, %rd223, %rd234; + setp.lt.u64 %p48, %rd235, %rd223; + add.s64 %rd236, %rd229, %rd6; + setp.lt.u64 %p49, %rd236, %rd229; + mul.hi.u64 %rd237, %rd221, %rd142; + add.s64 %rd238, %rd224, %rd12; + add.s64 %rd239, %rd238, %rd237; + add.s64 %rd240, %rd239, %rd233; + selp.u64 %rd241, 1, 0, %p44; + add.s64 %rd242, %rd240, %rd241; + selp.u64 %rd243, 1, 0, %p46; + add.s64 %rd244, %rd242, %rd243; + selp.u64 %rd245, 1, 0, %p49; + add.s64 %rd246, %rd244, %rd245; + setp.lt.u64 %p50, %rd246, %rd233; + setp.eq.s64 %p51, %rd246, %rd233; + and.pred %p52, %p49, %p51; + or.pred %p53, %p50, %p52; + selp.u64 %rd247, 1, 0, %p53; + add.s64 %rd248, %rd235, %rd247; + setp.lt.u64 %p54, %rd248, %rd235; + setp.lt.u64 %p55, %rd8, %rd5; + selp.u64 %rd249, 1, 0, %p55; + add.s64 %rd250, %rd236, %rd249; + setp.lt.u64 %p56, %rd10, %rd8; + setp.eq.s64 %p57, %rd10, %rd8; + and.pred %p58, %p2, %p57; + or.pred %p59, %p56, %p58; + selp.u64 %rd251, 1, 0, %p59; + add.s64 %rd252, %rd250, %rd251; + setp.lt.u64 %p60, %rd252, %rd236; + selp.u64 %rd253, 1, 0, %p60; + add.s64 %rd254, %rd246, %rd253; + setp.lt.u64 %p61, %rd254, %rd246; + selp.u64 %rd255, 1, 0, %p61; + add.s64 %rd256, %rd248, %rd255; + setp.lt.u64 %p62, %rd256, %rd248; + mul.hi.u64 %rd257, %rd14, %rd142; + mul.hi.u64 %rd258, %rd13, %rd142; + mul.lo.s64 %rd259, %rd14, 576460752303423505; + mov.u64 %rd260, 576460752303423505; + add.s64 %rd261, %rd259, %rd257; + setp.lt.u64 %p63, %rd261, %rd259; + mul.lo.s64 %rd262, %rd13, 576460752303423505; + add.s64 %rd263, %rd262, %rd258; + setp.lt.u64 %p64, %rd263, %rd262; + mul.hi.u64 %rd264, %rd14, %rd260; + add.s64 %rd265, %rd263, %rd264; + setp.lt.u64 %p65, %rd265, %rd263; + 
selp.u64 %rd266, 1, 0, %p63; + add.s64 %rd267, %rd265, %rd266; + setp.lt.u64 %p66, %rd267, %rd265; + mul.hi.u64 %rd268, %rd13, %rd140; + add.s64 %rd269, %rd268, %rd257; + setp.lt.u64 %p67, %rd269, %rd268; + selp.u64 %rd270, 1, 0, %p418; + add.s64 %rd271, %rd269, %rd270; + setp.lt.u64 %p68, %rd271, %rd269; + mul.hi.u64 %rd273, %rd260, %rd173; + mul.lo.s64 %rd274, %rd16, 576460752303423505; + add.s64 %rd275, %rd273, %rd274; + setp.lt.u64 %p69, %rd275, %rd273; + mul.hi.u64 %rd276, %rd142, %rd16; + add.s64 %rd277, %rd276, %rd275; + setp.lt.u64 %p70, %rd277, %rd276; + setp.gt.u64 %p71, %rd17, 576413109808284688; + selp.u64 %rd278, 1, 0, %p71; + add.s64 %rd279, %rd277, %rd278; + setp.lt.u64 %p72, %rd279, %rd277; + add.s64 %rd280, %rd257, %rd12; + add.s64 %rd281, %rd280, %rd258; + add.s64 %rd282, %rd281, %rd261; + selp.u64 %rd283, 1, 0, %p67; + add.s64 %rd284, %rd282, %rd283; + selp.u64 %rd285, 1, 0, %p68; + add.s64 %rd286, %rd284, %rd285; + setp.lt.u64 %p73, %rd286, %rd261; + selp.u64 %rd287, 1, 0, %p73; + add.s64 %rd288, %rd267, %rd287; + setp.lt.u64 %p74, %rd288, %rd267; + add.s64 %rd289, %rd271, %rd279; + setp.lt.u64 %p75, %rd289, %rd271; + add.s64 %rd290, %rd17, %rd11; + mul.hi.u64 %rd291, %rd260, %rd16; + add.s64 %rd292, %rd290, %rd291; + selp.u64 %rd293, 1, 0, %p69; + add.s64 %rd294, %rd292, %rd293; + selp.u64 %rd295, 1, 0, %p70; + add.s64 %rd296, %rd294, %rd295; + selp.u64 %rd297, 1, 0, %p72; + add.s64 %rd298, %rd296, %rd297; + selp.u64 %rd299, 1, 0, %p75; + add.s64 %rd300, %rd298, %rd299; + add.s64 %rd301, %rd300, %rd286; + setp.lt.u64 %p76, %rd301, %rd300; + setp.eq.s64 %p77, %rd300, 0; + and.pred %p78, %p75, %p77; + or.pred %p79, %p76, %p78; + selp.u64 %rd302, 1, 0, %p79; + add.s64 %rd303, %rd288, %rd302; + setp.lt.u64 %p80, %rd303, %rd288; + selp.u64 %rd304, 1, 0, %p419; + add.s64 %rd305, %rd289, %rd304; + setp.lt.u64 %p81, %rd20, %rd1009; + setp.eq.s64 %p82, %rd20, %rd1009; + and.pred %p83, %p5, %p82; + or.pred %p84, %p81, %p83; + selp.u64 %rd306, 1, 0, %p84; + add.s64 %rd307, %rd305, %rd306; + setp.lt.u64 %p85, %rd307, %rd289; + selp.u64 %rd308, 1, 0, %p85; + add.s64 %rd309, %rd301, %rd308; + setp.lt.u64 %p86, %rd309, %rd301; + selp.u64 %rd310, 1, 0, %p86; + add.s64 %rd311, %rd303, %rd310; + setp.lt.u64 %p87, %rd311, %rd303; + add.s64 %rd29, %rd1015, %rd252; + setp.lt.u64 %p88, %rd29, %rd1015; + selp.u64 %rd312, 1, 0, %p88; + add.s64 %rd28, %rd254, %rd312; + setp.lt.u64 %p89, %rd28, %rd254; + selp.u64 %rd313, 1, 0, %p89; + add.s64 %rd27, %rd256, %rd313; + setp.lt.u64 %p90, %rd27, %rd256; + selp.u64 %rd314, 1, 0, %p90; + add.s64 %rd315, %rd223, %rd11; + selp.u64 %rd316, 1, 0, %p43; + add.s64 %rd317, %rd315, %rd316; + selp.u64 %rd318, 1, 0, %p48; + add.s64 %rd319, %rd317, %rd318; + selp.u64 %rd320, 1, 0, %p54; + add.s64 %rd321, %rd319, %rd320; + selp.u64 %rd322, 1, 0, %p62; + add.s64 %rd323, %rd321, %rd322; + add.s64 %rd26, %rd323, %rd314; + add.s64 %rd324, %rd27, %rd311; + setp.lt.u64 %p91, %rd324, %rd27; + selp.u64 %rd325, 1, 0, %p91; + add.s64 %rd326, %rd28, %rd309; + add.s64 %rd30, %rd29, %rd307; + setp.lt.u64 %p92, %rd30, %rd29; + selp.u64 %rd327, 1, 0, %p92; + add.s64 %rd31, %rd326, %rd327; + setp.eq.s64 %p93, %rd31, %rd28; + and.pred %p94, %p92, %p93; + setp.lt.u64 %p95, %rd31, %rd28; + or.pred %p96, %p95, %p94; + selp.u64 %rd328, 1, 0, %p96; + add.s64 %rd33, %rd324, %rd328; + setp.lt.u64 %p97, %rd33, %rd324; + selp.u64 %rd329, 1, 0, %p97; + add.s64 %rd330, %rd257, %rd11; + mul.hi.u64 %rd331, %rd13, %rd260; + add.s64 %rd332, %rd330, %rd331; + selp.u64 %rd333, 1, 0, %p64; + 
add.s64 %rd334, %rd332, %rd333; + selp.u64 %rd335, 1, 0, %p65; + add.s64 %rd336, %rd334, %rd335; + selp.u64 %rd337, 1, 0, %p66; + add.s64 %rd338, %rd336, %rd337; + selp.u64 %rd339, 1, 0, %p74; + add.s64 %rd340, %rd338, %rd339; + selp.u64 %rd341, 1, 0, %p80; add.s64 %rd342, %rd340, %rd341; - add.s64 %rd343, %rd342, %rd331; - setp.lt.u64 %p61, %rd343, %rd337; - selp.u64 %rd344, 1, 0, %p61; - add.s64 %rd345, %rd335, %rd343; - setp.lt.u64 %p62, %rd345, %rd335; - selp.u64 %rd30, 1, 0, %p62; - add.s64 %rd346, %rd345, %rd339; - setp.lt.u64 %p63, %rd346, %rd345; - selp.u64 %rd31, 1, 0, %p63; - mul.hi.u64 %rd347, %rd281, %rd108; - mul.hi.u64 %rd348, %rd281, %rd764; - add.s64 %rd349, %rd347, %rd27; - add.s64 %rd350, %rd349, %rd348; - mul.hi.u64 %rd351, %rd291, %rd108; - mul.hi.u64 %rd352, %rd291, %rd764; - add.s64 %rd353, %rd351, %rd27; - add.s64 %rd354, %rd347, %rd10; - add.s64 %rd355, %rd354, %rd347; - add.s64 %rd356, %rd350, %rd291; - setp.lt.u64 %p64, %rd356, %rd350; - selp.u64 %rd357, 1, 0, %p64; - add.s64 %rd358, %rd353, %rd355; - add.s64 %rd359, %rd358, %rd352; - setp.lt.u64 %p65, %rd359, %rd355; - selp.u64 %rd360, 1, 0, %p65; - add.s64 %rd361, %rd359, %rd357; - setp.lt.u64 %p66, %rd361, %rd359; - selp.u64 %rd362, 1, 0, %p66; - add.s64 %rd363, %rd338, %rd322; - setp.lt.u64 %p67, %rd363, %rd338; - selp.u64 %rd364, 1, 0, %p67; - add.s64 %rd41, %rd361, %rd23; - setp.lt.u64 %p68, %rd41, %rd361; - selp.u64 %rd365, 1, 0, %p68; - add.s64 %rd366, %rd28, %rd10; - add.s64 %rd367, %rd363, %rd366; - add.s64 %rd368, %rd367, %rd351; - add.s64 %rd369, %rd368, %rd351; - add.s64 %rd370, %rd369, %rd355; - add.s64 %rd371, %rd370, %rd360; - add.s64 %rd372, %rd371, %rd362; - add.s64 %rd32, %rd372, %rd365; - setp.eq.s64 %p69, %rd32, %rd363; - and.pred %p70, %p68, %p69; - setp.lt.u64 %p71, %rd32, %rd363; - or.pred %p72, %p71, %p70; - selp.u64 %rd373, 1, 0, %p72; - add.s64 %rd374, %rd25, %rd312; - add.s64 %rd375, %rd374, %rd26; - add.s64 %rd376, %rd375, %rd28; - add.s64 %rd377, %rd376, %rd326; - add.s64 %rd34, %rd377, %rd327; - add.s64 %rd38, %rd346, %rd35; - setp.lt.u64 %p3, %rd38, %rd346; - add.s64 %rd378, %rd341, %rd22; - add.s64 %rd379, %rd378, %rd330; - add.s64 %rd380, %rd379, %rd333; - add.s64 %rd381, %rd380, %rd336; - add.s64 %rd37, %rd381, %rd344; - add.s64 %rd382, %rd38, %rd364; - add.s64 %rd39, %rd382, %rd373; - setp.lt.u64 %p73, %rd39, %rd38; - selp.u64 %rd40, 1, 0, %p73; - add.s64 %rd383, %rd41, %rd234; - setp.lt.u64 %p74, %rd383, %rd41; - selp.u64 %rd384, 1, 0, %p74; - setp.ne.s64 %p75, %rd203, 0; - selp.u64 %rd385, 1, 0, %p75; - add.s64 %rd386, %rd286, %rd385; - add.s64 %rd387, %rd386, %rd350; - setp.eq.s64 %p76, %rd386, %rd291; - setp.lt.u64 %p77, %rd387, %rd356; - and.pred %p78, %p75, %p76; - or.pred %p79, %p77, %p78; - selp.u64 %rd388, 1, 0, %p79; - add.s64 %rd44, %rd383, %rd388; - setp.lt.u64 %p80, %rd44, %rd383; - selp.u64 %rd389, 1, 0, %p80; - add.s64 %rd390, %rd246, %rd384; - add.s64 %rd42, %rd390, %rd389; - add.s64 %rd43, %rd42, %rd32; - setp.ne.s64 %p81, %rd42, 0; - setp.ne.s64 %p82, %rd44, %rd41; - or.pred %p83, %p82, %p81; - not.pred %p84, %p79; - or.pred %p4, %p83, %p84; - not.pred %p85, %p4; - @%p85 bra $L__BB0_2; - - setp.eq.s64 %p86, %rd42, 0; - setp.lt.u64 %p87, %rd44, %rd41; - and.pred %p88, %p87, %p86; - setp.lt.u64 %p89, %rd43, %rd32; - or.pred %p90, %p89, %p88; - selp.u64 %rd764, 1, 0, %p90; - -$L__BB0_2: - mov.u64 %rd759, 0; - mul.hi.u64 %rd758, %rd759, %rd759; - setp.lt.u64 %p294, %rd17, %rd259; - selp.u64 %rd740, 1, 0, %p3; - mov.u64 %rd765, 1; - mov.u64 %rd738, 0; - setp.lt.u64 
%p91, %rd19, %rd18; - selp.u64 %rd392, 1, 0, %p91; - add.s64 %rd47, %rd16, %rd392; - setp.lt.u64 %p92, %rd47, %rd16; - selp.u64 %rd393, 1, 0, %p92; - add.s64 %rd48, %rd17, %rd393; - setp.lt.u64 %p93, %rd48, %rd17; - mul.hi.u64 %rd396, %rd738, %rd316; - mul.lo.s64 %rd397, %rd21, 576460752303423505; - add.s64 %rd398, %rd397, %rd25; - setp.lt.u64 %p94, %rd398, %rd397; - mul.hi.u64 %rd399, %rd23, %rd316; - add.s64 %rd400, %rd399, %rd24; - add.s64 %rd401, %rd400, %rd396; - add.s64 %rd402, %rd398, %rd401; - setp.lt.u64 %p95, %rd402, %rd398; - selp.u64 %rd403, 1, 0, %p2; - add.s64 %rd404, %rd402, %rd403; - setp.lt.u64 %p96, %rd404, %rd402; - setp.lt.u64 %p97, %rd34, %rd26; - selp.u64 %rd405, 1, 0, %p97; - add.s64 %rd49, %rd404, %rd405; - setp.lt.u64 %p98, %rd49, %rd404; - add.s64 %rd406, %rd34, %rd37; - add.s64 %rd407, %rd406, %rd30; - add.s64 %rd408, %rd407, %rd31; - add.s64 %rd409, %rd408, %rd740; - setp.lt.u64 %p99, %rd409, %rd34; - setp.eq.s64 %p100, %rd409, %rd34; - and.pred %p101, %p3, %p100; - or.pred %p102, %p99, %p101; - selp.u64 %rd410, 1, 0, %p102; - add.s64 %rd411, %rd49, %rd410; - setp.lt.u64 %p103, %rd411, %rd49; - add.s64 %rd412, %rd409, %rd40; - setp.lt.u64 %p104, %rd412, %rd409; - selp.u64 %rd413, 1, 0, %p104; - add.s64 %rd414, %rd411, %rd413; - setp.lt.u64 %p105, %rd414, %rd411; - add.s64 %rd415, %rd764, %rd19; - setp.lt.u64 %p106, %rd415, %rd764; - selp.u64 %rd416, 1, 0, %p106; - add.s64 %rd417, %rd47, %rd416; - setp.lt.u64 %p107, %rd417, %rd47; - selp.u64 %rd418, 1, 0, %p107; - add.s64 %rd52, %rd48, %rd418; - setp.lt.u64 %p108, %rd52, %rd48; - selp.u64 %rd419, 1, 0, %p108; - add.s64 %rd420, %rd7, %rd758; - add.s64 %rd421, %rd420, %rd7; - add.s64 %rd422, %rd14, %rd421; - add.s64 %rd423, %rd422, %rd8; - add.s64 %rd424, %rd423, %rd9; - add.s64 %rd425, %rd424, %rd11; - add.s64 %rd426, %rd425, %rd12; - add.s64 %rd427, %rd426, %rd13; - add.s64 %rd428, %rd427, %rd15; - selp.u64 %rd429, 1, 0, %p294; - add.s64 %rd430, %rd428, %rd429; - selp.u64 %rd431, 1, 0, %p93; - add.s64 %rd50, %rd430, %rd431; - add.s64 %rd51, %rd50, %rd419; - add.s64 %rd432, %rd52, %rd414; - setp.lt.u64 %p109, %rd432, %rd52; - selp.u64 %rd433, 1, 0, %p109; - add.s64 %rd434, %rd417, %rd412; - add.s64 %rd53, %rd415, %rd39; - setp.lt.u64 %p110, %rd53, %rd415; - selp.u64 %rd435, 1, 0, %p110; - add.s64 %rd54, %rd434, %rd435; - setp.eq.s64 %p111, %rd54, %rd417; - and.pred %p112, %p110, %p111; - setp.lt.u64 %p113, %rd54, %rd417; - or.pred %p114, %p113, %p112; - selp.u64 %rd436, 1, 0, %p114; - add.s64 %rd57, %rd432, %rd436; - setp.lt.u64 %p115, %rd57, %rd432; - selp.u64 %rd437, 1, 0, %p115; - add.s64 %rd438, %rd29, %rd22; - add.s64 %rd439, %rd438, %rd396; - mul.hi.u64 %rd440, %rd21, %rd316; - add.s64 %rd441, %rd439, %rd440; - selp.u64 %rd442, 1, 0, %p94; - add.s64 %rd443, %rd441, %rd442; - selp.u64 %rd444, 1, 0, %p95; - add.s64 %rd445, %rd443, %rd444; - selp.u64 %rd446, 1, 0, %p96; - add.s64 %rd447, %rd445, %rd446; - selp.u64 %rd448, 1, 0, %p98; - add.s64 %rd55, %rd447, %rd448; - selp.u64 %rd449, 1, 0, %p103; - add.s64 %rd450, %rd55, %rd449; - selp.u64 %rd451, 1, 0, %p105; - add.s64 %rd452, %rd450, %rd451; - add.s64 %rd453, %rd452, %rd51; - add.s64 %rd454, %rd453, %rd433; - add.s64 %rd56, %rd454, %rd437; - setp.ne.s64 %p116, %rd56, %rd51; - setp.ne.s64 %p117, %rd57, %rd52; - or.pred %p118, %p117, %p116; - not.pred %p119, %p114; - or.pred %p120, %p118, %p119; - not.pred %p121, %p120; - @%p121 bra $L__BB0_4; - - setp.eq.s64 %p122, %rd56, %rd51; - setp.lt.u64 %p123, %rd57, %rd52; - and.pred %p124, %p123, %p122; - setp.lt.u64 
%p125, %rd56, %rd51; - or.pred %p126, %p125, %p124; - selp.u64 %rd765, 1, 0, %p126; - -$L__BB0_4: - mov.u64 %rd761, 0; - mul.hi.u64 %rd760, %rd761, %rd761; - mov.u64 %rd743, -1; - mov.u64 %rd766, 1; - mov.u64 %rd741, 0; - or.b64 %rd456, %rd54, %rd53; - or.b64 %rd457, %rd456, %rd57; - setp.ne.s64 %p127, %rd457, 0; - setp.ne.s64 %p128, %rd56, 576460752303423505; - or.pred %p129, %p127, %p128; - setp.gt.u64 %p130, %rd56, 576460752303423504; - and.pred %p131, %p130, %p129; - selp.u64 %rd459, 1, 0, %p131; - add.s64 %rd460, %rd20, %rd760; - add.s64 %rd60, %rd460, %rd20; - mul.hi.u64 %rd461, %rd765, %rd741; - add.s64 %rd462, %rd461, %rd20; - mul.hi.u64 %rd464, %rd765, %rd743; - add.s64 %rd465, %rd462, %rd464; - sub.s64 %rd466, %rd465, %rd765; - neg.s64 %rd467, %rd765; - setp.lt.u64 %p132, %rd466, %rd467; - selp.u64 %rd468, 1, 0, %p132; - add.s64 %rd469, %rd466, %rd60; - setp.lt.u64 %p133, %rd469, %rd466; - selp.u64 %rd470, 1, 0, %p133; - add.s64 %rd471, %rd469, %rd468; - setp.lt.u64 %p134, %rd471, %rd469; - selp.u64 %rd472, 1, 0, %p134; - add.s64 %rd473, %rd471, %rd57; - setp.lt.u64 %p135, %rd473, %rd471; - selp.u64 %rd474, 1, 0, %p135; - sub.s64 %rd475, %rd53, %rd765; - setp.lt.u64 %p136, %rd475, %rd53; - selp.u64 %rd476, 1, 0, %p136; - add.s64 %rd477, %rd54, %rd476; - add.s64 %rd478, %rd477, %rd466; - setp.eq.s64 %p137, %rd478, %rd54; - and.pred %p138, %p136, %p137; - setp.lt.u64 %p139, %rd478, %rd54; - or.pred %p140, %p139, %p138; - selp.u64 %rd479, 1, 0, %p140; - add.s64 %rd480, %rd473, %rd479; - setp.lt.u64 %p141, %rd480, %rd473; - selp.u64 %rd481, 1, 0, %p141; - mul.hi.u64 %rd482, %rd459, %rd741; - mul.hi.u64 %rd483, %rd459, %rd766; - add.s64 %rd484, %rd482, %rd760; - add.s64 %rd485, %rd484, %rd482; - add.s64 %rd486, %rd485, %rd28; - setp.lt.u64 %p142, %rd486, %rd485; - selp.b64 %rd487, -576460752303423505, 0, %p131; - selp.b64 %rd488, -1, 0, %p142; - setp.lt.u64 %p143, %rd480, %rd486; - selp.b64 %rd489, -1, 0, %p143; - sub.s64 %rd490, %rd480, %rd486; - setp.lt.u64 %p144, %rd475, %rd459; - selp.b64 %rd491, -1, 0, %p144; - add.s64 %rd492, %rd482, %rd27; - sub.s64 %rd493, %rd491, %rd492; - sub.s64 %rd494, %rd493, %rd483; - add.s64 %rd495, %rd494, %rd478; - setp.eq.s64 %p145, %rd494, 0; - and.pred %p146, %p144, %p145; - setp.gt.u64 %p147, %rd495, %rd478; - or.pred %p148, %p147, %p146; - selp.u64 %rd496, 1, 0, %p148; - setp.lt.u64 %p149, %rd490, %rd496; - selp.b64 %rd497, -1, 0, %p149; - sub.s64 %rd498, %rd490, %rd496; - sub.s64 %rd499, %rd475, %rd459; - add.s64 %rd500, %rd498, %rd3; - setp.lt.u64 %p150, %rd500, %rd498; - selp.u64 %rd501, 1, 0, %p150; - add.s64 %rd63, %rd499, %rd5; - setp.lt.u64 %p151, %rd63, %rd499; - selp.u64 %rd502, 1, 0, %p151; - add.s64 %rd503, %rd4, %rd502; - add.s64 %rd61, %rd503, %rd495; - setp.eq.s64 %p152, %rd61, %rd4; - and.pred %p153, %p151, %p152; - setp.lt.u64 %p154, %rd61, %rd4; - or.pred %p155, %p154, %p153; - selp.u64 %rd504, 1, 0, %p155; - add.s64 %rd64, %rd500, %rd504; - setp.lt.u64 %p156, %rd64, %rd500; - selp.u64 %rd505, 1, 0, %p156; - sub.s64 %rd506, %rd2, %rd22; - add.s64 %rd507, %rd506, %rd60; - add.s64 %rd508, %rd507, %rd60; - sub.s64 %rd509, %rd508, %rd28; - add.s64 %rd510, %rd509, %rd56; - add.s64 %rd511, %rd510, %rd487; - mul.lo.s64 %rd512, %rd765, -576460752303423506; - add.s64 %rd513, %rd511, %rd512; - add.s64 %rd514, %rd513, %rd465; - sub.s64 %rd515, %rd514, %rd485; - add.s64 %rd516, %rd515, %rd468; - add.s64 %rd517, %rd516, %rd470; - add.s64 %rd518, %rd517, %rd488; - add.s64 %rd519, %rd518, %rd472; - add.s64 %rd520, %rd519, %rd474; - add.s64 
%rd521, %rd520, %rd481; - add.s64 %rd522, %rd521, %rd489; - add.s64 %rd523, %rd522, %rd497; - add.s64 %rd524, %rd523, %rd501; - add.s64 %rd62, %rd524, %rd505; - setp.ne.s64 %p157, %rd62, 576460752303423505; - or.b64 %rd525, %rd61, %rd63; - or.b64 %rd526, %rd525, %rd64; - setp.ne.s64 %p158, %rd526, 0; - setp.gt.u64 %p159, %rd62, 576460752303423504; - or.pred %p160, %p158, %p157; - and.pred %p5, %p159, %p160; - selp.u64 %rd65, 1, 0, %p5; - mul.hi.u64 %rd66, %rd65, %rd741; - add.s64 %rd527, %rd66, %rd760; - add.s64 %rd67, %rd527, %rd66; - add.s64 %rd68, %rd67, %rd28; - setp.lt.u64 %p6, %rd68, %rd67; - setp.ne.s64 %p161, %rd62, %rd2; - setp.ne.s64 %p162, %rd64, %rd3; + selp.u64 %rd343, 1, 0, %p87; + add.s64 %rd344, %rd342, %rd343; + add.s64 %rd345, %rd344, %rd26; + add.s64 %rd346, %rd345, %rd325; + add.s64 %rd32, %rd346, %rd329; + setp.ne.s64 %p98, %rd32, %rd26; + setp.ne.s64 %p99, %rd33, %rd27; + or.pred %p100, %p99, %p98; + @%p100 bra $L__BB1_5; + + mov.u64 %rd1016, 1; + @%p96 bra $L__BB1_6; + +$L__BB1_5: + setp.eq.s64 %p106, %rd32, %rd26; + setp.lt.u64 %p107, %rd33, %rd27; + and.pred %p108, %p107, %p106; + setp.lt.u64 %p109, %rd32, %rd26; + or.pred %p110, %p109, %p108; + selp.u64 %rd1016, 1, 0, %p110; + +$L__BB1_6: + or.b64 %rd348, %rd31, %rd30; + or.b64 %rd349, %rd348, %rd33; + setp.eq.s64 %p111, %rd349, 0; + mov.u64 %rd350, 0; + setp.eq.s64 %p112, %rd32, 576460752303423505; + and.pred %p113, %p111, %p112; + setp.gt.u64 %p114, %rd32, 576460752303423504; + xor.pred %p115, %p114, %p113; + selp.u64 %rd351, 1, 0, %p115; + mov.u64 %rd352, -1; + mul.hi.u64 %rd353, %rd1016, %rd352; + sub.s64 %rd354, %rd353, %rd1016; + neg.s64 %rd355, %rd1016; + setp.lt.u64 %p116, %rd354, %rd355; + selp.u64 %rd356, 1, 0, %p116; + mul.hi.u64 %rd36, %rd350, %rd352; + add.s64 %rd357, %rd354, %rd36; + setp.lt.u64 %p117, %rd357, %rd354; + selp.u64 %rd358, 1, 0, %p117; + add.s64 %rd359, %rd357, %rd356; + setp.lt.u64 %p118, %rd359, %rd357; + selp.u64 %rd360, 1, 0, %p118; + add.s64 %rd361, %rd359, %rd33; + setp.lt.u64 %p119, %rd361, %rd359; + selp.u64 %rd362, 1, 0, %p119; + add.s64 %rd363, %rd354, %rd31; + sub.s64 %rd364, %rd30, %rd1016; + setp.lt.u64 %p120, %rd364, %rd30; + selp.u64 %rd365, 1, 0, %p120; + add.s64 %rd366, %rd363, %rd365; + setp.eq.s64 %p121, %rd366, %rd31; + and.pred %p122, %p120, %p121; + setp.lt.u64 %p123, %rd366, %rd31; + or.pred %p124, %p123, %p122; + selp.u64 %rd367, 1, 0, %p124; + add.s64 %rd368, %rd361, %rd367; + setp.lt.u64 %p125, %rd368, %rd361; + selp.u64 %rd369, 1, 0, %p125; + mov.u64 %rd370, 1; + mul.hi.u64 %rd371, %rd351, %rd370; + mul.hi.u64 %rd372, %rd351, %rd350; + add.s64 %rd373, %rd372, %rd12; + setp.lt.u64 %p126, %rd373, %rd372; + selp.b64 %rd374, -576460752303423505, 0, %p115; + selp.b64 %rd375, -1, 0, %p126; + setp.lt.u64 %p127, %rd368, %rd373; + selp.b64 %rd376, -1, 0, %p127; + sub.s64 %rd377, %rd368, %rd373; + sub.s64 %rd378, %rd366, %rd371; + setp.lt.u64 %p128, %rd364, %rd351; + selp.b64 %rd379, -1, 0, %p128; + add.s64 %rd1023, %rd378, %rd379; + setp.eq.s64 %p129, %rd1023, %rd366; + and.pred %p130, %p128, %p129; + setp.gt.u64 %p131, %rd1023, %rd366; + or.pred %p132, %p131, %p130; + selp.u64 %rd380, 1, 0, %p132; + setp.lt.u64 %p133, %rd377, %rd380; + selp.b64 %rd381, -1, 0, %p133; + add.s64 %rd38, %rd12, %rd11; + sub.s64 %rd382, %rd32, %rd38; + add.s64 %rd383, %rd382, %rd374; + mul.lo.s64 %rd384, %rd1016, -576460752303423506; + add.s64 %rd385, %rd383, %rd384; + add.s64 %rd386, %rd385, %rd36; + add.s64 %rd387, %rd386, %rd36; + add.s64 %rd388, %rd387, %rd353; + sub.s64 %rd389, %rd388, 
%rd372; + add.s64 %rd390, %rd389, %rd356; + add.s64 %rd391, %rd390, %rd358; + add.s64 %rd392, %rd391, %rd375; + add.s64 %rd393, %rd392, %rd360; + add.s64 %rd394, %rd393, %rd362; + add.s64 %rd395, %rd394, %rd369; + add.s64 %rd396, %rd395, %rd376; + add.s64 %rd1021, %rd396, %rd381; + sub.s64 %rd1022, %rd377, %rd380; + sub.s64 %rd1024, %rd364, %rd351; + setp.eq.s32 %p134, %r13, 0; + @%p134 bra $L__BB1_20; + +$L__BB1_7: + and.b32 %r8, %r13, 1; + setp.eq.b32 %p135, %r8, 1; + mov.pred %p136, 0; + xor.pred %p137, %p135, %p136; + not.pred %p138, %p137; + @%p138 bra $L__BB1_14; + + mov.u64 %rd398, 0; + mul.lo.s64 %rd399, %rd1017, %rd1022; + mul.hi.u64 %rd400, %rd1022, %rd1018; + add.s64 %rd401, %rd400, %rd399; + setp.lt.u64 %p139, %rd401, %rd400; + selp.u64 %rd402, 1, 0, %p139; + mul.hi.u64 %rd51, %rd1021, %rd1017; + mul.lo.s64 %rd403, %rd1018, %rd1021; + add.s64 %rd404, %rd401, %rd403; + setp.lt.u64 %p140, %rd404, %rd401; + selp.u64 %rd405, 1, 0, %p140; + mul.lo.s64 %rd406, %rd1017, %rd1021; + mul.hi.u64 %rd407, %rd1021, %rd1018; + add.s64 %rd408, %rd407, %rd406; + setp.lt.u64 %p141, %rd408, %rd407; + selp.u64 %rd52, 1, 0, %p141; + mul.hi.u64 %rd409, %rd1022, %rd1017; + add.s64 %rd410, %rd409, %rd408; + setp.lt.u64 %p142, %rd410, %rd409; + selp.u64 %rd53, 1, 0, %p142; + add.s64 %rd411, %rd410, %rd402; + add.s64 %rd412, %rd411, %rd405; + setp.lt.u64 %p143, %rd412, %rd410; + selp.u64 %rd54, 1, 0, %p143; + mul.lo.s64 %rd413, %rd1019, %rd1022; + mul.hi.u64 %rd414, %rd1022, %rd1020; + add.s64 %rd415, %rd414, %rd413; + setp.lt.u64 %p144, %rd415, %rd414; + selp.u64 %rd416, 1, 0, %p144; + mul.lo.s64 %rd417, %rd1020, %rd1021; + add.s64 %rd418, %rd415, %rd417; + setp.lt.u64 %p145, %rd418, %rd415; + selp.u64 %rd419, 1, 0, %p145; + mul.lo.s64 %rd420, %rd1019, %rd1021; + mul.hi.u64 %rd421, %rd1021, %rd1020; + add.s64 %rd422, %rd421, %rd420; + setp.lt.u64 %p146, %rd422, %rd421; + selp.u64 %rd423, 1, 0, %p146; + mul.hi.u64 %rd424, %rd1022, %rd1019; + add.s64 %rd425, %rd424, %rd422; + setp.lt.u64 %p147, %rd425, %rd424; + selp.u64 %rd426, 1, 0, %p147; + add.s64 %rd427, %rd425, %rd416; + add.s64 %rd428, %rd427, %rd419; + setp.lt.u64 %p148, %rd428, %rd425; + selp.u64 %rd429, 1, 0, %p148; + mul.lo.s64 %rd430, %rd1018, %rd1023; + mul.hi.u64 %rd431, %rd1018, %rd1024; + add.s64 %rd432, %rd431, %rd430; + setp.lt.u64 %p149, %rd432, %rd431; + selp.u64 %rd433, 1, 0, %p149; + mul.lo.s64 %rd434, %rd1017, %rd1024; + add.s64 %rd435, %rd432, %rd434; + setp.lt.u64 %p150, %rd435, %rd432; + selp.u64 %rd436, 1, 0, %p150; + mul.lo.s64 %rd437, %rd1017, %rd1023; + mul.hi.u64 %rd438, %rd1017, %rd1024; + add.s64 %rd439, %rd438, %rd437; + setp.lt.u64 %p151, %rd439, %rd438; + selp.u64 %rd440, 1, 0, %p151; + mul.hi.u64 %rd441, %rd1018, %rd1023; + add.s64 %rd442, %rd441, %rd439; + setp.lt.u64 %p152, %rd442, %rd441; + selp.u64 %rd443, 1, 0, %p152; + add.s64 %rd444, %rd442, %rd433; + add.s64 %rd445, %rd444, %rd436; + setp.lt.u64 %p153, %rd445, %rd442; + selp.u64 %rd446, 1, 0, %p153; + mul.lo.s64 %rd447, %rd1019, %rd1024; + mul.hi.u64 %rd448, %rd1024, %rd1020; + add.s64 %rd449, %rd448, %rd447; + setp.lt.u64 %p154, %rd449, %rd448; + selp.u64 %rd450, 1, 0, %p154; + mul.lo.s64 %rd451, %rd1020, %rd1023; + add.s64 %rd452, %rd449, %rd451; + setp.lt.u64 %p155, %rd452, %rd449; + selp.u64 %rd453, 1, 0, %p155; + mul.lo.s64 %rd454, %rd1019, %rd1023; + mul.hi.u64 %rd455, %rd1023, %rd1020; + add.s64 %rd456, %rd455, %rd454; + setp.lt.u64 %p156, %rd456, %rd455; + selp.u64 %rd457, 1, 0, %p156; + mul.hi.u64 %rd458, %rd1024, %rd1019; + add.s64 %rd459, %rd458, 
%rd456; + setp.lt.u64 %p157, %rd459, %rd458; + selp.u64 %rd460, 1, 0, %p157; + add.s64 %rd461, %rd459, %rd450; + add.s64 %rd462, %rd461, %rd453; + setp.lt.u64 %p158, %rd462, %rd459; + selp.u64 %rd463, 1, 0, %p158; + mul.lo.s64 %rd464, %rd1018, %rd1024; + mul.lo.s64 %rd465, %rd1020, %rd1022; + add.s64 %rd466, %rd464, %rd465; + setp.lt.u64 %p159, %rd466, %rd464; + selp.u64 %rd467, 1, 0, %p159; + add.s64 %rd468, %rd418, %rd467; + add.s64 %rd469, %rd468, %rd435; + setp.eq.s64 %p160, %rd469, %rd418; + and.pred %p161, %p159, %p160; + setp.lt.u64 %p162, %rd469, %rd418; or.pred %p163, %p162, %p161; - not.pred %p164, %p155; - or.pred %p165, %p163, %p164; - not.pred %p166, %p165; - @%p166 bra $L__BB0_6; - - setp.eq.s64 %p167, %rd62, %rd2; - setp.lt.u64 %p168, %rd64, %rd3; - and.pred %p169, %p168, %p167; - setp.lt.u64 %p170, %rd62, %rd2; - or.pred %p171, %p170, %p169; - selp.u64 %rd766, 1, 0, %p171; - -$L__BB0_6: - selp.u64 %rd747, 1, 0, %p3; - mov.u64 %rd746, -1; - mov.u64 %rd767, 1; - mov.u64 %rd744, 0; - sub.s64 %rd529, %rd64, %rd68; - setp.lt.u64 %p172, %rd63, %rd65; - selp.b64 %rd530, -1, 0, %p172; - add.s64 %rd531, %rd66, %rd27; - mul.hi.u64 %rd532, %rd65, %rd767; - add.s64 %rd533, %rd531, %rd532; - sub.s64 %rd534, %rd61, %rd533; - add.s64 %rd535, %rd534, %rd530; - setp.gt.u64 %p173, %rd535, %rd61; - setp.eq.s64 %p174, %rd535, %rd61; - and.pred %p175, %p172, %p174; - or.pred %p176, %p173, %p175; - selp.u64 %rd536, 1, 0, %p176; - sub.s64 %rd537, %rd63, %rd65; - mul.hi.u64 %rd539, %rd766, %rd744; - add.s64 %rd540, %rd539, %rd20; - mul.hi.u64 %rd542, %rd766, %rd746; - add.s64 %rd543, %rd540, %rd542; - sub.s64 %rd544, %rd543, %rd766; - neg.s64 %rd545, %rd766; - setp.lt.u64 %p177, %rd544, %rd545; - selp.u64 %rd546, 1, 0, %p177; - add.s64 %rd547, %rd544, %rd60; - setp.lt.u64 %p178, %rd547, %rd544; - selp.u64 %rd548, 1, 0, %p178; + selp.u64 %rd470, 1, 0, %p163; + add.s64 %rd471, %rd462, %rd466; + setp.lt.u64 %p164, %rd471, %rd462; + selp.u64 %rd472, 1, 0, %p164; + mul.hi.u64 %rd473, %rd398, %rd1020; + add.s64 %rd474, %rd469, %rd473; + mul.hi.u64 %rd475, %rd1023, %rd1019; + add.s64 %rd476, %rd474, %rd475; + mul.hi.u64 %rd477, %rd1024, %rd398; + add.s64 %rd478, %rd476, %rd477; + add.s64 %rd479, %rd478, %rd457; + add.s64 %rd480, %rd479, %rd460; + add.s64 %rd481, %rd480, %rd463; + add.s64 %rd482, %rd481, %rd472; + setp.eq.s64 %p165, %rd482, %rd469; + and.pred %p166, %p164, %p165; + setp.lt.u64 %p167, %rd482, %rd469; + or.pred %p168, %p167, %p166; + selp.u64 %rd483, 1, 0, %p168; + mul.lo.s64 %rd484, %rd1018, %rd1022; + add.s64 %rd485, %rd428, %rd484; + setp.lt.u64 %p169, %rd485, %rd428; + selp.u64 %rd486, 1, 0, %p169; + mul.hi.u64 %rd55, %rd1022, %rd398; + add.s64 %rd487, %rd55, %rd404; + add.s64 %rd488, %rd487, %rd473; + mul.hi.u64 %rd489, %rd1021, %rd1019; + add.s64 %rd490, %rd488, %rd489; + add.s64 %rd491, %rd490, %rd423; + add.s64 %rd492, %rd491, %rd426; + add.s64 %rd493, %rd492, %rd429; + add.s64 %rd494, %rd493, %rd486; + setp.eq.s64 %p170, %rd494, %rd404; + and.pred %p171, %p169, %p170; + setp.lt.u64 %p172, %rd494, %rd404; + or.pred %p173, %p172, %p171; + selp.u64 %rd495, 1, 0, %p173; + add.s64 %rd496, %rd412, %rd495; + setp.lt.u64 %p174, %rd496, %rd412; + selp.u64 %rd56, 1, 0, %p174; + add.s64 %rd497, %rd445, %rd485; + setp.lt.u64 %p175, %rd497, %rd445; + selp.u64 %rd498, 1, 0, %p175; + mul.hi.u64 %rd499, %rd1017, %rd1023; + mul.hi.u64 %rd500, %rd398, %rd1024; + add.s64 %rd501, %rd499, %rd500; + mul.hi.u64 %rd502, %rd1018, %rd398; + add.s64 %rd503, %rd501, %rd502; + add.s64 %rd504, %rd503, %rd440; + 
add.s64 %rd505, %rd504, %rd494; + add.s64 %rd506, %rd505, %rd443; + add.s64 %rd507, %rd506, %rd446; + add.s64 %rd508, %rd507, %rd498; + setp.eq.s64 %p176, %rd508, %rd494; + and.pred %p177, %p175, %p176; + setp.lt.u64 %p178, %rd508, %rd494; + or.pred %p179, %p178, %p177; + selp.u64 %rd509, 1, 0, %p179; + add.s64 %rd57, %rd496, %rd509; + add.s64 %rd510, %rd497, %rd470; + add.s64 %rd59, %rd510, %rd483; + setp.lt.u64 %p180, %rd59, %rd497; + selp.u64 %rd511, 1, 0, %p180; + add.s64 %rd60, %rd508, %rd511; + setp.lt.u64 %p7, %rd60, %rd508; + mul.lo.s64 %rd512, %rd1020, %rd1024; + mul.lo.s64 %rd513, %rd512, 576460752303423504; + mov.u64 %rd514, -1; + mul.hi.u64 %rd515, %rd471, %rd514; + mul.hi.u64 %rd516, %rd512, %rd514; + neg.s64 %rd61, %rd512; + mul.hi.u64 %rd517, %rd452, %rd514; + sub.s64 %rd518, %rd516, %rd512; + setp.lt.u64 %p181, %rd518, %rd61; + selp.u64 %rd519, 1, 0, %p181; + neg.s64 %rd520, %rd452; + sub.s64 %rd521, %rd517, %rd452; + sub.s64 %rd62, %rd518, %rd452; + setp.lt.u64 %p182, %rd62, %rd518; + selp.u64 %rd522, 1, 0, %p182; + add.s64 %rd523, %rd522, %rd519; + setp.lt.u64 %p183, %rd521, %rd520; + selp.u64 %rd524, 1, 0, %p183; + add.s64 %rd525, %rd521, %rd518; + setp.lt.u64 %p184, %rd525, %rd521; + selp.u64 %rd526, 1, 0, %p184; + sub.s64 %rd527, %rd525, %rd471; + setp.lt.u64 %p185, %rd527, %rd525; + selp.u64 %rd528, 1, 0, %p185; + add.s64 %rd64, %rd523, %rd527; + setp.lt.u64 %p186, %rd64, %rd523; + selp.u64 %rd529, 1, 0, %p186; + add.s64 %rd530, %rd515, %rd513; + add.s64 %rd531, %rd530, %rd516; + sub.s64 %rd532, %rd531, %rd471; + add.s64 %rd533, %rd532, %rd521; + add.s64 %rd534, %rd533, %rd519; + sub.s64 %rd535, %rd534, %rd482; + add.s64 %rd536, %rd535, %rd524; + add.s64 %rd537, %rd536, %rd526; + add.s64 %rd538, %rd537, %rd528; + add.s64 %rd63, %rd538, %rd529; + mov.u64 %rd397, 1; + mul.hi.u64 %rd539, %rd64, %rd397; + add.s64 %rd65, %rd63, %rd539; + mul.hi.u64 %rd66, %rd398, %rd61; + mul.lo.s64 %rd540, %rd512, -576460752303423505; + add.s64 %rd67, %rd66, %rd540; + setp.lt.u64 %p9, %rd67, %rd66; + mul.hi.u64 %rd541, %rd61, %rd397; + mul.hi.u64 %rd542, %rd62, %rd397; + mul.hi.u64 %rd543, %rd61, %rd398; + mul.hi.u64 %rd544, %rd62, %rd398; + add.s64 %rd545, %rd541, %rd62; + setp.lt.u64 %p187, %rd545, %rd541; + selp.u64 %rd546, 1, 0, %p187; + add.s64 %rd547, %rd543, %rd542; + setp.lt.u64 %p188, %rd547, %rd543; + selp.u64 %rd548, 1, 0, %p188; add.s64 %rd549, %rd547, %rd546; - setp.lt.u64 %p179, %rd549, %rd547; - selp.u64 %rd550, 1, 0, %p179; - sub.s64 %rd551, %rd529, %rd536; - add.s64 %rd552, %rd549, %rd551; - setp.lt.u64 %p180, %rd552, %rd549; - selp.u64 %rd553, 1, 0, %p180; - sub.s64 %rd71, %rd537, %rd766; - setp.lt.u64 %p181, %rd71, %rd537; - selp.u64 %rd554, 1, 0, %p181; - add.s64 %rd555, %rd535, %rd554; - add.s64 %rd72, %rd555, %rd544; - setp.eq.s64 %p182, %rd72, %rd535; - and.pred %p183, %p181, %p182; - setp.lt.u64 %p184, %rd72, %rd535; - or.pred %p185, %p184, %p183; - selp.u64 %rd556, 1, 0, %p185; - add.s64 %rd73, %rd552, %rd556; - setp.lt.u64 %p186, %rd73, %rd552; - selp.u64 %rd557, 1, 0, %p186; - sub.s64 %rd558, %rd60, %rd22; - add.s64 %rd559, %rd558, %rd60; - sub.s64 %rd74, %rd559, %rd28; - sub.s64 %rd560, %rd74, %rd67; - selp.s64 %rd561, -1, 0, %p6; - add.s64 %rd562, %rd560, %rd561; - setp.lt.u64 %p187, %rd64, %rd68; - selp.b64 %rd563, -1, 0, %p187; - add.s64 %rd564, %rd562, %rd563; - add.s64 %rd565, %rd564, %rd62; - selp.b64 %rd566, -576460752303423505, 0, %p5; - add.s64 %rd567, %rd565, %rd566; - setp.lt.u64 %p188, %rd529, %rd536; - selp.b64 %rd568, -1, 0, %p188; - add.s64 
%rd569, %rd567, %rd568; - mul.lo.s64 %rd570, %rd766, -576460752303423506; - add.s64 %rd571, %rd569, %rd570; - add.s64 %rd572, %rd571, %rd543; - add.s64 %rd573, %rd572, %rd546; - add.s64 %rd574, %rd573, %rd548; - add.s64 %rd575, %rd574, %rd550; - add.s64 %rd576, %rd575, %rd553; - add.s64 %rd75, %rd576, %rd557; - add.s64 %rd577, %rd37, %rd30; - add.s64 %rd578, %rd577, %rd31; - add.s64 %rd579, %rd578, %rd34; - add.s64 %rd76, %rd579, %rd747; - setp.eq.s64 %p189, %rd76, %rd34; - setp.lt.u64 %p190, %rd38, %rd35; - and.pred %p191, %p190, %p189; - setp.lt.u64 %p192, %rd76, %rd34; - or.pred %p193, %p192, %p191; - selp.u64 %rd580, 1, 0, %p193; - add.s64 %rd77, %rd49, %rd580; - setp.lt.u64 %p7, %rd77, %rd49; - @%p85 bra $L__BB0_8; - - setp.eq.s64 %p195, %rd42, 0; - setp.lt.u64 %p196, %rd44, %rd41; - and.pred %p197, %p196, %p195; - setp.lt.u64 %p198, %rd43, %rd32; + setp.lt.u64 %p189, %rd549, %rd547; + selp.u64 %rd550, 1, 0, %p189; + add.s64 %rd68, %rd67, %rd65; + setp.lt.u64 %p10, %rd68, %rd67; + add.s64 %rd71, %rd549, %rd64; + setp.lt.u64 %p11, %rd71, %rd549; + selp.u64 %rd551, 1, 0, %p11; + add.s64 %rd552, %rd68, %rd12; + add.s64 %rd553, %rd552, %rd543; + add.s64 %rd554, %rd553, %rd544; + add.s64 %rd555, %rd554, %rd548; + add.s64 %rd556, %rd555, %rd550; + add.s64 %rd70, %rd556, %rd551; + add.s64 %rd557, %rd71, %rd471; + setp.lt.u64 %p190, %rd557, %rd71; + selp.u64 %rd558, 1, 0, %p190; + setp.ne.s64 %p191, %rd512, 0; + selp.u64 %rd559, 1, 0, %p191; + add.s64 %rd560, %rd518, %rd559; + add.s64 %rd561, %rd560, %rd541; + setp.eq.s64 %p192, %rd560, %rd62; + and.pred %p193, %p191, %p192; + setp.lt.u64 %p194, %rd561, %rd545; + or.pred %p195, %p194, %p193; + selp.u64 %rd562, 1, 0, %p195; + add.s64 %rd73, %rd557, %rd562; + setp.lt.u64 %p196, %rd73, %rd557; + selp.u64 %rd563, 1, 0, %p196; + add.s64 %rd564, %rd482, %rd558; + add.s64 %rd72, %rd564, %rd563; + setp.ne.s64 %p197, %rd72, 0; + setp.ne.s64 %p198, %rd73, %rd71; or.pred %p199, %p198, %p197; - selp.u64 %rd767, 1, 0, %p199; - -$L__BB0_8: - add.s64 %rd581, %rd76, %rd40; - setp.lt.u64 %p200, %rd581, %rd76; - selp.u64 %rd582, 1, 0, %p200; - add.s64 %rd583, %rd77, %rd582; - setp.lt.u64 %p201, %rd583, %rd77; - add.s64 %rd83, %rd767, %rd19; - setp.lt.u64 %p202, %rd83, %rd767; - selp.u64 %rd584, 1, 0, %p202; - add.s64 %rd82, %rd47, %rd584; - setp.lt.u64 %p203, %rd82, %rd47; - selp.u64 %rd585, 1, 0, %p203; - add.s64 %rd81, %rd48, %rd585; - setp.lt.u64 %p204, %rd81, %rd48; - selp.u64 %rd586, 1, 0, %p204; - add.s64 %rd80, %rd50, %rd586; - add.s64 %rd587, %rd81, %rd583; - setp.lt.u64 %p205, %rd587, %rd81; - selp.u64 %rd588, 1, 0, %p205; - add.s64 %rd589, %rd82, %rd581; - add.s64 %rd84, %rd83, %rd39; - setp.lt.u64 %p206, %rd84, %rd83; - selp.u64 %rd590, 1, 0, %p206; - add.s64 %rd85, %rd589, %rd590; - setp.eq.s64 %p207, %rd85, %rd82; - and.pred %p208, %p206, %p207; - setp.lt.u64 %p209, %rd85, %rd82; - or.pred %p210, %p209, %p208; - selp.u64 %rd591, 1, 0, %p210; - add.s64 %rd87, %rd587, %rd591; - setp.lt.u64 %p211, %rd87, %rd587; - selp.u64 %rd592, 1, 0, %p211; - selp.u64 %rd593, 1, 0, %p7; - add.s64 %rd594, %rd55, %rd593; - selp.u64 %rd595, 1, 0, %p201; + not.pred %p200, %p195; + or.pred %p201, %p199, %p200; + not.pred %p202, %p201; + mov.u64 %rd1025, %rd397; + @%p202 bra $L__BB1_10; + + setp.eq.s64 %p203, %rd72, 0; + setp.lt.u64 %p204, %rd73, %rd71; + and.pred %p205, %p204, %p203; + add.s64 %rd565, %rd72, %rd70; + setp.lt.u64 %p206, %rd565, %rd72; + or.pred %p207, %p206, %p205; + selp.u64 %rd1025, 1, 0, %p207; + +$L__BB1_10: + mul.hi.u64 %rd1014, %rd64, %rd397; + 
add.s64 %rd1013, %rd63, %rd1014; + setp.lt.u64 %p420, %rd1013, %rd63; + setp.lt.u64 %p415, %rd57, %rd496; + selp.u64 %rd1002, 1, 0, %p415; + mul.hi.u64 %rd1001, %rd398, %rd1018; + selp.u64 %rd566, 1, 0, %p7; + add.s64 %rd567, %rd57, %rd566; + setp.lt.u64 %p208, %rd567, %rd57; + mul.hi.u64 %rd569, %rd64, %rd398; + mul.hi.u64 %rd570, %rd63, %rd398; + mul.lo.s64 %rd571, %rd64, 576460752303423505; + mov.u64 %rd572, 576460752303423505; + add.s64 %rd573, %rd571, %rd569; + setp.lt.u64 %p209, %rd573, %rd571; + mul.lo.s64 %rd574, %rd63, 576460752303423505; + add.s64 %rd575, %rd574, %rd570; + setp.lt.u64 %p210, %rd575, %rd574; + mul.hi.u64 %rd576, %rd64, %rd572; + add.s64 %rd577, %rd575, %rd576; + setp.lt.u64 %p211, %rd577, %rd575; + selp.u64 %rd578, 1, 0, %p209; + add.s64 %rd579, %rd577, %rd578; + setp.lt.u64 %p212, %rd579, %rd577; + mul.hi.u64 %rd581, %rd63, %rd397; + add.s64 %rd582, %rd581, %rd569; + setp.lt.u64 %p213, %rd582, %rd581; + selp.u64 %rd583, 1, 0, %p420; + add.s64 %rd584, %rd582, %rd583; + setp.lt.u64 %p214, %rd584, %rd582; + mul.lo.s64 %rd585, %rd62, 576460752303423505; + mul.hi.u64 %rd586, %rd572, %rd61; + add.s64 %rd587, %rd586, %rd585; + setp.lt.u64 %p215, %rd587, %rd586; + mul.hi.u64 %rd588, %rd398, %rd62; + add.s64 %rd589, %rd588, %rd587; + setp.lt.u64 %p216, %rd589, %rd588; + selp.u64 %rd590, 1, 0, %p9; + add.s64 %rd591, %rd589, %rd590; + setp.lt.u64 %p217, %rd591, %rd589; + add.s64 %rd592, %rd570, %rd569; + add.s64 %rd593, %rd592, %rd12; + add.s64 %rd594, %rd593, %rd573; + selp.u64 %rd595, 1, 0, %p213; add.s64 %rd596, %rd594, %rd595; - add.s64 %rd597, %rd596, %rd80; - add.s64 %rd598, %rd597, %rd588; - add.s64 %rd86, %rd598, %rd592; - setp.ne.s64 %p212, %rd86, %rd80; - setp.ne.s64 %p213, %rd87, %rd81; - or.pred %p214, %p213, %p212; - @%p214 bra $L__BB0_10; - - mov.u64 %rd768, 1; - @%p210 bra $L__BB0_11; - -$L__BB0_10: - setp.eq.s64 %p220, %rd86, %rd80; - setp.lt.u64 %p221, %rd87, %rd81; - and.pred %p222, %p221, %p220; - setp.lt.u64 %p223, %rd86, %rd80; - or.pred %p224, %p223, %p222; - selp.u64 %rd768, 1, 0, %p224; + selp.u64 %rd597, 1, 0, %p214; + add.s64 %rd598, %rd596, %rd597; + setp.lt.u64 %p218, %rd598, %rd573; + selp.u64 %rd599, 1, 0, %p218; + add.s64 %rd600, %rd579, %rd599; + setp.lt.u64 %p219, %rd600, %rd579; + add.s64 %rd601, %rd591, %rd584; + setp.lt.u64 %p220, %rd601, %rd591; + add.s64 %rd602, %rd66, %rd11; + mul.hi.u64 %rd603, %rd572, %rd62; + add.s64 %rd604, %rd602, %rd603; + selp.u64 %rd605, 1, 0, %p215; + add.s64 %rd606, %rd604, %rd605; + selp.u64 %rd607, 1, 0, %p216; + add.s64 %rd608, %rd606, %rd607; + selp.u64 %rd609, 1, 0, %p217; + add.s64 %rd610, %rd608, %rd609; + selp.u64 %rd611, 1, 0, %p220; + add.s64 %rd612, %rd610, %rd611; + add.s64 %rd613, %rd612, %rd598; + setp.lt.u64 %p221, %rd613, %rd612; + setp.eq.s64 %p222, %rd612, 0; + and.pred %p223, %p220, %p222; + or.pred %p224, %p221, %p223; + selp.u64 %rd614, 1, 0, %p224; + add.s64 %rd615, %rd600, %rd614; + setp.lt.u64 %p225, %rd615, %rd600; + selp.u64 %rd616, 1, 0, %p10; + add.s64 %rd617, %rd601, %rd616; + setp.lt.u64 %p226, %rd70, %rd68; + setp.eq.s64 %p227, %rd70, %rd68; + and.pred %p228, %p11, %p227; + or.pred %p229, %p226, %p228; + selp.u64 %rd618, 1, 0, %p229; + add.s64 %rd619, %rd617, %rd618; + setp.lt.u64 %p230, %rd619, %rd601; + selp.u64 %rd620, 1, 0, %p230; + add.s64 %rd621, %rd613, %rd620; + setp.lt.u64 %p231, %rd621, %rd613; + selp.u64 %rd622, 1, 0, %p231; + add.s64 %rd623, %rd615, %rd622; + setp.lt.u64 %p232, %rd623, %rd615; + add.s64 %rd79, %rd1025, %rd59; + setp.lt.u64 %p233, %rd79, %rd1025; + 
selp.u64 %rd624, 1, 0, %p233; + add.s64 %rd78, %rd60, %rd624; + setp.lt.u64 %p234, %rd78, %rd60; + selp.u64 %rd625, 1, 0, %p234; + add.s64 %rd77, %rd567, %rd625; + setp.lt.u64 %p235, %rd77, %rd567; + selp.u64 %rd626, 1, 0, %p235; + add.s64 %rd627, %rd51, %rd1001; + add.s64 %rd628, %rd627, %rd55; + add.s64 %rd629, %rd628, %rd52; + add.s64 %rd630, %rd629, %rd53; + add.s64 %rd631, %rd630, %rd54; + add.s64 %rd632, %rd631, %rd56; + add.s64 %rd633, %rd632, %rd1002; + selp.u64 %rd634, 1, 0, %p208; + add.s64 %rd635, %rd633, %rd634; + add.s64 %rd76, %rd635, %rd626; + add.s64 %rd636, %rd77, %rd623; + setp.lt.u64 %p236, %rd636, %rd77; + selp.u64 %rd637, 1, 0, %p236; + add.s64 %rd638, %rd78, %rd621; + add.s64 %rd80, %rd79, %rd619; + setp.lt.u64 %p237, %rd80, %rd79; + selp.u64 %rd639, 1, 0, %p237; + add.s64 %rd81, %rd638, %rd639; + setp.eq.s64 %p238, %rd81, %rd78; + and.pred %p239, %p237, %p238; + setp.lt.u64 %p240, %rd81, %rd78; + or.pred %p241, %p240, %p239; + selp.u64 %rd640, 1, 0, %p241; + add.s64 %rd83, %rd636, %rd640; + setp.lt.u64 %p242, %rd83, %rd636; + selp.u64 %rd641, 1, 0, %p242; + add.s64 %rd642, %rd569, %rd11; + mul.hi.u64 %rd643, %rd63, %rd572; + add.s64 %rd644, %rd642, %rd643; + selp.u64 %rd645, 1, 0, %p210; + add.s64 %rd646, %rd644, %rd645; + selp.u64 %rd647, 1, 0, %p211; + add.s64 %rd648, %rd646, %rd647; + selp.u64 %rd649, 1, 0, %p212; + add.s64 %rd650, %rd648, %rd649; + selp.u64 %rd651, 1, 0, %p219; + add.s64 %rd652, %rd650, %rd651; + selp.u64 %rd653, 1, 0, %p225; + add.s64 %rd654, %rd652, %rd653; + selp.u64 %rd655, 1, 0, %p232; + add.s64 %rd656, %rd654, %rd655; + add.s64 %rd657, %rd656, %rd76; + add.s64 %rd658, %rd657, %rd637; + add.s64 %rd82, %rd658, %rd641; + setp.ne.s64 %p243, %rd82, %rd76; + setp.ne.s64 %p244, %rd83, %rd77; + or.pred %p245, %p244, %p243; + @%p245 bra $L__BB1_12; -$L__BB0_11: - mov.u64 %rd763, 0; - mul.hi.u64 %rd762, %rd763, %rd763; - mov.u64 %rd749, -1; - mov.u64 %rd748, 0; - or.b64 %rd600, %rd85, %rd84; - or.b64 %rd601, %rd600, %rd87; - setp.eq.s64 %p225, %rd601, 0; - setp.eq.s64 %p226, %rd86, 576460752303423505; - and.pred %p227, %p225, %p226; - setp.gt.u64 %p228, %rd86, 576460752303423504; - xor.pred %p229, %p228, %p227; - selp.u64 %rd603, 1, 0, %p229; - mul.hi.u64 %rd604, %rd768, %rd748; - add.s64 %rd605, %rd604, %rd20; - mul.hi.u64 %rd607, %rd768, %rd749; - add.s64 %rd608, %rd605, %rd607; - sub.s64 %rd609, %rd608, %rd768; - neg.s64 %rd610, %rd768; - setp.lt.u64 %p230, %rd609, %rd610; - selp.u64 %rd611, 1, 0, %p230; - add.s64 %rd612, %rd609, %rd60; - setp.lt.u64 %p231, %rd612, %rd609; - add.s64 %rd613, %rd612, %rd611; - setp.lt.u64 %p232, %rd613, %rd612; - add.s64 %rd614, %rd613, %rd87; - setp.lt.u64 %p233, %rd614, %rd613; - sub.s64 %rd615, %rd84, %rd768; - setp.lt.u64 %p234, %rd615, %rd84; - selp.u64 %rd616, 1, 0, %p234; - add.s64 %rd617, %rd85, %rd616; - add.s64 %rd618, %rd617, %rd609; - setp.eq.s64 %p235, %rd618, %rd85; - and.pred %p236, %p234, %p235; - setp.lt.u64 %p237, %rd618, %rd85; - or.pred %p238, %p237, %p236; - selp.u64 %rd619, 1, 0, %p238; - add.s64 %rd620, %rd614, %rd619; - setp.lt.u64 %p239, %rd620, %rd614; - mul.hi.u64 %rd621, %rd603, %rd748; - mov.u64 %rd622, 1; - mul.hi.u64 %rd623, %rd603, %rd622; - add.s64 %rd624, %rd621, %rd762; - add.s64 %rd625, %rd624, %rd621; - add.s64 %rd626, %rd625, %rd28; - setp.lt.u64 %p240, %rd626, %rd625; - setp.lt.u64 %p241, %rd620, %rd626; - sub.s64 %rd627, %rd620, %rd626; - setp.lt.u64 %p242, %rd615, %rd603; - selp.b64 %rd628, -1, 0, %p242; - add.s64 %rd629, %rd621, %rd27; - sub.s64 %rd630, %rd628, %rd629; - 
sub.s64 %rd631, %rd630, %rd623; - add.s64 %rd632, %rd631, %rd618; - neg.s64 %rd633, %rd632; - setp.eq.s64 %p243, %rd631, 0; - and.pred %p244, %p242, %p243; - setp.gt.u64 %p245, %rd632, %rd618; + mov.u64 %rd1026, 1; + @%p241 bra $L__BB1_13; + +$L__BB1_12: + setp.eq.s64 %p251, %rd82, %rd76; + setp.lt.u64 %p252, %rd83, %rd77; + and.pred %p253, %p252, %p251; + setp.lt.u64 %p254, %rd82, %rd76; + or.pred %p255, %p254, %p253; + selp.u64 %rd1026, 1, 0, %p255; + +$L__BB1_13: + or.b64 %rd660, %rd81, %rd80; + or.b64 %rd661, %rd660, %rd83; + setp.eq.s64 %p256, %rd661, 0; + mov.u64 %rd662, 0; + setp.eq.s64 %p257, %rd82, 576460752303423505; + and.pred %p258, %p256, %p257; + setp.gt.u64 %p259, %rd82, 576460752303423504; + xor.pred %p260, %p259, %p258; + selp.u64 %rd663, 1, 0, %p260; + mov.u64 %rd664, -1; + mul.hi.u64 %rd665, %rd1026, %rd664; + sub.s64 %rd666, %rd665, %rd1026; + neg.s64 %rd667, %rd1026; + setp.lt.u64 %p261, %rd666, %rd667; + selp.u64 %rd668, 1, 0, %p261; + add.s64 %rd669, %rd666, %rd36; + setp.lt.u64 %p262, %rd669, %rd666; + selp.u64 %rd670, 1, 0, %p262; + add.s64 %rd671, %rd669, %rd668; + setp.lt.u64 %p263, %rd671, %rd669; + selp.u64 %rd672, 1, 0, %p263; + add.s64 %rd673, %rd671, %rd83; + setp.lt.u64 %p264, %rd673, %rd671; + selp.u64 %rd674, 1, 0, %p264; + add.s64 %rd675, %rd666, %rd81; + sub.s64 %rd676, %rd80, %rd1026; + setp.lt.u64 %p265, %rd676, %rd80; + selp.u64 %rd677, 1, 0, %p265; + add.s64 %rd678, %rd675, %rd677; + setp.eq.s64 %p266, %rd678, %rd81; + and.pred %p267, %p265, %p266; + setp.lt.u64 %p268, %rd678, %rd81; + or.pred %p269, %p268, %p267; + selp.u64 %rd679, 1, 0, %p269; + add.s64 %rd680, %rd673, %rd679; + setp.lt.u64 %p270, %rd680, %rd673; + selp.u64 %rd681, 1, 0, %p270; + mov.u64 %rd682, 1; + mul.hi.u64 %rd683, %rd663, %rd682; + mul.hi.u64 %rd684, %rd663, %rd662; + add.s64 %rd685, %rd684, %rd12; + setp.lt.u64 %p271, %rd685, %rd684; + selp.b64 %rd686, -576460752303423505, 0, %p260; + selp.b64 %rd687, -1, 0, %p271; + setp.lt.u64 %p272, %rd680, %rd685; + selp.b64 %rd688, -1, 0, %p272; + sub.s64 %rd689, %rd680, %rd685; + sub.s64 %rd690, %rd678, %rd683; + setp.lt.u64 %p273, %rd676, %rd663; + selp.b64 %rd691, -1, 0, %p273; + add.s64 %rd1023, %rd690, %rd691; + setp.eq.s64 %p274, %rd1023, %rd678; + and.pred %p275, %p273, %p274; + setp.gt.u64 %p276, %rd1023, %rd678; + or.pred %p277, %p276, %p275; + selp.u64 %rd692, 1, 0, %p277; + setp.lt.u64 %p278, %rd689, %rd692; + selp.b64 %rd693, -1, 0, %p278; + sub.s64 %rd694, %rd82, %rd38; + add.s64 %rd695, %rd694, %rd686; + mul.lo.s64 %rd696, %rd1026, -576460752303423506; + add.s64 %rd697, %rd695, %rd696; + add.s64 %rd698, %rd697, %rd36; + add.s64 %rd699, %rd698, %rd36; + add.s64 %rd700, %rd699, %rd665; + sub.s64 %rd701, %rd700, %rd684; + add.s64 %rd702, %rd701, %rd668; + add.s64 %rd703, %rd702, %rd670; + add.s64 %rd704, %rd703, %rd687; + add.s64 %rd705, %rd704, %rd672; + add.s64 %rd706, %rd705, %rd674; + add.s64 %rd707, %rd706, %rd681; + add.s64 %rd708, %rd707, %rd688; + add.s64 %rd1021, %rd708, %rd693; + sub.s64 %rd1022, %rd689, %rd692; + sub.s64 %rd1024, %rd676, %rd663; + +$L__BB1_14: + mov.u64 %rd710, 0; + shr.u32 %r13, %r13, 1; + mul.lo.s64 %rd711, %rd1017, %rd1018; + mul.hi.u64 %rd712, %rd1018, %rd1018; + add.s64 %rd713, %rd712, %rd711; + setp.lt.u64 %p279, %rd713, %rd712; + selp.u64 %rd714, 1, 0, %p279; + mul.hi.u64 %rd95, %rd1017, %rd1017; + add.s64 %rd715, %rd713, %rd711; + setp.lt.u64 %p280, %rd715, %rd713; + selp.u64 %rd716, 1, 0, %p280; + mul.lo.s64 %rd717, %rd1017, %rd1017; + mul.hi.u64 %rd718, %rd1017, %rd1018; + add.s64 %rd719, 
%rd718, %rd717; + setp.lt.u64 %p281, %rd719, %rd718; + selp.u64 %rd96, 1, 0, %p281; + mul.hi.u64 %rd720, %rd1018, %rd1017; + add.s64 %rd721, %rd720, %rd719; + setp.lt.u64 %p282, %rd721, %rd720; + selp.u64 %rd97, 1, 0, %p282; + add.s64 %rd722, %rd721, %rd714; + add.s64 %rd723, %rd722, %rd716; + setp.lt.u64 %p283, %rd723, %rd721; + selp.u64 %rd98, 1, 0, %p283; + mul.lo.s64 %rd724, %rd1018, %rd1019; + mul.hi.u64 %rd725, %rd1018, %rd1020; + add.s64 %rd726, %rd725, %rd724; + setp.lt.u64 %p284, %rd726, %rd725; + selp.u64 %rd727, 1, 0, %p284; + mul.lo.s64 %rd728, %rd1017, %rd1020; + add.s64 %rd729, %rd726, %rd728; + setp.lt.u64 %p285, %rd729, %rd726; + selp.u64 %rd730, 1, 0, %p285; + mul.lo.s64 %rd731, %rd1017, %rd1019; + mul.hi.u64 %rd732, %rd1017, %rd1020; + add.s64 %rd733, %rd732, %rd731; + setp.lt.u64 %p286, %rd733, %rd732; + selp.u64 %rd734, 1, 0, %p286; + mul.hi.u64 %rd735, %rd1018, %rd1019; + add.s64 %rd736, %rd735, %rd733; + setp.lt.u64 %p287, %rd736, %rd735; + selp.u64 %rd737, 1, 0, %p287; + add.s64 %rd738, %rd736, %rd727; + add.s64 %rd739, %rd738, %rd730; + setp.lt.u64 %p288, %rd739, %rd736; + selp.u64 %rd740, 1, 0, %p288; + mul.lo.s64 %rd741, %rd1019, %rd1020; + mul.hi.u64 %rd742, %rd1020, %rd1020; + add.s64 %rd743, %rd742, %rd741; + setp.lt.u64 %p289, %rd743, %rd742; + selp.u64 %rd744, 1, 0, %p289; + add.s64 %rd745, %rd743, %rd741; + setp.lt.u64 %p290, %rd745, %rd743; + selp.u64 %rd746, 1, 0, %p290; + mul.lo.s64 %rd747, %rd1019, %rd1019; + mul.hi.u64 %rd748, %rd1019, %rd1020; + add.s64 %rd749, %rd748, %rd747; + setp.lt.u64 %p291, %rd749, %rd748; + selp.u64 %rd750, 1, 0, %p291; + mul.hi.u64 %rd751, %rd1020, %rd1019; + add.s64 %rd752, %rd751, %rd749; + setp.lt.u64 %p292, %rd752, %rd751; + selp.u64 %rd753, 1, 0, %p292; + add.s64 %rd754, %rd752, %rd744; + add.s64 %rd755, %rd754, %rd746; + setp.lt.u64 %p293, %rd755, %rd752; + selp.u64 %rd756, 1, 0, %p293; + mul.lo.s64 %rd757, %rd1018, %rd1020; + shl.b64 %rd758, %rd757, 1; + mov.u64 %rd709, 1; + setp.lt.u64 %p294, %rd758, %rd757; + selp.u64 %rd759, 1, 0, %p294; + add.s64 %rd760, %rd729, %rd759; + add.s64 %rd761, %rd760, %rd729; + setp.lt.u64 %p295, %rd761, %rd760; + setp.eq.s64 %p296, %rd760, 0; + and.pred %p297, %p294, %p296; + or.pred %p298, %p295, %p297; + selp.u64 %rd762, 1, 0, %p298; + add.s64 %rd763, %rd755, %rd758; + setp.lt.u64 %p299, %rd763, %rd755; + selp.u64 %rd764, 1, 0, %p299; + mul.hi.u64 %rd765, %rd710, %rd1020; + add.s64 %rd766, %rd761, %rd765; + mul.hi.u64 %rd767, %rd1019, %rd1019; + add.s64 %rd768, %rd766, %rd767; + mul.hi.u64 %rd769, %rd1020, %rd710; + add.s64 %rd770, %rd768, %rd769; + add.s64 %rd771, %rd770, %rd750; + add.s64 %rd772, %rd771, %rd753; + add.s64 %rd773, %rd772, %rd756; + add.s64 %rd774, %rd773, %rd764; + setp.eq.s64 %p300, %rd774, %rd761; + and.pred %p301, %p299, %p300; + setp.lt.u64 %p302, %rd774, %rd761; + or.pred %p303, %p302, %p301; + selp.u64 %rd775, 1, 0, %p303; + mul.lo.s64 %rd776, %rd1018, %rd1018; + add.s64 %rd777, %rd739, %rd776; + setp.lt.u64 %p304, %rd777, %rd739; + selp.u64 %rd778, 1, 0, %p304; + mul.hi.u64 %rd99, %rd1018, %rd710; + add.s64 %rd779, %rd99, %rd715; + add.s64 %rd780, %rd779, %rd765; + mul.hi.u64 %rd781, %rd1017, %rd1019; + add.s64 %rd782, %rd780, %rd781; + add.s64 %rd783, %rd782, %rd734; + add.s64 %rd784, %rd783, %rd737; + add.s64 %rd785, %rd784, %rd740; + add.s64 %rd786, %rd785, %rd778; + setp.eq.s64 %p305, %rd786, %rd715; + and.pred %p306, %p304, %p305; + setp.lt.u64 %p307, %rd786, %rd715; + or.pred %p308, %p307, %p306; + selp.u64 %rd787, 1, 0, %p308; + add.s64 %rd788, %rd723, 
%rd787; + setp.lt.u64 %p309, %rd788, %rd723; + selp.u64 %rd100, 1, 0, %p309; + add.s64 %rd789, %rd739, %rd777; + setp.lt.u64 %p310, %rd789, %rd739; + selp.u64 %rd790, 1, 0, %p310; + add.s64 %rd791, %rd781, %rd765; + add.s64 %rd792, %rd791, %rd99; + add.s64 %rd793, %rd792, %rd734; + add.s64 %rd794, %rd793, %rd786; + add.s64 %rd795, %rd794, %rd737; + add.s64 %rd796, %rd795, %rd740; + add.s64 %rd797, %rd796, %rd790; + setp.eq.s64 %p311, %rd797, %rd786; + and.pred %p312, %p310, %p311; + setp.lt.u64 %p313, %rd797, %rd786; + or.pred %p314, %p313, %p312; + selp.u64 %rd798, 1, 0, %p314; + add.s64 %rd101, %rd788, %rd798; + setp.lt.u64 %p12, %rd101, %rd788; + selp.u64 %rd102, 1, 0, %p12; + add.s64 %rd799, %rd789, %rd762; + add.s64 %rd103, %rd799, %rd775; + setp.lt.u64 %p315, %rd103, %rd789; + selp.u64 %rd800, 1, 0, %p315; + add.s64 %rd104, %rd797, %rd800; + setp.lt.u64 %p13, %rd104, %rd797; + mul.lo.s64 %rd801, %rd1020, %rd1020; + mul.lo.s64 %rd802, %rd801, 576460752303423504; + mov.u64 %rd803, -1; + mul.hi.u64 %rd804, %rd763, %rd803; + mul.hi.u64 %rd805, %rd801, %rd803; + neg.s64 %rd105, %rd801; + mul.hi.u64 %rd806, %rd745, %rd803; + sub.s64 %rd807, %rd805, %rd801; + setp.lt.u64 %p316, %rd807, %rd105; + selp.u64 %rd808, 1, 0, %p316; + neg.s64 %rd809, %rd745; + sub.s64 %rd810, %rd806, %rd745; + sub.s64 %rd106, %rd807, %rd745; + setp.lt.u64 %p317, %rd106, %rd807; + selp.u64 %rd811, 1, 0, %p317; + add.s64 %rd812, %rd811, %rd808; + setp.lt.u64 %p318, %rd810, %rd809; + selp.u64 %rd813, 1, 0, %p318; + add.s64 %rd814, %rd810, %rd807; + setp.lt.u64 %p319, %rd814, %rd810; + selp.u64 %rd815, 1, 0, %p319; + sub.s64 %rd816, %rd814, %rd763; + setp.lt.u64 %p320, %rd816, %rd814; + selp.u64 %rd817, 1, 0, %p320; + add.s64 %rd108, %rd812, %rd816; + setp.lt.u64 %p321, %rd108, %rd812; + selp.u64 %rd818, 1, 0, %p321; + add.s64 %rd819, %rd804, %rd802; + add.s64 %rd820, %rd819, %rd805; + sub.s64 %rd821, %rd820, %rd763; + add.s64 %rd822, %rd821, %rd810; + add.s64 %rd823, %rd822, %rd808; + sub.s64 %rd824, %rd823, %rd774; + add.s64 %rd825, %rd824, %rd813; + add.s64 %rd826, %rd825, %rd815; + add.s64 %rd827, %rd826, %rd817; + add.s64 %rd107, %rd827, %rd818; + mul.hi.u64 %rd828, %rd108, %rd709; + add.s64 %rd109, %rd107, %rd828; + mul.hi.u64 %rd110, %rd710, %rd105; + mul.lo.s64 %rd829, %rd801, -576460752303423505; + add.s64 %rd111, %rd110, %rd829; + setp.lt.u64 %p15, %rd111, %rd110; + mul.hi.u64 %rd830, %rd105, %rd709; + mul.hi.u64 %rd831, %rd106, %rd709; + mul.hi.u64 %rd832, %rd105, %rd710; + mul.hi.u64 %rd833, %rd106, %rd710; + add.s64 %rd834, %rd830, %rd106; + setp.lt.u64 %p322, %rd834, %rd830; + selp.u64 %rd835, 1, 0, %p322; + add.s64 %rd836, %rd832, %rd831; + setp.lt.u64 %p323, %rd836, %rd832; + selp.u64 %rd837, 1, 0, %p323; + add.s64 %rd838, %rd836, %rd835; + setp.lt.u64 %p324, %rd838, %rd836; + selp.u64 %rd839, 1, 0, %p324; + add.s64 %rd112, %rd111, %rd109; + setp.lt.u64 %p16, %rd112, %rd111; + add.s64 %rd115, %rd838, %rd108; + setp.lt.u64 %p17, %rd115, %rd838; + selp.u64 %rd840, 1, 0, %p17; + add.s64 %rd841, %rd112, %rd12; + add.s64 %rd842, %rd841, %rd832; + add.s64 %rd843, %rd842, %rd833; + add.s64 %rd844, %rd843, %rd837; + add.s64 %rd845, %rd844, %rd839; + add.s64 %rd114, %rd845, %rd840; + add.s64 %rd846, %rd115, %rd763; + setp.lt.u64 %p325, %rd846, %rd115; + selp.u64 %rd847, 1, 0, %p325; + setp.ne.s64 %p326, %rd801, 0; + selp.u64 %rd848, 1, 0, %p326; + add.s64 %rd849, %rd807, %rd848; + add.s64 %rd850, %rd849, %rd830; + setp.eq.s64 %p327, %rd849, %rd106; + and.pred %p328, %p326, %p327; + setp.lt.u64 %p329, %rd850, 
%rd834; + or.pred %p330, %p329, %p328; + selp.u64 %rd851, 1, 0, %p330; + add.s64 %rd117, %rd846, %rd851; + setp.lt.u64 %p331, %rd117, %rd846; + selp.u64 %rd852, 1, 0, %p331; + add.s64 %rd853, %rd774, %rd847; + add.s64 %rd116, %rd853, %rd852; + setp.ne.s64 %p332, %rd116, 0; + setp.ne.s64 %p333, %rd117, %rd115; + or.pred %p334, %p333, %p332; + not.pred %p335, %p330; + or.pred %p336, %p334, %p335; + not.pred %p337, %p336; + mov.u64 %rd1031, %rd709; + @%p337 bra $L__BB1_16; + + setp.eq.s64 %p338, %rd116, 0; + setp.lt.u64 %p339, %rd117, %rd115; + and.pred %p340, %p339, %p338; + add.s64 %rd854, %rd116, %rd114; + setp.lt.u64 %p341, %rd854, %rd116; + or.pred %p342, %p341, %p340; + selp.u64 %rd1031, 1, 0, %p342; + +$L__BB1_16: + mul.hi.u64 %rd1006, %rd710, %rd1018; + mul.hi.u64 %rd1005, %rd108, %rd709; + add.s64 %rd1004, %rd107, %rd1005; + setp.lt.u64 %p416, %rd1004, %rd107; + selp.u64 %rd855, 1, 0, %p13; + add.s64 %rd856, %rd101, %rd855; + setp.lt.u64 %p343, %rd856, %rd101; + mul.hi.u64 %rd858, %rd108, %rd710; + mul.hi.u64 %rd859, %rd107, %rd710; + mul.lo.s64 %rd860, %rd108, 576460752303423505; + mov.u64 %rd861, 576460752303423505; + add.s64 %rd862, %rd860, %rd858; + setp.lt.u64 %p344, %rd862, %rd860; + mul.lo.s64 %rd863, %rd107, 576460752303423505; + add.s64 %rd864, %rd863, %rd859; + setp.lt.u64 %p345, %rd864, %rd863; + mul.hi.u64 %rd865, %rd108, %rd861; + add.s64 %rd866, %rd864, %rd865; + setp.lt.u64 %p346, %rd866, %rd864; + selp.u64 %rd867, 1, 0, %p344; + add.s64 %rd868, %rd866, %rd867; + setp.lt.u64 %p347, %rd868, %rd866; + mul.hi.u64 %rd870, %rd107, %rd709; + add.s64 %rd871, %rd870, %rd858; + setp.lt.u64 %p348, %rd871, %rd870; + selp.u64 %rd872, 1, 0, %p416; + add.s64 %rd873, %rd871, %rd872; + setp.lt.u64 %p349, %rd873, %rd871; + mul.lo.s64 %rd874, %rd106, 576460752303423505; + mul.hi.u64 %rd875, %rd861, %rd105; + add.s64 %rd876, %rd875, %rd874; + setp.lt.u64 %p350, %rd876, %rd875; + mul.hi.u64 %rd877, %rd710, %rd106; + add.s64 %rd878, %rd877, %rd876; + setp.lt.u64 %p351, %rd878, %rd877; + selp.u64 %rd879, 1, 0, %p15; + add.s64 %rd880, %rd878, %rd879; + setp.lt.u64 %p352, %rd880, %rd878; + add.s64 %rd881, %rd859, %rd858; + add.s64 %rd882, %rd881, %rd12; + add.s64 %rd883, %rd882, %rd862; + selp.u64 %rd884, 1, 0, %p348; + add.s64 %rd885, %rd883, %rd884; + selp.u64 %rd886, 1, 0, %p349; + add.s64 %rd887, %rd885, %rd886; + setp.lt.u64 %p353, %rd887, %rd862; + selp.u64 %rd888, 1, 0, %p353; + add.s64 %rd889, %rd868, %rd888; + setp.lt.u64 %p354, %rd889, %rd868; + add.s64 %rd890, %rd880, %rd873; + setp.lt.u64 %p355, %rd890, %rd880; + add.s64 %rd891, %rd110, %rd11; + mul.hi.u64 %rd892, %rd861, %rd106; + add.s64 %rd893, %rd891, %rd892; + selp.u64 %rd894, 1, 0, %p350; + add.s64 %rd895, %rd893, %rd894; + selp.u64 %rd896, 1, 0, %p351; + add.s64 %rd897, %rd895, %rd896; + selp.u64 %rd898, 1, 0, %p352; + add.s64 %rd899, %rd897, %rd898; + selp.u64 %rd900, 1, 0, %p355; + add.s64 %rd901, %rd899, %rd900; + add.s64 %rd902, %rd901, %rd887; + setp.lt.u64 %p356, %rd902, %rd901; + setp.eq.s64 %p357, %rd901, 0; + and.pred %p358, %p355, %p357; + or.pred %p359, %p356, %p358; + selp.u64 %rd903, 1, 0, %p359; + add.s64 %rd904, %rd889, %rd903; + setp.lt.u64 %p360, %rd904, %rd889; + selp.u64 %rd905, 1, 0, %p16; + add.s64 %rd906, %rd890, %rd905; + setp.lt.u64 %p361, %rd114, %rd112; + setp.eq.s64 %p362, %rd114, %rd112; + and.pred %p363, %p17, %p362; + or.pred %p364, %p361, %p363; + selp.u64 %rd907, 1, 0, %p364; + add.s64 %rd908, %rd906, %rd907; + setp.lt.u64 %p365, %rd908, %rd890; + selp.u64 %rd909, 1, 0, %p365; + add.s64 
%rd910, %rd902, %rd909; + setp.lt.u64 %p366, %rd910, %rd902; + selp.u64 %rd911, 1, 0, %p366; + add.s64 %rd912, %rd904, %rd911; + setp.lt.u64 %p367, %rd912, %rd904; + add.s64 %rd123, %rd1031, %rd103; + setp.lt.u64 %p368, %rd123, %rd1031; + selp.u64 %rd913, 1, 0, %p368; + add.s64 %rd122, %rd104, %rd913; + setp.lt.u64 %p369, %rd122, %rd104; + selp.u64 %rd914, 1, 0, %p369; + add.s64 %rd121, %rd856, %rd914; + setp.lt.u64 %p370, %rd121, %rd856; + selp.u64 %rd915, 1, 0, %p370; + add.s64 %rd916, %rd95, %rd1006; + add.s64 %rd917, %rd916, %rd99; + add.s64 %rd918, %rd917, %rd96; + add.s64 %rd919, %rd918, %rd97; + add.s64 %rd920, %rd919, %rd98; + add.s64 %rd921, %rd920, %rd100; + add.s64 %rd922, %rd921, %rd102; + selp.u64 %rd923, 1, 0, %p343; + add.s64 %rd924, %rd922, %rd923; + add.s64 %rd120, %rd924, %rd915; + add.s64 %rd925, %rd121, %rd912; + setp.lt.u64 %p371, %rd925, %rd121; + selp.u64 %rd926, 1, 0, %p371; + add.s64 %rd927, %rd122, %rd910; + add.s64 %rd124, %rd123, %rd908; + setp.lt.u64 %p372, %rd124, %rd123; + selp.u64 %rd928, 1, 0, %p372; + add.s64 %rd125, %rd927, %rd928; + setp.eq.s64 %p373, %rd125, %rd122; + and.pred %p374, %p372, %p373; + setp.lt.u64 %p375, %rd125, %rd122; + or.pred %p376, %p375, %p374; + selp.u64 %rd929, 1, 0, %p376; + add.s64 %rd127, %rd925, %rd929; + setp.lt.u64 %p377, %rd127, %rd925; + selp.u64 %rd930, 1, 0, %p377; + add.s64 %rd931, %rd858, %rd11; + mul.hi.u64 %rd932, %rd107, %rd861; + add.s64 %rd933, %rd931, %rd932; + selp.u64 %rd934, 1, 0, %p345; + add.s64 %rd935, %rd933, %rd934; + selp.u64 %rd936, 1, 0, %p346; + add.s64 %rd937, %rd935, %rd936; + selp.u64 %rd938, 1, 0, %p347; + add.s64 %rd939, %rd937, %rd938; + selp.u64 %rd940, 1, 0, %p354; + add.s64 %rd941, %rd939, %rd940; + selp.u64 %rd942, 1, 0, %p360; + add.s64 %rd943, %rd941, %rd942; + selp.u64 %rd944, 1, 0, %p367; + add.s64 %rd945, %rd943, %rd944; + add.s64 %rd946, %rd945, %rd120; + add.s64 %rd947, %rd946, %rd926; + add.s64 %rd126, %rd947, %rd930; + setp.ne.s64 %p378, %rd126, %rd120; + setp.ne.s64 %p379, %rd127, %rd121; + or.pred %p380, %p379, %p378; + @%p380 bra $L__BB1_18; + + mov.u64 %rd1032, 1; + @%p376 bra $L__BB1_19; + +$L__BB1_18: + setp.eq.s64 %p386, %rd126, %rd120; + setp.lt.u64 %p387, %rd127, %rd121; + and.pred %p388, %p387, %p386; + setp.lt.u64 %p389, %rd126, %rd120; + or.pred %p390, %p389, %p388; + selp.u64 %rd1032, 1, 0, %p390; + +$L__BB1_19: + or.b64 %rd949, %rd125, %rd124; + or.b64 %rd950, %rd949, %rd127; + setp.eq.s64 %p391, %rd950, 0; + mov.u64 %rd951, 0; + setp.eq.s64 %p392, %rd126, 576460752303423505; + and.pred %p393, %p391, %p392; + setp.gt.u64 %p394, %rd126, 576460752303423504; + xor.pred %p395, %p394, %p393; + selp.u64 %rd952, 1, 0, %p395; + mov.u64 %rd953, -1; + mul.hi.u64 %rd954, %rd1032, %rd953; + sub.s64 %rd955, %rd954, %rd1032; + neg.s64 %rd956, %rd1032; + setp.lt.u64 %p396, %rd955, %rd956; + selp.u64 %rd957, 1, 0, %p396; + add.s64 %rd958, %rd955, %rd36; + setp.lt.u64 %p397, %rd958, %rd955; + selp.u64 %rd959, 1, 0, %p397; + add.s64 %rd960, %rd958, %rd957; + setp.lt.u64 %p398, %rd960, %rd958; + selp.u64 %rd961, 1, 0, %p398; + add.s64 %rd962, %rd960, %rd127; + setp.lt.u64 %p399, %rd962, %rd960; + selp.u64 %rd963, 1, 0, %p399; + add.s64 %rd964, %rd955, %rd125; + sub.s64 %rd965, %rd124, %rd1032; + setp.lt.u64 %p400, %rd965, %rd124; + selp.u64 %rd966, 1, 0, %p400; + add.s64 %rd967, %rd964, %rd966; + setp.eq.s64 %p401, %rd967, %rd125; + and.pred %p402, %p400, %p401; + setp.lt.u64 %p403, %rd967, %rd125; + or.pred %p404, %p403, %p402; + selp.u64 %rd968, 1, 0, %p404; + add.s64 %rd969, %rd962, 
%rd968; + setp.lt.u64 %p405, %rd969, %rd962; + selp.u64 %rd970, 1, 0, %p405; + mov.u64 %rd971, 1; + mul.hi.u64 %rd972, %rd952, %rd971; + mul.hi.u64 %rd973, %rd952, %rd951; + add.s64 %rd974, %rd973, %rd12; + setp.lt.u64 %p406, %rd974, %rd973; + selp.b64 %rd975, -576460752303423505, 0, %p395; + selp.b64 %rd976, -1, 0, %p406; + setp.lt.u64 %p407, %rd969, %rd974; + selp.b64 %rd977, -1, 0, %p407; + sub.s64 %rd978, %rd969, %rd974; + sub.s64 %rd979, %rd967, %rd972; + setp.lt.u64 %p408, %rd965, %rd952; + selp.b64 %rd980, -1, 0, %p408; + add.s64 %rd1019, %rd979, %rd980; + setp.eq.s64 %p409, %rd1019, %rd967; + and.pred %p410, %p408, %p409; + setp.gt.u64 %p411, %rd1019, %rd967; + or.pred %p412, %p411, %p410; + selp.u64 %rd981, 1, 0, %p412; + setp.lt.u64 %p413, %rd978, %rd981; + selp.b64 %rd982, -1, 0, %p413; + sub.s64 %rd983, %rd126, %rd38; + add.s64 %rd984, %rd983, %rd975; + mul.lo.s64 %rd985, %rd1032, -576460752303423506; + add.s64 %rd986, %rd984, %rd985; + add.s64 %rd987, %rd986, %rd36; + add.s64 %rd988, %rd987, %rd36; + add.s64 %rd989, %rd988, %rd954; + sub.s64 %rd990, %rd989, %rd973; + add.s64 %rd991, %rd990, %rd957; + add.s64 %rd992, %rd991, %rd959; + add.s64 %rd993, %rd992, %rd976; + add.s64 %rd994, %rd993, %rd961; + add.s64 %rd995, %rd994, %rd963; + add.s64 %rd996, %rd995, %rd970; + add.s64 %rd997, %rd996, %rd977; + add.s64 %rd1017, %rd997, %rd982; + sub.s64 %rd1018, %rd978, %rd981; + sub.s64 %rd1020, %rd965, %rd952; + setp.ne.s32 %p414, %r13, 0; + @%p414 bra $L__BB1_7; + +$L__BB1_20: + ld.param.u64 %rd1003, [calc_twiddles_param_0]; + mov.u32 %r12, %tid.x; + mov.u32 %r11, %ctaid.x; + mov.u32 %r10, %ntid.x; + mad.lo.s32 %r9, %r10, %r11, %r12; + cvta.to.global.u64 %rd998, %rd1003; + mul.wide.u32 %rd999, %r9, 32; + add.s64 %rd1000, %rd998, %rd999; + st.global.u64 [%rd1000], %rd1021; + st.global.u64 [%rd1000+8], %rd1022; + st.global.u64 [%rd1000+16], %rd1023; + st.global.u64 [%rd1000+24], %rd1024; + +$L__BB1_21: + ret; + +} + // .globl calc_twiddles_bitrev +.visible .entry calc_twiddles_bitrev( + .param .u64 calc_twiddles_bitrev_param_0, + .param .u64 calc_twiddles_bitrev_param_1, + .param .u32 calc_twiddles_bitrev_param_2 +) +{ + .reg .pred %p<421>; + .reg .b32 %r<19>; + .reg .b64 %rd<1035>; + + + ld.param.u64 %rd139, [calc_twiddles_bitrev_param_1]; + ld.param.u32 %r5, [calc_twiddles_bitrev_param_2]; + mov.u32 %r6, %ctaid.x; + mov.u32 %r7, %ntid.x; + mov.u32 %r8, %tid.x; + mad.lo.s32 %r1, %r7, %r6, %r8; + setp.ge.u32 %p18, %r1, %r5; + @%p18 bra $L__BB2_21; + + cvta.to.global.u64 %rd141, %rd139; + ld.global.u64 %rd1015, [%rd141]; + mov.u64 %rd142, 0; + ld.global.u64 %rd1016, [%rd141+8]; + ld.global.u64 %rd1017, [%rd141+16]; + ld.global.u64 %rd1018, [%rd141+24]; + clz.b32 %r9, %r5; + add.s32 %r10, %r9, 1; + brev.b32 %r11, %r1; + shr.u32 %r12, %r11, %r10; + setp.eq.s32 %p19, %r5, 1; + selp.b32 %r18, %r1, %r12, %p19; + mov.u64 %rd143, 5151653887; + mul.hi.u64 %rd144, %rd142, %rd143; + mov.u64 %rd145, -2802499714047; + mul.hi.u64 %rd5, %rd142, %rd145; + add.s64 %rd6, %rd144, %rd5; + mov.u64 %rd140, 1; + mov.u64 %rd146, -9469952; + mul.hi.u64 %rd7, %rd146, %rd140; + mul.hi.u64 %rd147, %rd140, %rd145; + setp.gt.u64 %p20, %rd147, -5151653888; + selp.u64 %rd148, 1, 0, %p20; + add.s64 %rd149, %rd147, 5151653887; + mul.hi.u64 %rd150, %rd140, %rd143; + add.s64 %rd151, %rd150, %rd5; + setp.lt.u64 %p21, %rd151, %rd150; + selp.u64 %rd152, 1, 0, %p21; + add.s64 %rd153, %rd151, %rd148; + setp.lt.u64 %p22, %rd153, %rd151; + selp.u64 %rd154, 1, 0, %p22; + add.s64 %rd155, %rd5, %rd7; + add.s64 %rd8, %rd155, 
576413109808302096; + setp.gt.u64 %p2, %rd153, 9469951; + selp.u64 %rd156, 1, 0, %p2; + add.s64 %rd157, %rd6, %rd8; + mul.hi.u64 %rd158, %rd140, %rd142; + add.s64 %rd159, %rd157, %rd158; + add.s64 %rd160, %rd159, %rd152; + add.s64 %rd161, %rd160, %rd154; + add.s64 %rd10, %rd161, %rd156; + add.s64 %rd162, %rd153, -9469952; + mul.hi.u64 %rd11, %rd142, %rd142; + mul.hi.u64 %rd12, %rd142, %rd140; + mov.u64 %rd163, 9469952; + sub.s64 %rd164, %rd163, %rd153; + mov.u64 %rd165, -1; + mul.hi.u64 %rd166, %rd162, %rd165; + mul.hi.u64 %rd167, %rd149, %rd165; + mov.u64 %rd168, -5151653887; + sub.s64 %rd169, %rd168, %rd147; + mul.hi.u64 %rd170, %rd145, %rd165; + setp.gt.u64 %p23, %rd170, -2802499714048; + selp.u64 %rd171, 1, 0, %p23; + add.s64 %rd172, %rd170, 2802499714047; + mov.u64 %rd173, 2802499714047; + add.s64 %rd174, %rd167, %rd169; + setp.lt.u64 %p24, %rd174, %rd167; + add.s64 %rd16, %rd172, %rd169; + setp.lt.u64 %p25, %rd16, %rd172; + selp.u64 %rd175, 1, 0, %p25; + add.s64 %rd176, %rd175, %rd171; + selp.u64 %rd177, 1, 0, %p24; + add.s64 %rd178, %rd174, %rd172; + setp.lt.u64 %p26, %rd178, %rd174; + selp.u64 %rd179, 1, 0, %p26; + add.s64 %rd180, %rd178, %rd164; + setp.lt.u64 %p27, %rd180, %rd178; + selp.u64 %rd181, 1, 0, %p27; + add.s64 %rd14, %rd176, %rd180; + setp.lt.u64 %p28, %rd14, %rd176; + selp.u64 %rd182, 1, 0, %p28; + add.s64 %rd183, %rd166, %rd164; + add.s64 %rd184, %rd183, %rd170; + add.s64 %rd185, %rd184, %rd174; + add.s64 %rd186, %rd185, %rd171; + add.s64 %rd187, %rd186, %rd177; + add.s64 %rd188, %rd187, 576415912307998736; + sub.s64 %rd189, %rd188, %rd10; + add.s64 %rd190, %rd189, %rd179; + add.s64 %rd191, %rd190, %rd181; + add.s64 %rd13, %rd191, %rd182; + mul.hi.u64 %rd192, %rd14, %rd140; + add.s64 %rd15, %rd13, %rd192; + mul.hi.u64 %rd17, %rd142, %rd173; + add.s64 %rd193, %rd17, -576413109808284689; + mul.hi.u64 %rd194, %rd16, %rd140; + mul.hi.u64 %rd195, %rd16, %rd142; + mul.hi.u64 %rd196, %rd173, %rd140; + add.s64 %rd197, %rd196, %rd16; + setp.lt.u64 %p29, %rd197, %rd196; + selp.u64 %rd198, 1, 0, %p29; + mul.hi.u64 %rd199, %rd173, %rd142; + add.s64 %rd200, %rd199, %rd194; + setp.lt.u64 %p30, %rd200, %rd199; + selp.u64 %rd201, 1, 0, %p30; + add.s64 %rd202, %rd200, %rd198; + setp.lt.u64 %p31, %rd202, %rd200; + selp.u64 %rd203, 1, 0, %p31; + add.s64 %rd18, %rd193, %rd15; + setp.lt.u64 %p4, %rd18, %rd193; + add.s64 %rd21, %rd202, %rd14; + setp.lt.u64 %p5, %rd21, %rd202; + selp.u64 %rd204, 1, 0, %p5; + add.s64 %rd205, %rd18, %rd12; + add.s64 %rd206, %rd205, %rd199; + add.s64 %rd207, %rd206, %rd195; + add.s64 %rd208, %rd207, %rd201; + add.s64 %rd209, %rd208, %rd203; + add.s64 %rd20, %rd209, %rd204; + add.s64 %rd210, %rd21, %rd162; + setp.lt.u64 %p32, %rd210, %rd21; + selp.u64 %rd211, 1, 0, %p32; + add.s64 %rd212, %rd170, %rd196; + add.s64 %rd213, %rd212, 2802499714048; + setp.le.u64 %p33, %rd213, %rd197; + selp.u64 %rd214, 1, 0, %p33; + add.s64 %rd23, %rd210, %rd214; + setp.lt.u64 %p34, %rd23, %rd210; + selp.u64 %rd215, 1, 0, %p34; + add.s64 %rd216, %rd10, %rd211; + add.s64 %rd22, %rd216, %rd215; + setp.eq.s64 %p35, %rd22, 0; + setp.eq.s64 %p36, %rd23, %rd21; + and.pred %p37, %p36, %p35; + and.pred %p38, %p33, %p37; + mov.u64 %rd1013, %rd140; + @%p38 bra $L__BB2_3; + + setp.lt.u64 %p40, %rd23, %rd21; + and.pred %p41, %p40, %p35; + add.s64 %rd217, %rd22, %rd20; + setp.lt.u64 %p42, %rd217, %rd22; + or.pred %p43, %p42, %p41; + selp.u64 %rd1013, 1, 0, %p43; + +$L__BB2_3: + mul.hi.u64 %rd1010, %rd14, %rd140; + add.s64 %rd1009, %rd13, %rd1010; + setp.lt.u64 %p419, %rd1009, %rd13; + mov.u64 
%rd1008, 5151653887; + mul.hi.u64 %rd1007, %rd142, %rd1008; + setp.lt.u64 %p418, %rd6, %rd1007; + mul.hi.u64 %rd220, %rd142, %rd146; + mov.u64 %rd221, 576413109808302096; + mul.hi.u64 %rd222, %rd142, %rd221; + add.s64 %rd223, %rd222, %rd220; + setp.lt.u64 %p44, %rd223, %rd222; + mul.hi.u64 %rd224, %rd146, %rd142; + mul.hi.u64 %rd226, %rd221, %rd140; + add.s64 %rd227, %rd224, %rd226; + setp.lt.u64 %p45, %rd227, %rd224; + setp.gt.u64 %p46, %rd7, -576413109808302097; + selp.u64 %rd228, 1, 0, %p46; + add.s64 %rd229, %rd227, %rd228; + setp.lt.u64 %p47, %rd229, %rd227; + add.s64 %rd230, %rd11, %rd220; + add.s64 %rd231, %rd6, %rd230; + selp.u64 %rd232, 1, 0, %p418; + add.s64 %rd233, %rd231, %rd232; + setp.lt.u64 %p48, %rd233, %rd220; + selp.u64 %rd234, 1, 0, %p48; + add.s64 %rd235, %rd223, %rd234; + setp.lt.u64 %p49, %rd235, %rd223; + add.s64 %rd236, %rd229, %rd6; + setp.lt.u64 %p50, %rd236, %rd229; + mul.hi.u64 %rd237, %rd221, %rd142; + add.s64 %rd238, %rd224, %rd12; + add.s64 %rd239, %rd238, %rd237; + add.s64 %rd240, %rd239, %rd233; + selp.u64 %rd241, 1, 0, %p45; + add.s64 %rd242, %rd240, %rd241; + selp.u64 %rd243, 1, 0, %p47; + add.s64 %rd244, %rd242, %rd243; + selp.u64 %rd245, 1, 0, %p50; + add.s64 %rd246, %rd244, %rd245; + setp.lt.u64 %p51, %rd246, %rd233; + setp.eq.s64 %p52, %rd246, %rd233; + and.pred %p53, %p50, %p52; + or.pred %p54, %p51, %p53; + selp.u64 %rd247, 1, 0, %p54; + add.s64 %rd248, %rd235, %rd247; + setp.lt.u64 %p55, %rd248, %rd235; + setp.lt.u64 %p56, %rd8, %rd5; + selp.u64 %rd249, 1, 0, %p56; + add.s64 %rd250, %rd236, %rd249; + setp.lt.u64 %p57, %rd10, %rd8; + setp.eq.s64 %p58, %rd10, %rd8; + and.pred %p59, %p2, %p58; + or.pred %p60, %p57, %p59; + selp.u64 %rd251, 1, 0, %p60; + add.s64 %rd252, %rd250, %rd251; + setp.lt.u64 %p61, %rd252, %rd236; + selp.u64 %rd253, 1, 0, %p61; + add.s64 %rd254, %rd246, %rd253; + setp.lt.u64 %p62, %rd254, %rd246; + selp.u64 %rd255, 1, 0, %p62; + add.s64 %rd256, %rd248, %rd255; + setp.lt.u64 %p63, %rd256, %rd248; + mul.hi.u64 %rd257, %rd14, %rd142; + mul.hi.u64 %rd258, %rd13, %rd142; + mul.lo.s64 %rd259, %rd14, 576460752303423505; + mov.u64 %rd260, 576460752303423505; + add.s64 %rd261, %rd259, %rd257; + setp.lt.u64 %p64, %rd261, %rd259; + mul.lo.s64 %rd262, %rd13, 576460752303423505; + add.s64 %rd263, %rd262, %rd258; + setp.lt.u64 %p65, %rd263, %rd262; + mul.hi.u64 %rd264, %rd14, %rd260; + add.s64 %rd265, %rd263, %rd264; + setp.lt.u64 %p66, %rd265, %rd263; + selp.u64 %rd266, 1, 0, %p64; + add.s64 %rd267, %rd265, %rd266; + setp.lt.u64 %p67, %rd267, %rd265; + mul.hi.u64 %rd268, %rd13, %rd140; + add.s64 %rd269, %rd268, %rd257; + setp.lt.u64 %p68, %rd269, %rd268; + selp.u64 %rd270, 1, 0, %p419; + add.s64 %rd271, %rd269, %rd270; + setp.lt.u64 %p69, %rd271, %rd269; + mul.hi.u64 %rd273, %rd260, %rd173; + mul.lo.s64 %rd274, %rd16, 576460752303423505; + add.s64 %rd275, %rd273, %rd274; + setp.lt.u64 %p70, %rd275, %rd273; + mul.hi.u64 %rd276, %rd142, %rd16; + add.s64 %rd277, %rd276, %rd275; + setp.lt.u64 %p71, %rd277, %rd276; + setp.gt.u64 %p72, %rd17, 576413109808284688; + selp.u64 %rd278, 1, 0, %p72; + add.s64 %rd279, %rd277, %rd278; + setp.lt.u64 %p73, %rd279, %rd277; + add.s64 %rd280, %rd257, %rd12; + add.s64 %rd281, %rd280, %rd258; + add.s64 %rd282, %rd281, %rd261; + selp.u64 %rd283, 1, 0, %p68; + add.s64 %rd284, %rd282, %rd283; + selp.u64 %rd285, 1, 0, %p69; + add.s64 %rd286, %rd284, %rd285; + setp.lt.u64 %p74, %rd286, %rd261; + selp.u64 %rd287, 1, 0, %p74; + add.s64 %rd288, %rd267, %rd287; + setp.lt.u64 %p75, %rd288, %rd267; + add.s64 %rd289, %rd271, 
%rd279; + setp.lt.u64 %p76, %rd289, %rd271; + add.s64 %rd290, %rd17, %rd11; + mul.hi.u64 %rd291, %rd260, %rd16; + add.s64 %rd292, %rd290, %rd291; + selp.u64 %rd293, 1, 0, %p70; + add.s64 %rd294, %rd292, %rd293; + selp.u64 %rd295, 1, 0, %p71; + add.s64 %rd296, %rd294, %rd295; + selp.u64 %rd297, 1, 0, %p73; + add.s64 %rd298, %rd296, %rd297; + selp.u64 %rd299, 1, 0, %p76; + add.s64 %rd300, %rd298, %rd299; + add.s64 %rd301, %rd300, %rd286; + setp.lt.u64 %p77, %rd301, %rd300; + setp.eq.s64 %p78, %rd300, 0; + and.pred %p79, %p76, %p78; + or.pred %p80, %p77, %p79; + selp.u64 %rd302, 1, 0, %p80; + add.s64 %rd303, %rd288, %rd302; + setp.lt.u64 %p81, %rd303, %rd288; + selp.u64 %rd304, 1, 0, %p4; + add.s64 %rd305, %rd289, %rd304; + setp.lt.u64 %p82, %rd20, %rd18; + setp.eq.s64 %p83, %rd20, %rd18; + and.pred %p84, %p5, %p83; + or.pred %p85, %p82, %p84; + selp.u64 %rd306, 1, 0, %p85; + add.s64 %rd307, %rd305, %rd306; + setp.lt.u64 %p86, %rd307, %rd289; + selp.u64 %rd308, 1, 0, %p86; + add.s64 %rd309, %rd301, %rd308; + setp.lt.u64 %p87, %rd309, %rd301; + selp.u64 %rd310, 1, 0, %p87; + add.s64 %rd311, %rd303, %rd310; + setp.lt.u64 %p88, %rd311, %rd303; + add.s64 %rd29, %rd1013, %rd252; + setp.lt.u64 %p89, %rd29, %rd1013; + selp.u64 %rd312, 1, 0, %p89; + add.s64 %rd28, %rd254, %rd312; + setp.lt.u64 %p90, %rd28, %rd254; + selp.u64 %rd313, 1, 0, %p90; + add.s64 %rd27, %rd256, %rd313; + setp.lt.u64 %p91, %rd27, %rd256; + selp.u64 %rd314, 1, 0, %p91; + add.s64 %rd315, %rd223, %rd11; + selp.u64 %rd316, 1, 0, %p44; + add.s64 %rd317, %rd315, %rd316; + selp.u64 %rd318, 1, 0, %p49; + add.s64 %rd319, %rd317, %rd318; + selp.u64 %rd320, 1, 0, %p55; + add.s64 %rd321, %rd319, %rd320; + selp.u64 %rd322, 1, 0, %p63; + add.s64 %rd323, %rd321, %rd322; + add.s64 %rd26, %rd323, %rd314; + add.s64 %rd324, %rd27, %rd311; + setp.lt.u64 %p92, %rd324, %rd27; + selp.u64 %rd325, 1, 0, %p92; + add.s64 %rd326, %rd28, %rd309; + add.s64 %rd30, %rd29, %rd307; + setp.lt.u64 %p93, %rd30, %rd29; + selp.u64 %rd327, 1, 0, %p93; + add.s64 %rd31, %rd326, %rd327; + setp.eq.s64 %p94, %rd31, %rd28; + and.pred %p95, %p93, %p94; + setp.lt.u64 %p96, %rd31, %rd28; + or.pred %p97, %p96, %p95; + selp.u64 %rd328, 1, 0, %p97; + add.s64 %rd33, %rd324, %rd328; + setp.lt.u64 %p98, %rd33, %rd324; + selp.u64 %rd329, 1, 0, %p98; + add.s64 %rd330, %rd257, %rd11; + mul.hi.u64 %rd331, %rd13, %rd260; + add.s64 %rd332, %rd330, %rd331; + selp.u64 %rd333, 1, 0, %p65; + add.s64 %rd334, %rd332, %rd333; + selp.u64 %rd335, 1, 0, %p66; + add.s64 %rd336, %rd334, %rd335; + selp.u64 %rd337, 1, 0, %p67; + add.s64 %rd338, %rd336, %rd337; + selp.u64 %rd339, 1, 0, %p75; + add.s64 %rd340, %rd338, %rd339; + selp.u64 %rd341, 1, 0, %p81; + add.s64 %rd342, %rd340, %rd341; + selp.u64 %rd343, 1, 0, %p88; + add.s64 %rd344, %rd342, %rd343; + add.s64 %rd345, %rd344, %rd26; + add.s64 %rd346, %rd345, %rd325; + add.s64 %rd32, %rd346, %rd329; + setp.ne.s64 %p99, %rd32, %rd26; + setp.ne.s64 %p100, %rd33, %rd27; + or.pred %p101, %p100, %p99; + @%p101 bra $L__BB2_5; + + mov.u64 %rd1014, 1; + @%p97 bra $L__BB2_6; + +$L__BB2_5: + setp.eq.s64 %p107, %rd32, %rd26; + setp.lt.u64 %p108, %rd33, %rd27; + and.pred %p109, %p108, %p107; + setp.lt.u64 %p110, %rd32, %rd26; + or.pred %p111, %p110, %p109; + selp.u64 %rd1014, 1, 0, %p111; + +$L__BB2_6: + or.b64 %rd348, %rd31, %rd30; + or.b64 %rd349, %rd348, %rd33; + setp.eq.s64 %p112, %rd349, 0; + mov.u64 %rd350, 0; + setp.eq.s64 %p113, %rd32, 576460752303423505; + and.pred %p114, %p112, %p113; + setp.gt.u64 %p115, %rd32, 576460752303423504; + xor.pred %p116, 
%p115, %p114; + selp.u64 %rd351, 1, 0, %p116; + mov.u64 %rd352, -1; + mul.hi.u64 %rd353, %rd1014, %rd352; + sub.s64 %rd354, %rd353, %rd1014; + neg.s64 %rd355, %rd1014; + setp.lt.u64 %p117, %rd354, %rd355; + selp.u64 %rd356, 1, 0, %p117; + mul.hi.u64 %rd36, %rd350, %rd352; + add.s64 %rd357, %rd354, %rd36; + setp.lt.u64 %p118, %rd357, %rd354; + selp.u64 %rd358, 1, 0, %p118; + add.s64 %rd359, %rd357, %rd356; + setp.lt.u64 %p119, %rd359, %rd357; + selp.u64 %rd360, 1, 0, %p119; + add.s64 %rd361, %rd359, %rd33; + setp.lt.u64 %p120, %rd361, %rd359; + selp.u64 %rd362, 1, 0, %p120; + add.s64 %rd363, %rd354, %rd31; + sub.s64 %rd364, %rd30, %rd1014; + setp.lt.u64 %p121, %rd364, %rd30; + selp.u64 %rd365, 1, 0, %p121; + add.s64 %rd366, %rd363, %rd365; + setp.eq.s64 %p122, %rd366, %rd31; + and.pred %p123, %p121, %p122; + setp.lt.u64 %p124, %rd366, %rd31; + or.pred %p125, %p124, %p123; + selp.u64 %rd367, 1, 0, %p125; + add.s64 %rd368, %rd361, %rd367; + setp.lt.u64 %p126, %rd368, %rd361; + selp.u64 %rd369, 1, 0, %p126; + mov.u64 %rd370, 1; + mul.hi.u64 %rd371, %rd351, %rd370; + mul.hi.u64 %rd372, %rd351, %rd350; + add.s64 %rd373, %rd372, %rd12; + setp.lt.u64 %p127, %rd373, %rd372; + selp.b64 %rd374, -576460752303423505, 0, %p116; + selp.b64 %rd375, -1, 0, %p127; + setp.lt.u64 %p128, %rd368, %rd373; + selp.b64 %rd376, -1, 0, %p128; + sub.s64 %rd377, %rd368, %rd373; + sub.s64 %rd378, %rd366, %rd371; + setp.lt.u64 %p129, %rd364, %rd351; + selp.b64 %rd379, -1, 0, %p129; + add.s64 %rd1021, %rd378, %rd379; + setp.eq.s64 %p130, %rd1021, %rd366; + and.pred %p131, %p129, %p130; + setp.gt.u64 %p132, %rd1021, %rd366; + or.pred %p133, %p132, %p131; + selp.u64 %rd380, 1, 0, %p133; + setp.lt.u64 %p134, %rd377, %rd380; + selp.b64 %rd381, -1, 0, %p134; + add.s64 %rd38, %rd12, %rd11; + sub.s64 %rd382, %rd32, %rd38; + add.s64 %rd383, %rd382, %rd374; + mul.lo.s64 %rd384, %rd1014, -576460752303423506; + add.s64 %rd385, %rd383, %rd384; + add.s64 %rd386, %rd385, %rd36; + add.s64 %rd387, %rd386, %rd36; + add.s64 %rd388, %rd387, %rd353; + sub.s64 %rd389, %rd388, %rd372; + add.s64 %rd390, %rd389, %rd356; + add.s64 %rd391, %rd390, %rd358; + add.s64 %rd392, %rd391, %rd375; + add.s64 %rd393, %rd392, %rd360; + add.s64 %rd394, %rd393, %rd362; + add.s64 %rd395, %rd394, %rd369; + add.s64 %rd396, %rd395, %rd376; + add.s64 %rd1019, %rd396, %rd381; + sub.s64 %rd1020, %rd377, %rd380; + sub.s64 %rd1022, %rd364, %rd351; + setp.eq.s32 %p135, %r18, 0; + @%p135 bra $L__BB2_20; + +$L__BB2_7: + and.b32 %r13, %r18, 1; + setp.eq.b32 %p136, %r13, 1; + mov.pred %p137, 0; + xor.pred %p138, %p136, %p137; + not.pred %p139, %p138; + @%p139 bra $L__BB2_14; + + mov.u64 %rd398, 0; + mul.lo.s64 %rd399, %rd1015, %rd1020; + mul.hi.u64 %rd400, %rd1020, %rd1016; + add.s64 %rd401, %rd400, %rd399; + setp.lt.u64 %p140, %rd401, %rd400; + selp.u64 %rd402, 1, 0, %p140; + mul.hi.u64 %rd51, %rd1019, %rd1015; + mul.lo.s64 %rd403, %rd1016, %rd1019; + add.s64 %rd404, %rd401, %rd403; + setp.lt.u64 %p141, %rd404, %rd401; + selp.u64 %rd405, 1, 0, %p141; + mul.lo.s64 %rd406, %rd1015, %rd1019; + mul.hi.u64 %rd407, %rd1019, %rd1016; + add.s64 %rd408, %rd407, %rd406; + setp.lt.u64 %p142, %rd408, %rd407; + selp.u64 %rd52, 1, 0, %p142; + mul.hi.u64 %rd409, %rd1020, %rd1015; + add.s64 %rd410, %rd409, %rd408; + setp.lt.u64 %p143, %rd410, %rd409; + selp.u64 %rd53, 1, 0, %p143; + add.s64 %rd411, %rd410, %rd402; + add.s64 %rd412, %rd411, %rd405; + setp.lt.u64 %p144, %rd412, %rd410; + selp.u64 %rd54, 1, 0, %p144; + mul.lo.s64 %rd413, %rd1017, %rd1020; + mul.hi.u64 %rd414, %rd1020, 
%rd1018; + add.s64 %rd415, %rd414, %rd413; + setp.lt.u64 %p145, %rd415, %rd414; + selp.u64 %rd416, 1, 0, %p145; + mul.lo.s64 %rd417, %rd1018, %rd1019; + add.s64 %rd418, %rd415, %rd417; + setp.lt.u64 %p146, %rd418, %rd415; + selp.u64 %rd419, 1, 0, %p146; + mul.lo.s64 %rd420, %rd1017, %rd1019; + mul.hi.u64 %rd421, %rd1019, %rd1018; + add.s64 %rd422, %rd421, %rd420; + setp.lt.u64 %p147, %rd422, %rd421; + selp.u64 %rd423, 1, 0, %p147; + mul.hi.u64 %rd424, %rd1020, %rd1017; + add.s64 %rd425, %rd424, %rd422; + setp.lt.u64 %p148, %rd425, %rd424; + selp.u64 %rd426, 1, 0, %p148; + add.s64 %rd427, %rd425, %rd416; + add.s64 %rd428, %rd427, %rd419; + setp.lt.u64 %p149, %rd428, %rd425; + selp.u64 %rd429, 1, 0, %p149; + mul.lo.s64 %rd430, %rd1016, %rd1021; + mul.hi.u64 %rd431, %rd1016, %rd1022; + add.s64 %rd432, %rd431, %rd430; + setp.lt.u64 %p150, %rd432, %rd431; + selp.u64 %rd433, 1, 0, %p150; + mul.lo.s64 %rd434, %rd1015, %rd1022; + add.s64 %rd435, %rd432, %rd434; + setp.lt.u64 %p151, %rd435, %rd432; + selp.u64 %rd436, 1, 0, %p151; + mul.lo.s64 %rd437, %rd1015, %rd1021; + mul.hi.u64 %rd438, %rd1015, %rd1022; + add.s64 %rd439, %rd438, %rd437; + setp.lt.u64 %p152, %rd439, %rd438; + selp.u64 %rd440, 1, 0, %p152; + mul.hi.u64 %rd441, %rd1016, %rd1021; + add.s64 %rd442, %rd441, %rd439; + setp.lt.u64 %p153, %rd442, %rd441; + selp.u64 %rd443, 1, 0, %p153; + add.s64 %rd444, %rd442, %rd433; + add.s64 %rd445, %rd444, %rd436; + setp.lt.u64 %p154, %rd445, %rd442; + selp.u64 %rd446, 1, 0, %p154; + mul.lo.s64 %rd447, %rd1017, %rd1022; + mul.hi.u64 %rd448, %rd1022, %rd1018; + add.s64 %rd449, %rd448, %rd447; + setp.lt.u64 %p155, %rd449, %rd448; + selp.u64 %rd450, 1, 0, %p155; + mul.lo.s64 %rd451, %rd1018, %rd1021; + add.s64 %rd452, %rd449, %rd451; + setp.lt.u64 %p156, %rd452, %rd449; + selp.u64 %rd453, 1, 0, %p156; + mul.lo.s64 %rd454, %rd1017, %rd1021; + mul.hi.u64 %rd455, %rd1021, %rd1018; + add.s64 %rd456, %rd455, %rd454; + setp.lt.u64 %p157, %rd456, %rd455; + selp.u64 %rd457, 1, 0, %p157; + mul.hi.u64 %rd458, %rd1022, %rd1017; + add.s64 %rd459, %rd458, %rd456; + setp.lt.u64 %p158, %rd459, %rd458; + selp.u64 %rd460, 1, 0, %p158; + add.s64 %rd461, %rd459, %rd450; + add.s64 %rd462, %rd461, %rd453; + setp.lt.u64 %p159, %rd462, %rd459; + selp.u64 %rd463, 1, 0, %p159; + mul.lo.s64 %rd464, %rd1016, %rd1022; + mul.lo.s64 %rd465, %rd1018, %rd1020; + add.s64 %rd466, %rd464, %rd465; + setp.lt.u64 %p160, %rd466, %rd464; + selp.u64 %rd467, 1, 0, %p160; + add.s64 %rd468, %rd418, %rd467; + add.s64 %rd469, %rd468, %rd435; + setp.eq.s64 %p161, %rd469, %rd418; + and.pred %p162, %p160, %p161; + setp.lt.u64 %p163, %rd469, %rd418; + or.pred %p164, %p163, %p162; + selp.u64 %rd470, 1, 0, %p164; + add.s64 %rd471, %rd462, %rd466; + setp.lt.u64 %p165, %rd471, %rd462; + selp.u64 %rd472, 1, 0, %p165; + mul.hi.u64 %rd473, %rd398, %rd1018; + add.s64 %rd474, %rd469, %rd473; + mul.hi.u64 %rd475, %rd1021, %rd1017; + add.s64 %rd476, %rd474, %rd475; + mul.hi.u64 %rd477, %rd1022, %rd398; + add.s64 %rd478, %rd476, %rd477; + add.s64 %rd479, %rd478, %rd457; + add.s64 %rd480, %rd479, %rd460; + add.s64 %rd481, %rd480, %rd463; + add.s64 %rd482, %rd481, %rd472; + setp.eq.s64 %p166, %rd482, %rd469; + and.pred %p167, %p165, %p166; + setp.lt.u64 %p168, %rd482, %rd469; + or.pred %p169, %p168, %p167; + selp.u64 %rd483, 1, 0, %p169; + mul.lo.s64 %rd484, %rd1016, %rd1020; + add.s64 %rd485, %rd428, %rd484; + setp.lt.u64 %p170, %rd485, %rd428; + selp.u64 %rd486, 1, 0, %p170; + mul.hi.u64 %rd55, %rd1020, %rd398; + add.s64 %rd487, %rd55, %rd404; + add.s64 %rd488, 
%rd487, %rd473; + mul.hi.u64 %rd489, %rd1019, %rd1017; + add.s64 %rd490, %rd488, %rd489; + add.s64 %rd491, %rd490, %rd423; + add.s64 %rd492, %rd491, %rd426; + add.s64 %rd493, %rd492, %rd429; + add.s64 %rd494, %rd493, %rd486; + setp.eq.s64 %p171, %rd494, %rd404; + and.pred %p172, %p170, %p171; + setp.lt.u64 %p173, %rd494, %rd404; + or.pred %p174, %p173, %p172; + selp.u64 %rd495, 1, 0, %p174; + add.s64 %rd496, %rd412, %rd495; + setp.lt.u64 %p175, %rd496, %rd412; + selp.u64 %rd56, 1, 0, %p175; + add.s64 %rd497, %rd445, %rd485; + setp.lt.u64 %p176, %rd497, %rd445; + selp.u64 %rd498, 1, 0, %p176; + mul.hi.u64 %rd499, %rd1015, %rd1021; + mul.hi.u64 %rd500, %rd398, %rd1022; + add.s64 %rd501, %rd499, %rd500; + mul.hi.u64 %rd502, %rd1016, %rd398; + add.s64 %rd503, %rd501, %rd502; + add.s64 %rd504, %rd503, %rd440; + add.s64 %rd505, %rd504, %rd494; + add.s64 %rd506, %rd505, %rd443; + add.s64 %rd507, %rd506, %rd446; + add.s64 %rd508, %rd507, %rd498; + setp.eq.s64 %p177, %rd508, %rd494; + and.pred %p178, %p176, %p177; + setp.lt.u64 %p179, %rd508, %rd494; + or.pred %p180, %p179, %p178; + selp.u64 %rd509, 1, 0, %p180; + add.s64 %rd57, %rd496, %rd509; + add.s64 %rd510, %rd497, %rd470; + add.s64 %rd59, %rd510, %rd483; + setp.lt.u64 %p181, %rd59, %rd497; + selp.u64 %rd511, 1, 0, %p181; + add.s64 %rd60, %rd508, %rd511; + setp.lt.u64 %p7, %rd60, %rd508; + mul.lo.s64 %rd512, %rd1018, %rd1022; + mul.lo.s64 %rd513, %rd512, 576460752303423504; + mov.u64 %rd514, -1; + mul.hi.u64 %rd515, %rd471, %rd514; + mul.hi.u64 %rd516, %rd512, %rd514; + neg.s64 %rd61, %rd512; + mul.hi.u64 %rd517, %rd452, %rd514; + sub.s64 %rd518, %rd516, %rd512; + setp.lt.u64 %p182, %rd518, %rd61; + selp.u64 %rd519, 1, 0, %p182; + neg.s64 %rd520, %rd452; + sub.s64 %rd521, %rd517, %rd452; + sub.s64 %rd62, %rd518, %rd452; + setp.lt.u64 %p183, %rd62, %rd518; + selp.u64 %rd522, 1, 0, %p183; + add.s64 %rd523, %rd522, %rd519; + setp.lt.u64 %p184, %rd521, %rd520; + selp.u64 %rd524, 1, 0, %p184; + add.s64 %rd525, %rd521, %rd518; + setp.lt.u64 %p185, %rd525, %rd521; + selp.u64 %rd526, 1, 0, %p185; + sub.s64 %rd527, %rd525, %rd471; + setp.lt.u64 %p186, %rd527, %rd525; + selp.u64 %rd528, 1, 0, %p186; + add.s64 %rd64, %rd523, %rd527; + setp.lt.u64 %p187, %rd64, %rd523; + selp.u64 %rd529, 1, 0, %p187; + add.s64 %rd530, %rd515, %rd513; + add.s64 %rd531, %rd530, %rd516; + sub.s64 %rd532, %rd531, %rd471; + add.s64 %rd533, %rd532, %rd521; + add.s64 %rd534, %rd533, %rd519; + sub.s64 %rd535, %rd534, %rd482; + add.s64 %rd536, %rd535, %rd524; + add.s64 %rd537, %rd536, %rd526; + add.s64 %rd538, %rd537, %rd528; + add.s64 %rd63, %rd538, %rd529; + mov.u64 %rd397, 1; + mul.hi.u64 %rd539, %rd64, %rd397; + add.s64 %rd65, %rd63, %rd539; + mul.hi.u64 %rd66, %rd398, %rd61; + mul.lo.s64 %rd540, %rd512, -576460752303423505; + add.s64 %rd67, %rd66, %rd540; + setp.lt.u64 %p9, %rd67, %rd66; + mul.hi.u64 %rd541, %rd61, %rd397; + mul.hi.u64 %rd542, %rd62, %rd397; + mul.hi.u64 %rd543, %rd61, %rd398; + mul.hi.u64 %rd544, %rd62, %rd398; + add.s64 %rd545, %rd541, %rd62; + setp.lt.u64 %p188, %rd545, %rd541; + selp.u64 %rd546, 1, 0, %p188; + add.s64 %rd547, %rd543, %rd542; + setp.lt.u64 %p189, %rd547, %rd543; + selp.u64 %rd548, 1, 0, %p189; + add.s64 %rd549, %rd547, %rd546; + setp.lt.u64 %p190, %rd549, %rd547; + selp.u64 %rd550, 1, 0, %p190; + add.s64 %rd68, %rd67, %rd65; + setp.lt.u64 %p10, %rd68, %rd67; + add.s64 %rd71, %rd549, %rd64; + setp.lt.u64 %p11, %rd71, %rd549; + selp.u64 %rd551, 1, 0, %p11; + add.s64 %rd552, %rd68, %rd12; + add.s64 %rd553, %rd552, %rd543; + add.s64 %rd554, 
%rd553, %rd544; + add.s64 %rd555, %rd554, %rd548; + add.s64 %rd556, %rd555, %rd550; + add.s64 %rd70, %rd556, %rd551; + add.s64 %rd557, %rd71, %rd471; + setp.lt.u64 %p191, %rd557, %rd71; + selp.u64 %rd558, 1, 0, %p191; + setp.ne.s64 %p192, %rd512, 0; + selp.u64 %rd559, 1, 0, %p192; + add.s64 %rd560, %rd518, %rd559; + add.s64 %rd561, %rd560, %rd541; + setp.eq.s64 %p193, %rd560, %rd62; + and.pred %p194, %p192, %p193; + setp.lt.u64 %p195, %rd561, %rd545; + or.pred %p196, %p195, %p194; + selp.u64 %rd562, 1, 0, %p196; + add.s64 %rd73, %rd557, %rd562; + setp.lt.u64 %p197, %rd73, %rd557; + selp.u64 %rd563, 1, 0, %p197; + add.s64 %rd564, %rd482, %rd558; + add.s64 %rd72, %rd564, %rd563; + setp.ne.s64 %p198, %rd72, 0; + setp.ne.s64 %p199, %rd73, %rd71; + or.pred %p200, %p199, %p198; + not.pred %p201, %p196; + or.pred %p202, %p200, %p201; + not.pred %p203, %p202; + mov.u64 %rd1023, %rd397; + @%p203 bra $L__BB2_10; + + setp.eq.s64 %p204, %rd72, 0; + setp.lt.u64 %p205, %rd73, %rd71; + and.pred %p206, %p205, %p204; + add.s64 %rd565, %rd72, %rd70; + setp.lt.u64 %p207, %rd565, %rd72; + or.pred %p208, %p207, %p206; + selp.u64 %rd1023, 1, 0, %p208; + +$L__BB2_10: + mul.hi.u64 %rd1012, %rd64, %rd397; + add.s64 %rd1011, %rd63, %rd1012; + setp.lt.u64 %p420, %rd1011, %rd63; + setp.lt.u64 %p416, %rd57, %rd496; + selp.u64 %rd1002, 1, 0, %p416; + mul.hi.u64 %rd1001, %rd398, %rd1016; + selp.u64 %rd566, 1, 0, %p7; + add.s64 %rd567, %rd57, %rd566; + setp.lt.u64 %p209, %rd567, %rd57; + mul.hi.u64 %rd569, %rd64, %rd398; + mul.hi.u64 %rd570, %rd63, %rd398; + mul.lo.s64 %rd571, %rd64, 576460752303423505; + mov.u64 %rd572, 576460752303423505; + add.s64 %rd573, %rd571, %rd569; + setp.lt.u64 %p210, %rd573, %rd571; + mul.lo.s64 %rd574, %rd63, 576460752303423505; + add.s64 %rd575, %rd574, %rd570; + setp.lt.u64 %p211, %rd575, %rd574; + mul.hi.u64 %rd576, %rd64, %rd572; + add.s64 %rd577, %rd575, %rd576; + setp.lt.u64 %p212, %rd577, %rd575; + selp.u64 %rd578, 1, 0, %p210; + add.s64 %rd579, %rd577, %rd578; + setp.lt.u64 %p213, %rd579, %rd577; + mul.hi.u64 %rd581, %rd63, %rd397; + add.s64 %rd582, %rd581, %rd569; + setp.lt.u64 %p214, %rd582, %rd581; + selp.u64 %rd583, 1, 0, %p420; + add.s64 %rd584, %rd582, %rd583; + setp.lt.u64 %p215, %rd584, %rd582; + mul.lo.s64 %rd585, %rd62, 576460752303423505; + mul.hi.u64 %rd586, %rd572, %rd61; + add.s64 %rd587, %rd586, %rd585; + setp.lt.u64 %p216, %rd587, %rd586; + mul.hi.u64 %rd588, %rd398, %rd62; + add.s64 %rd589, %rd588, %rd587; + setp.lt.u64 %p217, %rd589, %rd588; + selp.u64 %rd590, 1, 0, %p9; + add.s64 %rd591, %rd589, %rd590; + setp.lt.u64 %p218, %rd591, %rd589; + add.s64 %rd592, %rd570, %rd569; + add.s64 %rd593, %rd592, %rd12; + add.s64 %rd594, %rd593, %rd573; + selp.u64 %rd595, 1, 0, %p214; + add.s64 %rd596, %rd594, %rd595; + selp.u64 %rd597, 1, 0, %p215; + add.s64 %rd598, %rd596, %rd597; + setp.lt.u64 %p219, %rd598, %rd573; + selp.u64 %rd599, 1, 0, %p219; + add.s64 %rd600, %rd579, %rd599; + setp.lt.u64 %p220, %rd600, %rd579; + add.s64 %rd601, %rd591, %rd584; + setp.lt.u64 %p221, %rd601, %rd591; + add.s64 %rd602, %rd66, %rd11; + mul.hi.u64 %rd603, %rd572, %rd62; + add.s64 %rd604, %rd602, %rd603; + selp.u64 %rd605, 1, 0, %p216; + add.s64 %rd606, %rd604, %rd605; + selp.u64 %rd607, 1, 0, %p217; + add.s64 %rd608, %rd606, %rd607; + selp.u64 %rd609, 1, 0, %p218; + add.s64 %rd610, %rd608, %rd609; + selp.u64 %rd611, 1, 0, %p221; + add.s64 %rd612, %rd610, %rd611; + add.s64 %rd613, %rd612, %rd598; + setp.lt.u64 %p222, %rd613, %rd612; + setp.eq.s64 %p223, %rd612, 0; + and.pred %p224, %p221, %p223; 
+ or.pred %p225, %p222, %p224; + selp.u64 %rd614, 1, 0, %p225; + add.s64 %rd615, %rd600, %rd614; + setp.lt.u64 %p226, %rd615, %rd600; + selp.u64 %rd616, 1, 0, %p10; + add.s64 %rd617, %rd601, %rd616; + setp.lt.u64 %p227, %rd70, %rd68; + setp.eq.s64 %p228, %rd70, %rd68; + and.pred %p229, %p11, %p228; + or.pred %p230, %p227, %p229; + selp.u64 %rd618, 1, 0, %p230; + add.s64 %rd619, %rd617, %rd618; + setp.lt.u64 %p231, %rd619, %rd601; + selp.u64 %rd620, 1, 0, %p231; + add.s64 %rd621, %rd613, %rd620; + setp.lt.u64 %p232, %rd621, %rd613; + selp.u64 %rd622, 1, 0, %p232; + add.s64 %rd623, %rd615, %rd622; + setp.lt.u64 %p233, %rd623, %rd615; + add.s64 %rd79, %rd1023, %rd59; + setp.lt.u64 %p234, %rd79, %rd1023; + selp.u64 %rd624, 1, 0, %p234; + add.s64 %rd78, %rd60, %rd624; + setp.lt.u64 %p235, %rd78, %rd60; + selp.u64 %rd625, 1, 0, %p235; + add.s64 %rd77, %rd567, %rd625; + setp.lt.u64 %p236, %rd77, %rd567; + selp.u64 %rd626, 1, 0, %p236; + add.s64 %rd627, %rd51, %rd1001; + add.s64 %rd628, %rd627, %rd55; + add.s64 %rd629, %rd628, %rd52; + add.s64 %rd630, %rd629, %rd53; + add.s64 %rd631, %rd630, %rd54; + add.s64 %rd632, %rd631, %rd56; + add.s64 %rd633, %rd632, %rd1002; + selp.u64 %rd634, 1, 0, %p209; + add.s64 %rd635, %rd633, %rd634; + add.s64 %rd76, %rd635, %rd626; + add.s64 %rd636, %rd77, %rd623; + setp.lt.u64 %p237, %rd636, %rd77; + selp.u64 %rd637, 1, 0, %p237; + add.s64 %rd638, %rd78, %rd621; + add.s64 %rd80, %rd79, %rd619; + setp.lt.u64 %p238, %rd80, %rd79; + selp.u64 %rd639, 1, 0, %p238; + add.s64 %rd81, %rd638, %rd639; + setp.eq.s64 %p239, %rd81, %rd78; + and.pred %p240, %p238, %p239; + setp.lt.u64 %p241, %rd81, %rd78; + or.pred %p242, %p241, %p240; + selp.u64 %rd640, 1, 0, %p242; + add.s64 %rd83, %rd636, %rd640; + setp.lt.u64 %p243, %rd83, %rd636; + selp.u64 %rd641, 1, 0, %p243; + add.s64 %rd642, %rd569, %rd11; + mul.hi.u64 %rd643, %rd63, %rd572; + add.s64 %rd644, %rd642, %rd643; + selp.u64 %rd645, 1, 0, %p211; + add.s64 %rd646, %rd644, %rd645; + selp.u64 %rd647, 1, 0, %p212; + add.s64 %rd648, %rd646, %rd647; + selp.u64 %rd649, 1, 0, %p213; + add.s64 %rd650, %rd648, %rd649; + selp.u64 %rd651, 1, 0, %p220; + add.s64 %rd652, %rd650, %rd651; + selp.u64 %rd653, 1, 0, %p226; + add.s64 %rd654, %rd652, %rd653; + selp.u64 %rd655, 1, 0, %p233; + add.s64 %rd656, %rd654, %rd655; + add.s64 %rd657, %rd656, %rd76; + add.s64 %rd658, %rd657, %rd637; + add.s64 %rd82, %rd658, %rd641; + setp.ne.s64 %p244, %rd82, %rd76; + setp.ne.s64 %p245, %rd83, %rd77; or.pred %p246, %p245, %p244; - selp.u64 %rd634, 1, 0, %p246; - setp.lt.u64 %p247, %rd627, %rd634; - sub.s64 %rd635, %rd627, %rd634; - sub.s64 %rd636, %rd615, %rd603; - selp.u64 %rd637, 1, 0, %p247; - selp.u64 %rd638, 1, 0, %p241; - selp.b64 %rd639, 576460752303423505, 0, %p229; - selp.b64 %rd640, -1, 0, %p230; - selp.b64 %rd641, -1, 0, %p231; - selp.u64 %rd642, 1, 0, %p240; - selp.b64 %rd643, -1, 0, %p232; - selp.b64 %rd644, -1, 0, %p233; - selp.b64 %rd645, -1, 0, %p239; - setp.ne.s64 %p248, %rd635, 0; - selp.b64 %rd646, -1, 0, %p248; - neg.s64 %rd647, %rd635; - setp.gt.u64 %p249, %rd636, 1; - selp.u64 %rd648, 1, 0, %p249; - setp.ne.s64 %p250, %rd633, %rd648; - or.pred %p251, %p249, %p250; - selp.u64 %rd649, 1, 0, %p251; - setp.lt.u64 %p252, %rd647, %rd649; - selp.b64 %rd650, -1, 0, %p252; - sub.s64 %rd651, %rd647, %rd649; - selp.b64 %rd652, -1, 0, %p249; - sub.s64 %rd653, %rd622, %rd636; - add.s64 %rd654, %rd651, %rd3; - setp.lt.u64 %p253, %rd654, %rd651; - selp.u64 %rd655, 1, 0, %p253; - add.s64 %rd92, %rd653, %rd5; - setp.lt.u64 %p254, %rd92, %rd653; - 
selp.u64 %rd656, 1, 0, %p254; - add.s64 %rd657, %rd4, %rd652; - add.s64 %rd658, %rd657, %rd656; - sub.s64 %rd90, %rd658, %rd632; - setp.eq.s64 %p255, %rd90, %rd4; - and.pred %p256, %p254, %p255; - setp.lt.u64 %p257, %rd90, %rd4; - or.pred %p258, %p257, %p256; - selp.u64 %rd659, 1, 0, %p258; - add.s64 %rd93, %rd654, %rd659; - setp.lt.u64 %p259, %rd93, %rd654; - selp.u64 %rd660, 1, 0, %p259; - add.s64 %rd661, %rd2, %rd22; - add.s64 %rd662, %rd661, 576460752303423505; - sub.s64 %rd663, %rd662, %rd60; - sub.s64 %rd664, %rd663, %rd60; - add.s64 %rd665, %rd664, %rd28; - sub.s64 %rd666, %rd665, %rd86; - add.s64 %rd667, %rd666, %rd639; - mul.lo.s64 %rd668, %rd768, 576460752303423506; - add.s64 %rd669, %rd667, %rd668; - sub.s64 %rd670, %rd669, %rd608; - add.s64 %rd671, %rd670, %rd625; - add.s64 %rd672, %rd671, %rd640; - add.s64 %rd673, %rd672, %rd641; - add.s64 %rd674, %rd673, %rd642; - add.s64 %rd675, %rd674, %rd643; - add.s64 %rd676, %rd675, %rd644; - add.s64 %rd677, %rd676, %rd645; - add.s64 %rd678, %rd677, %rd638; - add.s64 %rd679, %rd678, %rd637; - add.s64 %rd680, %rd679, %rd646; - add.s64 %rd681, %rd680, %rd650; - add.s64 %rd682, %rd681, %rd655; - add.s64 %rd91, %rd682, %rd660; - setp.eq.s64 %p260, %rd91, 576460752303423505; - or.b64 %rd683, %rd90, %rd92; - or.b64 %rd684, %rd683, %rd93; - setp.eq.s64 %p261, %rd684, 0; - and.pred %p262, %p261, %p260; - setp.gt.u64 %p263, %rd91, 576460752303423504; - xor.pred %p8, %p263, %p262; - selp.u64 %rd94, 1, 0, %p8; - mul.hi.u64 %rd95, %rd94, %rd748; - add.s64 %rd685, %rd95, %rd762; - add.s64 %rd96, %rd685, %rd95; - add.s64 %rd97, %rd96, %rd28; - setp.lt.u64 %p9, %rd97, %rd96; - setp.ne.s64 %p264, %rd91, %rd2; - setp.ne.s64 %p265, %rd93, %rd3; - or.pred %p266, %p265, %p264; - @%p266 bra $L__BB0_13; - - setp.lt.u64 %p268, %rd92, %rd5; - and.pred %p269, %p268, %p255; - or.pred %p271, %p257, %p269; - mov.u64 %rd769, %rd622; - @%p271 bra $L__BB0_14; - -$L__BB0_13: - setp.eq.s64 %p272, %rd91, %rd2; - setp.lt.u64 %p273, %rd93, %rd3; - and.pred %p274, %p273, %p272; - setp.lt.u64 %p275, %rd91, %rd2; - or.pred %p276, %p275, %p274; - selp.u64 %rd769, 1, 0, %p276; + @%p246 bra $L__BB2_12; -$L__BB0_14: - mov.u32 %r12, %ntid.x; - mov.u32 %r11, %tid.x; - mov.u32 %r10, %ctaid.x; - shl.b32 %r9, %r12, 1; - mad.lo.s32 %r8, %r9, %r10, %r11; - ld.param.u64 %rd757, [radix2_dit_butterfly_param_0]; - mul.wide.s32 %rd756, %r8, 32; - cvta.to.global.u64 %rd755, %rd757; - add.s64 %rd754, %rd755, %rd756; - shl.b32 %r7, %r12, 2; - mul.wide.s32 %rd753, %r7, 8; - add.s64 %rd752, %rd754, %rd753; - mov.u64 %rd751, -1; - mov.u64 %rd750, 0; - sub.s64 %rd687, %rd93, %rd97; - setp.lt.u64 %p277, %rd92, %rd94; - selp.b64 %rd688, -1, 0, %p277; - add.s64 %rd689, %rd95, %rd27; - mul.hi.u64 %rd691, %rd94, %rd622; - add.s64 %rd692, %rd689, %rd691; - sub.s64 %rd693, %rd90, %rd692; - add.s64 %rd694, %rd693, %rd688; - setp.gt.u64 %p278, %rd694, %rd90; - setp.eq.s64 %p279, %rd694, %rd90; - and.pred %p280, %p277, %p279; - or.pred %p281, %p278, %p280; - selp.u64 %rd695, 1, 0, %p281; - sub.s64 %rd696, %rd92, %rd94; - mul.hi.u64 %rd698, %rd769, %rd750; - add.s64 %rd699, %rd698, %rd20; - mul.hi.u64 %rd701, %rd769, %rd751; - add.s64 %rd702, %rd699, %rd701; - sub.s64 %rd703, %rd702, %rd769; - neg.s64 %rd704, %rd769; - setp.lt.u64 %p282, %rd703, %rd704; - selp.u64 %rd705, 1, 0, %p282; - add.s64 %rd706, %rd703, %rd60; - setp.lt.u64 %p283, %rd706, %rd703; - selp.u64 %rd707, 1, 0, %p283; - add.s64 %rd708, %rd706, %rd705; - setp.lt.u64 %p284, %rd708, %rd706; - selp.u64 %rd709, 1, 0, %p284; - sub.s64 %rd710, 
%rd687, %rd695; - add.s64 %rd711, %rd708, %rd710; - setp.lt.u64 %p285, %rd711, %rd708; - selp.u64 %rd712, 1, 0, %p285; - sub.s64 %rd713, %rd696, %rd769; - setp.lt.u64 %p286, %rd713, %rd696; - selp.u64 %rd714, 1, 0, %p286; - add.s64 %rd715, %rd694, %rd714; - add.s64 %rd716, %rd715, %rd703; - setp.eq.s64 %p287, %rd716, %rd694; - and.pred %p288, %p286, %p287; - setp.lt.u64 %p289, %rd716, %rd694; - or.pred %p290, %p289, %p288; - selp.u64 %rd717, 1, 0, %p290; - add.s64 %rd718, %rd711, %rd717; - setp.lt.u64 %p291, %rd718, %rd711; - selp.u64 %rd719, 1, 0, %p291; - sub.s64 %rd720, %rd74, %rd96; - selp.s64 %rd721, -1, 0, %p9; - add.s64 %rd722, %rd720, %rd721; - setp.lt.u64 %p292, %rd93, %rd97; - selp.b64 %rd723, -1, 0, %p292; - add.s64 %rd724, %rd722, %rd723; - add.s64 %rd725, %rd724, %rd91; - selp.b64 %rd726, -576460752303423505, 0, %p8; - add.s64 %rd727, %rd725, %rd726; - setp.lt.u64 %p293, %rd687, %rd695; - selp.b64 %rd728, -1, 0, %p293; - add.s64 %rd729, %rd727, %rd728; - mul.lo.s64 %rd730, %rd769, -576460752303423506; - add.s64 %rd731, %rd729, %rd730; - add.s64 %rd732, %rd731, %rd702; - add.s64 %rd733, %rd732, %rd705; - add.s64 %rd734, %rd733, %rd707; - add.s64 %rd735, %rd734, %rd709; - add.s64 %rd736, %rd735, %rd712; - add.s64 %rd737, %rd736, %rd719; - st.global.u64 [%rd754], %rd75; - st.global.u64 [%rd754+8], %rd73; - st.global.u64 [%rd754+16], %rd72; - st.global.u64 [%rd754+24], %rd71; - st.global.u64 [%rd752], %rd737; - st.global.u64 [%rd752+8], %rd718; - st.global.u64 [%rd752+16], %rd716; - st.global.u64 [%rd752+24], %rd713; + mov.u64 %rd1024, 1; + @%p242 bra $L__BB2_13; + +$L__BB2_12: + setp.eq.s64 %p252, %rd82, %rd76; + setp.lt.u64 %p253, %rd83, %rd77; + and.pred %p254, %p253, %p252; + setp.lt.u64 %p255, %rd82, %rd76; + or.pred %p256, %p255, %p254; + selp.u64 %rd1024, 1, 0, %p256; + +$L__BB2_13: + or.b64 %rd660, %rd81, %rd80; + or.b64 %rd661, %rd660, %rd83; + setp.eq.s64 %p257, %rd661, 0; + mov.u64 %rd662, 0; + setp.eq.s64 %p258, %rd82, 576460752303423505; + and.pred %p259, %p257, %p258; + setp.gt.u64 %p260, %rd82, 576460752303423504; + xor.pred %p261, %p260, %p259; + selp.u64 %rd663, 1, 0, %p261; + mov.u64 %rd664, -1; + mul.hi.u64 %rd665, %rd1024, %rd664; + sub.s64 %rd666, %rd665, %rd1024; + neg.s64 %rd667, %rd1024; + setp.lt.u64 %p262, %rd666, %rd667; + selp.u64 %rd668, 1, 0, %p262; + add.s64 %rd669, %rd666, %rd36; + setp.lt.u64 %p263, %rd669, %rd666; + selp.u64 %rd670, 1, 0, %p263; + add.s64 %rd671, %rd669, %rd668; + setp.lt.u64 %p264, %rd671, %rd669; + selp.u64 %rd672, 1, 0, %p264; + add.s64 %rd673, %rd671, %rd83; + setp.lt.u64 %p265, %rd673, %rd671; + selp.u64 %rd674, 1, 0, %p265; + add.s64 %rd675, %rd666, %rd81; + sub.s64 %rd676, %rd80, %rd1024; + setp.lt.u64 %p266, %rd676, %rd80; + selp.u64 %rd677, 1, 0, %p266; + add.s64 %rd678, %rd675, %rd677; + setp.eq.s64 %p267, %rd678, %rd81; + and.pred %p268, %p266, %p267; + setp.lt.u64 %p269, %rd678, %rd81; + or.pred %p270, %p269, %p268; + selp.u64 %rd679, 1, 0, %p270; + add.s64 %rd680, %rd673, %rd679; + setp.lt.u64 %p271, %rd680, %rd673; + selp.u64 %rd681, 1, 0, %p271; + mov.u64 %rd682, 1; + mul.hi.u64 %rd683, %rd663, %rd682; + mul.hi.u64 %rd684, %rd663, %rd662; + add.s64 %rd685, %rd684, %rd12; + setp.lt.u64 %p272, %rd685, %rd684; + selp.b64 %rd686, -576460752303423505, 0, %p261; + selp.b64 %rd687, -1, 0, %p272; + setp.lt.u64 %p273, %rd680, %rd685; + selp.b64 %rd688, -1, 0, %p273; + sub.s64 %rd689, %rd680, %rd685; + sub.s64 %rd690, %rd678, %rd683; + setp.lt.u64 %p274, %rd676, %rd663; + selp.b64 %rd691, -1, 0, %p274; + add.s64 %rd1021, 
%rd690, %rd691; + setp.eq.s64 %p275, %rd1021, %rd678; + and.pred %p276, %p274, %p275; + setp.gt.u64 %p277, %rd1021, %rd678; + or.pred %p278, %p277, %p276; + selp.u64 %rd692, 1, 0, %p278; + setp.lt.u64 %p279, %rd689, %rd692; + selp.b64 %rd693, -1, 0, %p279; + sub.s64 %rd694, %rd82, %rd38; + add.s64 %rd695, %rd694, %rd686; + mul.lo.s64 %rd696, %rd1024, -576460752303423506; + add.s64 %rd697, %rd695, %rd696; + add.s64 %rd698, %rd697, %rd36; + add.s64 %rd699, %rd698, %rd36; + add.s64 %rd700, %rd699, %rd665; + sub.s64 %rd701, %rd700, %rd684; + add.s64 %rd702, %rd701, %rd668; + add.s64 %rd703, %rd702, %rd670; + add.s64 %rd704, %rd703, %rd687; + add.s64 %rd705, %rd704, %rd672; + add.s64 %rd706, %rd705, %rd674; + add.s64 %rd707, %rd706, %rd681; + add.s64 %rd708, %rd707, %rd688; + add.s64 %rd1019, %rd708, %rd693; + sub.s64 %rd1020, %rd689, %rd692; + sub.s64 %rd1022, %rd676, %rd663; + +$L__BB2_14: + mov.u64 %rd710, 0; + shr.u32 %r18, %r18, 1; + mul.lo.s64 %rd711, %rd1015, %rd1016; + mul.hi.u64 %rd712, %rd1016, %rd1016; + add.s64 %rd713, %rd712, %rd711; + setp.lt.u64 %p280, %rd713, %rd712; + selp.u64 %rd714, 1, 0, %p280; + mul.hi.u64 %rd95, %rd1015, %rd1015; + add.s64 %rd715, %rd713, %rd711; + setp.lt.u64 %p281, %rd715, %rd713; + selp.u64 %rd716, 1, 0, %p281; + mul.lo.s64 %rd717, %rd1015, %rd1015; + mul.hi.u64 %rd718, %rd1015, %rd1016; + add.s64 %rd719, %rd718, %rd717; + setp.lt.u64 %p282, %rd719, %rd718; + selp.u64 %rd96, 1, 0, %p282; + mul.hi.u64 %rd720, %rd1016, %rd1015; + add.s64 %rd721, %rd720, %rd719; + setp.lt.u64 %p283, %rd721, %rd720; + selp.u64 %rd97, 1, 0, %p283; + add.s64 %rd722, %rd721, %rd714; + add.s64 %rd723, %rd722, %rd716; + setp.lt.u64 %p284, %rd723, %rd721; + selp.u64 %rd98, 1, 0, %p284; + mul.lo.s64 %rd724, %rd1016, %rd1017; + mul.hi.u64 %rd725, %rd1016, %rd1018; + add.s64 %rd726, %rd725, %rd724; + setp.lt.u64 %p285, %rd726, %rd725; + selp.u64 %rd727, 1, 0, %p285; + mul.lo.s64 %rd728, %rd1015, %rd1018; + add.s64 %rd729, %rd726, %rd728; + setp.lt.u64 %p286, %rd729, %rd726; + selp.u64 %rd730, 1, 0, %p286; + mul.lo.s64 %rd731, %rd1015, %rd1017; + mul.hi.u64 %rd732, %rd1015, %rd1018; + add.s64 %rd733, %rd732, %rd731; + setp.lt.u64 %p287, %rd733, %rd732; + selp.u64 %rd734, 1, 0, %p287; + mul.hi.u64 %rd735, %rd1016, %rd1017; + add.s64 %rd736, %rd735, %rd733; + setp.lt.u64 %p288, %rd736, %rd735; + selp.u64 %rd737, 1, 0, %p288; + add.s64 %rd738, %rd736, %rd727; + add.s64 %rd739, %rd738, %rd730; + setp.lt.u64 %p289, %rd739, %rd736; + selp.u64 %rd740, 1, 0, %p289; + mul.lo.s64 %rd741, %rd1017, %rd1018; + mul.hi.u64 %rd742, %rd1018, %rd1018; + add.s64 %rd743, %rd742, %rd741; + setp.lt.u64 %p290, %rd743, %rd742; + selp.u64 %rd744, 1, 0, %p290; + add.s64 %rd745, %rd743, %rd741; + setp.lt.u64 %p291, %rd745, %rd743; + selp.u64 %rd746, 1, 0, %p291; + mul.lo.s64 %rd747, %rd1017, %rd1017; + mul.hi.u64 %rd748, %rd1017, %rd1018; + add.s64 %rd749, %rd748, %rd747; + setp.lt.u64 %p292, %rd749, %rd748; + selp.u64 %rd750, 1, 0, %p292; + mul.hi.u64 %rd751, %rd1018, %rd1017; + add.s64 %rd752, %rd751, %rd749; + setp.lt.u64 %p293, %rd752, %rd751; + selp.u64 %rd753, 1, 0, %p293; + add.s64 %rd754, %rd752, %rd744; + add.s64 %rd755, %rd754, %rd746; + setp.lt.u64 %p294, %rd755, %rd752; + selp.u64 %rd756, 1, 0, %p294; + mul.lo.s64 %rd757, %rd1016, %rd1018; + shl.b64 %rd758, %rd757, 1; + mov.u64 %rd709, 1; + setp.lt.u64 %p295, %rd758, %rd757; + selp.u64 %rd759, 1, 0, %p295; + add.s64 %rd760, %rd729, %rd759; + add.s64 %rd761, %rd760, %rd729; + setp.lt.u64 %p296, %rd761, %rd760; + setp.eq.s64 %p297, %rd760, 0; + 
and.pred %p298, %p295, %p297; + or.pred %p299, %p296, %p298; + selp.u64 %rd762, 1, 0, %p299; + add.s64 %rd763, %rd755, %rd758; + setp.lt.u64 %p300, %rd763, %rd755; + selp.u64 %rd764, 1, 0, %p300; + mul.hi.u64 %rd765, %rd710, %rd1018; + add.s64 %rd766, %rd761, %rd765; + mul.hi.u64 %rd767, %rd1017, %rd1017; + add.s64 %rd768, %rd766, %rd767; + mul.hi.u64 %rd769, %rd1018, %rd710; + add.s64 %rd770, %rd768, %rd769; + add.s64 %rd771, %rd770, %rd750; + add.s64 %rd772, %rd771, %rd753; + add.s64 %rd773, %rd772, %rd756; + add.s64 %rd774, %rd773, %rd764; + setp.eq.s64 %p301, %rd774, %rd761; + and.pred %p302, %p300, %p301; + setp.lt.u64 %p303, %rd774, %rd761; + or.pred %p304, %p303, %p302; + selp.u64 %rd775, 1, 0, %p304; + mul.lo.s64 %rd776, %rd1016, %rd1016; + add.s64 %rd777, %rd739, %rd776; + setp.lt.u64 %p305, %rd777, %rd739; + selp.u64 %rd778, 1, 0, %p305; + mul.hi.u64 %rd99, %rd1016, %rd710; + add.s64 %rd779, %rd99, %rd715; + add.s64 %rd780, %rd779, %rd765; + mul.hi.u64 %rd781, %rd1015, %rd1017; + add.s64 %rd782, %rd780, %rd781; + add.s64 %rd783, %rd782, %rd734; + add.s64 %rd784, %rd783, %rd737; + add.s64 %rd785, %rd784, %rd740; + add.s64 %rd786, %rd785, %rd778; + setp.eq.s64 %p306, %rd786, %rd715; + and.pred %p307, %p305, %p306; + setp.lt.u64 %p308, %rd786, %rd715; + or.pred %p309, %p308, %p307; + selp.u64 %rd787, 1, 0, %p309; + add.s64 %rd788, %rd723, %rd787; + setp.lt.u64 %p310, %rd788, %rd723; + selp.u64 %rd100, 1, 0, %p310; + add.s64 %rd789, %rd739, %rd777; + setp.lt.u64 %p311, %rd789, %rd739; + selp.u64 %rd790, 1, 0, %p311; + add.s64 %rd791, %rd781, %rd765; + add.s64 %rd792, %rd791, %rd99; + add.s64 %rd793, %rd792, %rd734; + add.s64 %rd794, %rd793, %rd786; + add.s64 %rd795, %rd794, %rd737; + add.s64 %rd796, %rd795, %rd740; + add.s64 %rd797, %rd796, %rd790; + setp.eq.s64 %p312, %rd797, %rd786; + and.pred %p313, %p311, %p312; + setp.lt.u64 %p314, %rd797, %rd786; + or.pred %p315, %p314, %p313; + selp.u64 %rd798, 1, 0, %p315; + add.s64 %rd101, %rd788, %rd798; + setp.lt.u64 %p12, %rd101, %rd788; + selp.u64 %rd102, 1, 0, %p12; + add.s64 %rd799, %rd789, %rd762; + add.s64 %rd103, %rd799, %rd775; + setp.lt.u64 %p316, %rd103, %rd789; + selp.u64 %rd800, 1, 0, %p316; + add.s64 %rd104, %rd797, %rd800; + setp.lt.u64 %p13, %rd104, %rd797; + mul.lo.s64 %rd801, %rd1018, %rd1018; + mul.lo.s64 %rd802, %rd801, 576460752303423504; + mov.u64 %rd803, -1; + mul.hi.u64 %rd804, %rd763, %rd803; + mul.hi.u64 %rd805, %rd801, %rd803; + neg.s64 %rd105, %rd801; + mul.hi.u64 %rd806, %rd745, %rd803; + sub.s64 %rd807, %rd805, %rd801; + setp.lt.u64 %p317, %rd807, %rd105; + selp.u64 %rd808, 1, 0, %p317; + neg.s64 %rd809, %rd745; + sub.s64 %rd810, %rd806, %rd745; + sub.s64 %rd106, %rd807, %rd745; + setp.lt.u64 %p318, %rd106, %rd807; + selp.u64 %rd811, 1, 0, %p318; + add.s64 %rd812, %rd811, %rd808; + setp.lt.u64 %p319, %rd810, %rd809; + selp.u64 %rd813, 1, 0, %p319; + add.s64 %rd814, %rd810, %rd807; + setp.lt.u64 %p320, %rd814, %rd810; + selp.u64 %rd815, 1, 0, %p320; + sub.s64 %rd816, %rd814, %rd763; + setp.lt.u64 %p321, %rd816, %rd814; + selp.u64 %rd817, 1, 0, %p321; + add.s64 %rd108, %rd812, %rd816; + setp.lt.u64 %p322, %rd108, %rd812; + selp.u64 %rd818, 1, 0, %p322; + add.s64 %rd819, %rd804, %rd802; + add.s64 %rd820, %rd819, %rd805; + sub.s64 %rd821, %rd820, %rd763; + add.s64 %rd822, %rd821, %rd810; + add.s64 %rd823, %rd822, %rd808; + sub.s64 %rd824, %rd823, %rd774; + add.s64 %rd825, %rd824, %rd813; + add.s64 %rd826, %rd825, %rd815; + add.s64 %rd827, %rd826, %rd817; + add.s64 %rd107, %rd827, %rd818; + mul.hi.u64 %rd828, 
%rd108, %rd709; + add.s64 %rd109, %rd107, %rd828; + mul.hi.u64 %rd110, %rd710, %rd105; + mul.lo.s64 %rd829, %rd801, -576460752303423505; + add.s64 %rd111, %rd110, %rd829; + setp.lt.u64 %p15, %rd111, %rd110; + mul.hi.u64 %rd830, %rd105, %rd709; + mul.hi.u64 %rd831, %rd106, %rd709; + mul.hi.u64 %rd832, %rd105, %rd710; + mul.hi.u64 %rd833, %rd106, %rd710; + add.s64 %rd834, %rd830, %rd106; + setp.lt.u64 %p323, %rd834, %rd830; + selp.u64 %rd835, 1, 0, %p323; + add.s64 %rd836, %rd832, %rd831; + setp.lt.u64 %p324, %rd836, %rd832; + selp.u64 %rd837, 1, 0, %p324; + add.s64 %rd838, %rd836, %rd835; + setp.lt.u64 %p325, %rd838, %rd836; + selp.u64 %rd839, 1, 0, %p325; + add.s64 %rd112, %rd111, %rd109; + setp.lt.u64 %p16, %rd112, %rd111; + add.s64 %rd115, %rd838, %rd108; + setp.lt.u64 %p17, %rd115, %rd838; + selp.u64 %rd840, 1, 0, %p17; + add.s64 %rd841, %rd112, %rd12; + add.s64 %rd842, %rd841, %rd832; + add.s64 %rd843, %rd842, %rd833; + add.s64 %rd844, %rd843, %rd837; + add.s64 %rd845, %rd844, %rd839; + add.s64 %rd114, %rd845, %rd840; + add.s64 %rd846, %rd115, %rd763; + setp.lt.u64 %p326, %rd846, %rd115; + selp.u64 %rd847, 1, 0, %p326; + setp.ne.s64 %p327, %rd801, 0; + selp.u64 %rd848, 1, 0, %p327; + add.s64 %rd849, %rd807, %rd848; + add.s64 %rd850, %rd849, %rd830; + setp.eq.s64 %p328, %rd849, %rd106; + and.pred %p329, %p327, %p328; + setp.lt.u64 %p330, %rd850, %rd834; + or.pred %p331, %p330, %p329; + selp.u64 %rd851, 1, 0, %p331; + add.s64 %rd117, %rd846, %rd851; + setp.lt.u64 %p332, %rd117, %rd846; + selp.u64 %rd852, 1, 0, %p332; + add.s64 %rd853, %rd774, %rd847; + add.s64 %rd116, %rd853, %rd852; + setp.ne.s64 %p333, %rd116, 0; + setp.ne.s64 %p334, %rd117, %rd115; + or.pred %p335, %p334, %p333; + not.pred %p336, %p331; + or.pred %p337, %p335, %p336; + not.pred %p338, %p337; + mov.u64 %rd1029, %rd709; + @%p338 bra $L__BB2_16; + + setp.eq.s64 %p339, %rd116, 0; + setp.lt.u64 %p340, %rd117, %rd115; + and.pred %p341, %p340, %p339; + add.s64 %rd854, %rd116, %rd114; + setp.lt.u64 %p342, %rd854, %rd116; + or.pred %p343, %p342, %p341; + selp.u64 %rd1029, 1, 0, %p343; + +$L__BB2_16: + mul.hi.u64 %rd1006, %rd710, %rd1016; + mul.hi.u64 %rd1005, %rd108, %rd709; + add.s64 %rd1004, %rd107, %rd1005; + setp.lt.u64 %p417, %rd1004, %rd107; + selp.u64 %rd855, 1, 0, %p13; + add.s64 %rd856, %rd101, %rd855; + setp.lt.u64 %p344, %rd856, %rd101; + mul.hi.u64 %rd858, %rd108, %rd710; + mul.hi.u64 %rd859, %rd107, %rd710; + mul.lo.s64 %rd860, %rd108, 576460752303423505; + mov.u64 %rd861, 576460752303423505; + add.s64 %rd862, %rd860, %rd858; + setp.lt.u64 %p345, %rd862, %rd860; + mul.lo.s64 %rd863, %rd107, 576460752303423505; + add.s64 %rd864, %rd863, %rd859; + setp.lt.u64 %p346, %rd864, %rd863; + mul.hi.u64 %rd865, %rd108, %rd861; + add.s64 %rd866, %rd864, %rd865; + setp.lt.u64 %p347, %rd866, %rd864; + selp.u64 %rd867, 1, 0, %p345; + add.s64 %rd868, %rd866, %rd867; + setp.lt.u64 %p348, %rd868, %rd866; + mul.hi.u64 %rd870, %rd107, %rd709; + add.s64 %rd871, %rd870, %rd858; + setp.lt.u64 %p349, %rd871, %rd870; + selp.u64 %rd872, 1, 0, %p417; + add.s64 %rd873, %rd871, %rd872; + setp.lt.u64 %p350, %rd873, %rd871; + mul.lo.s64 %rd874, %rd106, 576460752303423505; + mul.hi.u64 %rd875, %rd861, %rd105; + add.s64 %rd876, %rd875, %rd874; + setp.lt.u64 %p351, %rd876, %rd875; + mul.hi.u64 %rd877, %rd710, %rd106; + add.s64 %rd878, %rd877, %rd876; + setp.lt.u64 %p352, %rd878, %rd877; + selp.u64 %rd879, 1, 0, %p15; + add.s64 %rd880, %rd878, %rd879; + setp.lt.u64 %p353, %rd880, %rd878; + add.s64 %rd881, %rd859, %rd858; + add.s64 %rd882, %rd881, 
%rd12; + add.s64 %rd883, %rd882, %rd862; + selp.u64 %rd884, 1, 0, %p349; + add.s64 %rd885, %rd883, %rd884; + selp.u64 %rd886, 1, 0, %p350; + add.s64 %rd887, %rd885, %rd886; + setp.lt.u64 %p354, %rd887, %rd862; + selp.u64 %rd888, 1, 0, %p354; + add.s64 %rd889, %rd868, %rd888; + setp.lt.u64 %p355, %rd889, %rd868; + add.s64 %rd890, %rd880, %rd873; + setp.lt.u64 %p356, %rd890, %rd880; + add.s64 %rd891, %rd110, %rd11; + mul.hi.u64 %rd892, %rd861, %rd106; + add.s64 %rd893, %rd891, %rd892; + selp.u64 %rd894, 1, 0, %p351; + add.s64 %rd895, %rd893, %rd894; + selp.u64 %rd896, 1, 0, %p352; + add.s64 %rd897, %rd895, %rd896; + selp.u64 %rd898, 1, 0, %p353; + add.s64 %rd899, %rd897, %rd898; + selp.u64 %rd900, 1, 0, %p356; + add.s64 %rd901, %rd899, %rd900; + add.s64 %rd902, %rd901, %rd887; + setp.lt.u64 %p357, %rd902, %rd901; + setp.eq.s64 %p358, %rd901, 0; + and.pred %p359, %p356, %p358; + or.pred %p360, %p357, %p359; + selp.u64 %rd903, 1, 0, %p360; + add.s64 %rd904, %rd889, %rd903; + setp.lt.u64 %p361, %rd904, %rd889; + selp.u64 %rd905, 1, 0, %p16; + add.s64 %rd906, %rd890, %rd905; + setp.lt.u64 %p362, %rd114, %rd112; + setp.eq.s64 %p363, %rd114, %rd112; + and.pred %p364, %p17, %p363; + or.pred %p365, %p362, %p364; + selp.u64 %rd907, 1, 0, %p365; + add.s64 %rd908, %rd906, %rd907; + setp.lt.u64 %p366, %rd908, %rd890; + selp.u64 %rd909, 1, 0, %p366; + add.s64 %rd910, %rd902, %rd909; + setp.lt.u64 %p367, %rd910, %rd902; + selp.u64 %rd911, 1, 0, %p367; + add.s64 %rd912, %rd904, %rd911; + setp.lt.u64 %p368, %rd912, %rd904; + add.s64 %rd123, %rd1029, %rd103; + setp.lt.u64 %p369, %rd123, %rd1029; + selp.u64 %rd913, 1, 0, %p369; + add.s64 %rd122, %rd104, %rd913; + setp.lt.u64 %p370, %rd122, %rd104; + selp.u64 %rd914, 1, 0, %p370; + add.s64 %rd121, %rd856, %rd914; + setp.lt.u64 %p371, %rd121, %rd856; + selp.u64 %rd915, 1, 0, %p371; + add.s64 %rd916, %rd95, %rd1006; + add.s64 %rd917, %rd916, %rd99; + add.s64 %rd918, %rd917, %rd96; + add.s64 %rd919, %rd918, %rd97; + add.s64 %rd920, %rd919, %rd98; + add.s64 %rd921, %rd920, %rd100; + add.s64 %rd922, %rd921, %rd102; + selp.u64 %rd923, 1, 0, %p344; + add.s64 %rd924, %rd922, %rd923; + add.s64 %rd120, %rd924, %rd915; + add.s64 %rd925, %rd121, %rd912; + setp.lt.u64 %p372, %rd925, %rd121; + selp.u64 %rd926, 1, 0, %p372; + add.s64 %rd927, %rd122, %rd910; + add.s64 %rd124, %rd123, %rd908; + setp.lt.u64 %p373, %rd124, %rd123; + selp.u64 %rd928, 1, 0, %p373; + add.s64 %rd125, %rd927, %rd928; + setp.eq.s64 %p374, %rd125, %rd122; + and.pred %p375, %p373, %p374; + setp.lt.u64 %p376, %rd125, %rd122; + or.pred %p377, %p376, %p375; + selp.u64 %rd929, 1, 0, %p377; + add.s64 %rd127, %rd925, %rd929; + setp.lt.u64 %p378, %rd127, %rd925; + selp.u64 %rd930, 1, 0, %p378; + add.s64 %rd931, %rd858, %rd11; + mul.hi.u64 %rd932, %rd107, %rd861; + add.s64 %rd933, %rd931, %rd932; + selp.u64 %rd934, 1, 0, %p346; + add.s64 %rd935, %rd933, %rd934; + selp.u64 %rd936, 1, 0, %p347; + add.s64 %rd937, %rd935, %rd936; + selp.u64 %rd938, 1, 0, %p348; + add.s64 %rd939, %rd937, %rd938; + selp.u64 %rd940, 1, 0, %p355; + add.s64 %rd941, %rd939, %rd940; + selp.u64 %rd942, 1, 0, %p361; + add.s64 %rd943, %rd941, %rd942; + selp.u64 %rd944, 1, 0, %p368; + add.s64 %rd945, %rd943, %rd944; + add.s64 %rd946, %rd945, %rd120; + add.s64 %rd947, %rd946, %rd926; + add.s64 %rd126, %rd947, %rd930; + setp.ne.s64 %p379, %rd126, %rd120; + setp.ne.s64 %p380, %rd127, %rd121; + or.pred %p381, %p380, %p379; + @%p381 bra $L__BB2_18; + + mov.u64 %rd1030, 1; + @%p377 bra $L__BB2_19; + +$L__BB2_18: + setp.eq.s64 %p387, %rd126, %rd120; 
+ setp.lt.u64 %p388, %rd127, %rd121; + and.pred %p389, %p388, %p387; + setp.lt.u64 %p390, %rd126, %rd120; + or.pred %p391, %p390, %p389; + selp.u64 %rd1030, 1, 0, %p391; + +$L__BB2_19: + or.b64 %rd949, %rd125, %rd124; + or.b64 %rd950, %rd949, %rd127; + setp.eq.s64 %p392, %rd950, 0; + mov.u64 %rd951, 0; + setp.eq.s64 %p393, %rd126, 576460752303423505; + and.pred %p394, %p392, %p393; + setp.gt.u64 %p395, %rd126, 576460752303423504; + xor.pred %p396, %p395, %p394; + selp.u64 %rd952, 1, 0, %p396; + mov.u64 %rd953, -1; + mul.hi.u64 %rd954, %rd1030, %rd953; + sub.s64 %rd955, %rd954, %rd1030; + neg.s64 %rd956, %rd1030; + setp.lt.u64 %p397, %rd955, %rd956; + selp.u64 %rd957, 1, 0, %p397; + add.s64 %rd958, %rd955, %rd36; + setp.lt.u64 %p398, %rd958, %rd955; + selp.u64 %rd959, 1, 0, %p398; + add.s64 %rd960, %rd958, %rd957; + setp.lt.u64 %p399, %rd960, %rd958; + selp.u64 %rd961, 1, 0, %p399; + add.s64 %rd962, %rd960, %rd127; + setp.lt.u64 %p400, %rd962, %rd960; + selp.u64 %rd963, 1, 0, %p400; + add.s64 %rd964, %rd955, %rd125; + sub.s64 %rd965, %rd124, %rd1030; + setp.lt.u64 %p401, %rd965, %rd124; + selp.u64 %rd966, 1, 0, %p401; + add.s64 %rd967, %rd964, %rd966; + setp.eq.s64 %p402, %rd967, %rd125; + and.pred %p403, %p401, %p402; + setp.lt.u64 %p404, %rd967, %rd125; + or.pred %p405, %p404, %p403; + selp.u64 %rd968, 1, 0, %p405; + add.s64 %rd969, %rd962, %rd968; + setp.lt.u64 %p406, %rd969, %rd962; + selp.u64 %rd970, 1, 0, %p406; + mov.u64 %rd971, 1; + mul.hi.u64 %rd972, %rd952, %rd971; + mul.hi.u64 %rd973, %rd952, %rd951; + add.s64 %rd974, %rd973, %rd12; + setp.lt.u64 %p407, %rd974, %rd973; + selp.b64 %rd975, -576460752303423505, 0, %p396; + selp.b64 %rd976, -1, 0, %p407; + setp.lt.u64 %p408, %rd969, %rd974; + selp.b64 %rd977, -1, 0, %p408; + sub.s64 %rd978, %rd969, %rd974; + sub.s64 %rd979, %rd967, %rd972; + setp.lt.u64 %p409, %rd965, %rd952; + selp.b64 %rd980, -1, 0, %p409; + add.s64 %rd1017, %rd979, %rd980; + setp.eq.s64 %p410, %rd1017, %rd967; + and.pred %p411, %p409, %p410; + setp.gt.u64 %p412, %rd1017, %rd967; + or.pred %p413, %p412, %p411; + selp.u64 %rd981, 1, 0, %p413; + setp.lt.u64 %p414, %rd978, %rd981; + selp.b64 %rd982, -1, 0, %p414; + sub.s64 %rd983, %rd126, %rd38; + add.s64 %rd984, %rd983, %rd975; + mul.lo.s64 %rd985, %rd1030, -576460752303423506; + add.s64 %rd986, %rd984, %rd985; + add.s64 %rd987, %rd986, %rd36; + add.s64 %rd988, %rd987, %rd36; + add.s64 %rd989, %rd988, %rd954; + sub.s64 %rd990, %rd989, %rd973; + add.s64 %rd991, %rd990, %rd957; + add.s64 %rd992, %rd991, %rd959; + add.s64 %rd993, %rd992, %rd976; + add.s64 %rd994, %rd993, %rd961; + add.s64 %rd995, %rd994, %rd963; + add.s64 %rd996, %rd995, %rd970; + add.s64 %rd997, %rd996, %rd977; + add.s64 %rd1015, %rd997, %rd982; + sub.s64 %rd1016, %rd978, %rd981; + sub.s64 %rd1018, %rd965, %rd952; + setp.ne.s32 %p415, %r18, 0; + @%p415 bra $L__BB2_7; + +$L__BB2_20: + ld.param.u64 %rd1003, [calc_twiddles_bitrev_param_0]; + mov.u32 %r17, %tid.x; + mov.u32 %r16, %ctaid.x; + mov.u32 %r15, %ntid.x; + mad.lo.s32 %r14, %r15, %r16, %r17; + cvta.to.global.u64 %rd998, %rd1003; + mul.wide.u32 %rd999, %r14, 32; + add.s64 %rd1000, %rd998, %rd999; + st.global.u64 [%rd1000], %rd1019; + st.global.u64 [%rd1000+8], %rd1020; + st.global.u64 [%rd1000+16], %rd1021; + st.global.u64 [%rd1000+24], %rd1022; + +$L__BB2_21: + ret; + +} + // .globl bitrev_permutation +.visible .entry bitrev_permutation( + .param .u64 bitrev_permutation_param_0, + .param .u64 bitrev_permutation_param_1, + .param .u32 bitrev_permutation_param_2 +) +{ + .reg .pred %p<3>; + .reg .b32 
%r<11>; + .reg .b64 %rd<13>; + + + ld.param.u64 %rd1, [bitrev_permutation_param_0]; + ld.param.u64 %rd2, [bitrev_permutation_param_1]; + ld.param.u32 %r2, [bitrev_permutation_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r4, %r3, %r5; + setp.ge.u32 %p1, %r1, %r2; + @%p1 bra $L__BB3_2; + + clz.b32 %r6, %r2; + add.s32 %r7, %r6, 1; + brev.b32 %r8, %r1; + shr.u32 %r9, %r8, %r7; + setp.eq.s32 %p2, %r2, 1; + selp.b32 %r10, %r1, %r9, %p2; + cvta.to.global.u64 %rd3, %rd1; + mul.wide.u32 %rd4, %r10, 32; + add.s64 %rd5, %rd3, %rd4; + ld.global.u64 %rd6, [%rd5]; + ld.global.u64 %rd7, [%rd5+8]; + ld.global.u64 %rd8, [%rd5+16]; + ld.global.u64 %rd9, [%rd5+24]; + cvta.to.global.u64 %rd10, %rd2; + mul.wide.u32 %rd11, %r1, 32; + add.s64 %rd12, %rd10, %rd11; + st.global.u64 [%rd12], %rd6; + st.global.u64 [%rd12+8], %rd7; + st.global.u64 [%rd12+16], %rd8; + st.global.u64 [%rd12+24], %rd9; + +$L__BB3_2: ret; } diff --git a/math/src/gpu/metal/shaders/fft/fft.h.metal b/math/src/gpu/metal/shaders/fft/fft.h.metal index 9e7297aee..0f241ead1 100644 --- a/math/src/gpu/metal/shaders/fft/fft.h.metal +++ b/math/src/gpu/metal/shaders/fft/fft.h.metal @@ -4,22 +4,26 @@ template [[kernel]] void radix2_dit_butterfly( - device Fp* input [[ buffer(0) ]], - constant Fp* twiddles [[ buffer(1) ]], - uint32_t group [[ threadgroup_position_in_grid ]], - uint32_t pos_in_group [[ thread_position_in_threadgroup ]], - uint32_t half_group_size [[ threads_per_threadgroup ]] + device Fp* input [[ buffer(0) ]], + constant Fp* twiddles [[ buffer(1) ]], + constant uint32_t& stage [[ buffer(2) ]], + uint32_t thread_count [[ threads_per_grid ]], + uint32_t thread_pos [[ thread_position_in_grid ]] ) { - uint32_t i = group * half_group_size * 2 + pos_in_group; + uint32_t half_group_size = thread_count >> stage; // thread_count / group_count + uint32_t group = thread_pos >> metal::ctz(half_group_size); // thread_pos / half_group_size - Fp w = twiddles[group]; - Fp a = input[i]; - Fp b = input[i + half_group_size]; + uint32_t pos_in_group = thread_pos & (half_group_size - 1); // thread_pos % half_group_size + uint32_t i = thread_pos * 2 - pos_in_group; // multiply quotient by 2 - Fp res_1 = a + w*b; - Fp res_2 = a - w*b; + Fp w = twiddles[group]; + Fp a = input[i]; + Fp b = input[i + half_group_size]; - input[i] = res_1; // --\/-- - input[i + half_group_size] = res_2; // --/\-- + Fp res_1 = a + w*b; + Fp res_2 = a - w*b; + + input[i] = res_1; // --\/-- + input[i + half_group_size] = res_2; // --/\-- } diff --git a/math/src/gpu/metal/shaders/field/stark256.h.metal b/math/src/gpu/metal/shaders/field/stark256.h.metal index 8a18cce90..dd78ffefe 100644 --- a/math/src/gpu/metal/shaders/field/stark256.h.metal +++ b/math/src/gpu/metal/shaders/field/stark256.h.metal @@ -17,10 +17,10 @@ namespace { template [[ host_name("radix2_dit_butterfly_stark256") ]] [[kernel]] void radix2_dit_butterfly( - device Fp*, - constant Fp*, - uint32_t, - uint32_t, + device Fp*, + constant Fp*, + constant uint32_t&, + uint32_t, uint32_t ); From 595ae77dfb2909431a996104e8d4a0b59e31ab54 Mon Sep 17 00:00:00 2001 From: JihoonSong Date: Tue, 18 Jul 2023 19:43:49 +0900 Subject: [PATCH 3/7] Add serde to MontgomeryBackendPrimeField (#509) * Add serde to MontgomeryBackendPrimeField * Fix lint * Configure serde as one of std feature * Mark JSON serializer as optional dependencies Please note that a test for `lambdaworks-serde` feature is not added to CI test. However, you can verify it with the following command. 
cargo test --package lambdaworks-math --features lambdaworks-serde montgomery_backend_serialization_deserialization See #509 for detailed info. --------- Co-authored-by: Jihoon Song Co-authored-by: Jihoon Song --- math/Cargo.toml | 3 + math/src/field/element.rs | 66 +++++++++++++++++++ .../fields/montgomery_backed_prime_fields.rs | 10 +++ 3 files changed, 79 insertions(+) diff --git a/math/Cargo.toml b/math/Cargo.toml index 64bd40e3b..0b9877420 100644 --- a/math/Cargo.toml +++ b/math/Cargo.toml @@ -9,6 +9,8 @@ license.workspace = true [dependencies] thiserror = { version = "1.0", optional = true } +serde = { version = "1.0", features = ["derive"], optional = true } +serde_json = { version = "1.0", optional = true } # rayon rayon = { version = "1.7", optional = true } @@ -33,6 +35,7 @@ iai-callgrind.workspace = true rayon = ["dep:rayon"] default = ["rayon", "std"] std = ["dep:thiserror"] +lambdaworks-serde = ["dep:serde", "dep:serde_json", "std"] # gpu metal = [ diff --git a/math/src/field/element.rs b/math/src/field/element.rs index dd628e6b9..9f673def5 100644 --- a/math/src/field/element.rs +++ b/math/src/field/element.rs @@ -6,7 +6,15 @@ use crate::unsigned_integer::traits::IsUnsignedInteger; use core::fmt; use core::fmt::Debug; use core::iter::Sum; +#[cfg(feature = "lambdaworks-serde")] +use core::marker::PhantomData; use core::ops::{Add, AddAssign, Div, Mul, Neg, Sub}; +#[cfg(feature = "lambdaworks-serde")] +use serde::de::{self, Deserializer, MapAccess, Visitor}; +#[cfg(feature = "lambdaworks-serde")] +use serde::ser::{Serialize, SerializeStruct, Serializer}; +#[cfg(feature = "lambdaworks-serde")] +use serde::Deserialize; use super::fields::montgomery_backed_prime_fields::{IsModulus, MontgomeryBackendPrimeField}; use super::traits::{IsPrimeField, LegendreSymbol}; @@ -421,6 +429,64 @@ impl FieldElement { } } +#[cfg(feature = "lambdaworks-serde")] +impl Serialize for FieldElement { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut state = serializer.serialize_struct("FieldElement", 1)?; + state.serialize_field("value", &F::representative(self.value()).to_string())?; + state.end() + } +} + +#[cfg(feature = "lambdaworks-serde")] +impl<'de, F: IsPrimeField> Deserialize<'de> for FieldElement { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(field_identifier, rename_all = "lowercase")] + enum Field { + Value, + } + + struct FieldElementVisitor(PhantomData F>); + + impl<'de, F: IsPrimeField> Visitor<'de> for FieldElementVisitor { + type Value = FieldElement; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct FieldElement") + } + + fn visit_map(self, mut map: M) -> Result, M::Error> + where + M: MapAccess<'de>, + { + let mut value = None; + while let Some(key) = map.next_key()? 
{ + match key { + Field::Value => { + if value.is_some() { + return Err(de::Error::duplicate_field("value")); + } + value = Some(map.next_value()?); + } + } + } + let value = value.ok_or_else(|| de::Error::missing_field("value"))?; + Ok(FieldElement::from_hex(value).unwrap()) + } + } + + const FIELDS: &[&str] = &["value"]; + deserializer.deserialize_struct("FieldElement", FIELDS, FieldElementVisitor(PhantomData)) + } +} + impl fmt::Display for FieldElement> where diff --git a/math/src/field/fields/montgomery_backed_prime_fields.rs b/math/src/field/fields/montgomery_backed_prime_fields.rs index 2c7cc2cca..53283df58 100644 --- a/math/src/field/fields/montgomery_backed_prime_fields.rs +++ b/math/src/field/fields/montgomery_backed_prime_fields.rs @@ -427,6 +427,16 @@ mod tests_u384_prime_fields { assert_eq!(x * y, c); } + #[test] + #[cfg(feature = "lambdaworks-serde")] + fn montgomery_backend_serialization_deserialization() { + let x = U384F23Element::from(11_u64); + let x_serialized = serde_json::to_string(&x).unwrap(); + let x_deserialized: U384F23Element = serde_json::from_str(&x_serialized).unwrap(); + assert_eq!(x_serialized, "{\"value\":\"0xb\"}"); + assert_eq!(x_deserialized, x); + } + const ORDER: usize = 23; #[test] fn two_plus_one_is_three() { From 38f935be542fb79baaf3c71806d153fa4740e16d Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Tue, 18 Jul 2023 08:42:17 -0300 Subject: [PATCH 4/7] Adding serde to Proof (#514) Co-authored-by: Pablo Deymonnaz --- crypto/Cargo.toml | 1 + crypto/src/merkle_tree/proof.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/Cargo.toml b/crypto/Cargo.toml index 554628654..8261ee196 100644 --- a/crypto/Cargo.toml +++ b/crypto/Cargo.toml @@ -12,6 +12,7 @@ lambdaworks-math.workspace = true sha3 = "0.10" sha2 = "0.10" thiserror = "1.0.38" +serde = { version = "1.0", features = ["derive"] } [dev-dependencies] criterion = "0.4" diff --git a/crypto/src/merkle_tree/proof.rs b/crypto/src/merkle_tree/proof.rs index 18cecf49e..c2914cf50 100644 --- a/crypto/src/merkle_tree/proof.rs +++ b/crypto/src/merkle_tree/proof.rs @@ -10,7 +10,7 @@ use super::traits::IsMerkleTreeBackend; /// `merkle_path` field, in such a way that, if the merkle tree is of height `n`, the /// `i`-th element of `merkle_path` is the sibling node in the `n - 1 - i`-th check /// when verifying. 
-#[derive(Debug, Clone)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] pub struct Proof { pub merkle_path: Vec, } From 9d3cde6c31c43591cda746f7bbc5b5d41af0af22 Mon Sep 17 00:00:00 2001 From: Pablo Deymonnaz Date: Tue, 18 Jul 2023 12:21:45 -0300 Subject: [PATCH 5/7] Serialize and Deserialize in MontgomeryBackendPrimeField (#515) Co-authored-by: Pablo Deymonnaz --- math/src/field/fields/montgomery_backed_prime_fields.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/math/src/field/fields/montgomery_backed_prime_fields.rs b/math/src/field/fields/montgomery_backed_prime_fields.rs index 53283df58..280b36bd9 100644 --- a/math/src/field/fields/montgomery_backed_prime_fields.rs +++ b/math/src/field/fields/montgomery_backed_prime_fields.rs @@ -19,6 +19,10 @@ pub trait IsModulus: Debug { const MODULUS: U; } +#[cfg_attr( + feature = "lambdaworks-serde", + derive(serde::Serialize, serde::Deserialize) +)] #[derive(Clone, Debug, Hash, Copy)] pub struct MontgomeryBackendPrimeField { phantom: PhantomData, From 48d4b03e74e18303cf277845702506f523eebfd7 Mon Sep 17 00:00:00 2001 From: fmoletta <99273364+fmoletta@users.noreply.github.com> Date: Fri, 21 Jul 2023 20:20:49 +0300 Subject: [PATCH 6/7] feat: Implement `UnsignedInteger::from_dec_str` (#497) * Implement from_dec_str for UnsignedInteger * Add tests for U256 * Add tests for U384 * Add tests for error cases --- math/src/errors.rs | 1 + math/src/unsigned_integer/element.rs | 173 +++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/math/src/errors.rs b/math/src/errors.rs index 5e840018b..84261a9ac 100644 --- a/math/src/errors.rs +++ b/math/src/errors.rs @@ -7,6 +7,7 @@ pub enum ByteConversionError { #[derive(Debug, PartialEq, Eq)] pub enum CreationError { InvalidHexString, + InvalidDecString, } #[derive(Debug, PartialEq, Eq)] diff --git a/math/src/unsigned_integer/element.rs b/math/src/unsigned_integer/element.rs index 66631f2e3..20e909429 100644 --- a/math/src/unsigned_integer/element.rs +++ b/math/src/unsigned_integer/element.rs @@ -832,6 +832,22 @@ impl UnsignedInteger { quo = Self::ct_select(&Self::from_u64(0), &quo, is_some); (quo, rem) } + + /// Convert from a decimal string. 
+ pub fn from_dec_str(value: &str) -> Result { + if value.is_empty() { + return Err(CreationError::InvalidDecString); + } + let mut res = Self::from_u64(0); + for b in value.bytes().map(|b| b.wrapping_sub(b'0')) { + if b > 9 { + return Err(CreationError::InvalidDecString); + } + let r = res * Self::from(10_u64) + Self::from(b as u64); + res = r; + } + Ok(res) + } } impl IsUnsignedInteger for UnsignedInteger {} @@ -1145,6 +1161,85 @@ mod tests_u384 { ); } + #[test] + fn construct_new_integer_from_dec_1() { + let a = U384::from_dec_str("1").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 0, 0, 1]); + } + + #[test] + fn construct_new_integer_from_dec_2() { + let a = U384::from_dec_str("15").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 0, 0, 15]); + } + + #[test] + fn construct_new_integer_from_dec_3() { + let a = U384::from_dec_str("18446744073709551616").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 0, 1, 0]); + } + + #[test] + fn construct_new_integer_from_dec_4() { + let a = U384::from_dec_str("184467440737095516160").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 0, 10, 0]); + } + + #[test] + fn construct_new_integer_from_dec_5() { + let a = U384::from_dec_str("4722366482869645213695").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 0, 255, u64::MAX]); + } + + #[test] + fn construct_new_integer_from_dec_6() { + let a = U384::from_dec_str("1110408632367155513346836").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 0, 60195, 6872850209053821716]); + } + + #[test] + fn construct_new_integer_from_dec_7() { + let a = + U384::from_dec_str("66092860629991288370279803883558073888453977263446474418").unwrap(); + assert_eq!( + a.limbs, + [ + 0, + 0, + 0, + 194229460750598834, + 4171047363999149894, + 6975114134393503410 + ] + ); + } + + #[test] + fn construct_new_integer_from_dec_8() { + let a = U384::from_dec_str("3087491467896943881295768554872271030441880044814691421073017731442549147034464936390742057449079000462340371991316").unwrap(); + assert_eq!( + a.limbs, + [ + 1445463580056702870, + 13122285128622708909, + 3107671372009581347, + 11396525602857743462, + 921361708038744867, + 6872850209053821716 + ] + ); + } + + #[test] + fn construct_new_integer_from_dec_empty() { + assert!(U384::from_dec_str("").is_err()); + } + + #[test] + fn construct_new_integer_from_dec_invalid() { + assert!(U384::from_dec_str("0xff").is_err()); + } + #[test] fn equality_works_1() { let a = U384::from_hex_unchecked("1"); @@ -2054,6 +2149,84 @@ mod tests_u256 { ); } + #[test] + fn construct_new_integer_from_dec_1() { + let a = U256::from_dec_str("1").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 1]); + } + + #[test] + fn construct_new_integer_from_dec_2() { + let a = U256::from_dec_str("15").unwrap(); + assert_eq!(a.limbs, [0, 0, 0, 15]); + } + + #[test] + fn construct_new_integer_from_dec_3() { + let a = U256::from_dec_str("18446744073709551616").unwrap(); + assert_eq!(a.limbs, [0, 0, 1, 0]); + } + + #[test] + fn construct_new_integer_from_dec_4() { + let a = U256::from_dec_str("184467440737095516160").unwrap(); + assert_eq!(a.limbs, [0, 0, 10, 0]); + } + + #[test] + fn construct_new_integer_from_dec_5() { + let a = U256::from_dec_str("4722366482869645213695").unwrap(); + assert_eq!(a.limbs, [0, 0, 255, u64::MAX]); + } + + #[test] + fn construct_new_integer_from_dec_6() { + let a = U256::from_dec_str("1110408632367155513346836").unwrap(); + assert_eq!(a.limbs, [0, 0, 60195, 6872850209053821716]); + } + + #[test] + fn construct_new_integer_from_dec_7() { + let a = + 
U256::from_dec_str("66092860629991288370279803883558073888453977263446474418").unwrap(); + assert_eq!( + a.limbs, + [ + 0, + 194229460750598834, + 4171047363999149894, + 6975114134393503410 + ] + ); + } + + #[test] + fn construct_new_integer_from_dec_8() { + let a = U256::from_dec_str( + "19507169362252850253634654373914901165934018806002526957372506333098895428372", + ) + .unwrap(); + assert_eq!( + a.limbs, + [ + 3107671372009581347, + 11396525602857743462, + 921361708038744867, + 6872850209053821716 + ] + ); + } + + #[test] + fn construct_new_integer_from_dec_empty() { + assert!(U256::from_dec_str("").is_err()); + } + + #[test] + fn construct_new_integer_from_dec_invalid() { + assert!(U256::from_dec_str("0xff").is_err()); + } + #[test] fn equality_works_1() { let a = U256::from_hex_unchecked("1"); From 81d831102b2b6833dd9dcc7e7fad1a0fccc2b7e4 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Mon, 31 Jul 2023 09:30:06 -0300 Subject: [PATCH 7/7] perf: replace asserts by debug_asserts (#516) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runtime assertions to validate inputs were consuming a significant amount of time in benchmarks. Measured on an M1, based on branch `fix_felt_benchmarks` with the `ark-ff` code commented out to reduce noise we observe the following changes: ``` add | lambdaworks time: [12.273 µs 12.278 µs 12.285 µs] change: [-63.107% -63.042% -62.979%] (p = 0.00 < 0.05) Performance has improved. invert | lambdaworks time: [26.858 ms 26.864 ms 26.871 ms] change: [-7.8986% -7.8611% -7.8231%] (p = 0.00 < 0.05) Performance has improved. mul | lambdaworks time: [63.604 µs 63.622 µs 63.645 µs] change: [-0.1615% -0.0957% -0.0332%] (p = 0.00 < 0.05) Change within noise threshold. pow | lambdaworks time: [12.594 ms 12.599 ms 12.604 ms] change: [-0.4536% -0.4009% -0.3481%] (p = 0.00 < 0.05) Change within noise threshold. sqrt | lambdaworks time: [139.76 ms 139.79 ms 139.82 ms] change: [-0.1288% -0.1015% -0.0730%] (p = 0.00 < 0.05) Change within noise threshold. sub | lambdaworks time: [13.518 µs 13.529 µs 13.542 µs] change: [-19.474% -18.102% -17.059%] (p = 0.00 < 0.05) Performance has improved. ``` --- math/src/unsigned_integer/element.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/math/src/unsigned_integer/element.rs b/math/src/unsigned_integer/element.rs index 20e909429..4a79d9f6c 100644 --- a/math/src/unsigned_integer/element.rs +++ b/math/src/unsigned_integer/element.rs @@ -149,7 +149,7 @@ impl Sub<&UnsignedInteger> for &UnsignedInteg fn sub(self, other: &UnsignedInteger) -> UnsignedInteger { let (result, overflow) = UnsignedInteger::sub(self, other); - assert!(!overflow, "UnsignedInteger subtraction overflow."); + debug_assert!(!overflow, "UnsignedInteger subtraction overflow."); result } } @@ -804,7 +804,7 @@ impl UnsignedInteger { /// Computes self / rhs, returns the quotient, remainder. pub fn div_rem(&self, rhs: &Self) -> (Self, Self) { - assert!( + debug_assert!( *rhs != UnsignedInteger::from_u64(0), "Attempted to divide by zero" );