diff --git a/.github/workflows/shared.yml b/.github/workflows/shared.yml index aee47884a5..bd8044e580 100644 --- a/.github/workflows/shared.yml +++ b/.github/workflows/shared.yml @@ -113,7 +113,7 @@ jobs: run: | case "${{ matrix.target }}" in "linux-aarch64") - sudo apt-get -y install autoconf-archive + sudo apt-get -y install autoconf-archive pkg-config bazel run //bazel/toolchain:aarch64-linux-musl-gcc ;; "linux-x86_64") diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000000..00bb18361f --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,6 @@ +############################################################################### +# Bazel now uses Bzlmod by default to manage external dependencies. +# Please consider migrating your external dependencies from WORKSPACE to MODULE.bazel. +# +# For more details, please check https://github.com/bazelbuild/bazel/issues/18958 +############################################################################### diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 6661562bee..da0bb64568 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -300,6 +300,15 @@ versioned_http_archive( version = "2.14", ) +versioned_http_archive( + name = "softblas", + build_file = "//bazel/third_party/softblas:softblas.BUILD", + strip_prefix = "SoftBLAS-{version}", + # sha256 = "", + url = "https://github.com/urbit/SoftBLAS/archive/{version}.tar.gz", + version = "cbffb33f19ea02f9ffbd184d445123c57929ec53", +) + versioned_http_archive( name = "softfloat", build_file = "//bazel/third_party/softfloat:softfloat.BUILD", diff --git a/bazel/third_party/softblas/BUILD.bazel b/bazel/third_party/softblas/BUILD.bazel new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bazel/third_party/softblas/softblas.BUILD b/bazel/third_party/softblas/softblas.BUILD new file mode 100644 index 0000000000..34c80c93e6 --- /dev/null +++ b/bazel/third_party/softblas/softblas.BUILD @@ -0,0 +1,117 @@ +# FILEPATH: /home/neal/lagoon/vere/bazel/third_party/softblas/softblas.BUILD + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +cc_library( + name = "softblas", + visibility = ["//visibility:public"], + deps = select({ + "@platforms//cpu:aarch64": [":softblas_aarch64"], + "@platforms//cpu:x86_64": [":softblas_x86_64"], + "//conditions:default": [], + }), +) + +cc_library( + name = "softblas_aarch64", + visibility = ["//visibility:public"], + hdrs = ["include/softblas.h"], + includes = ["include"], + srcs = [ + "include/softblas.h", + "src/softblas_state.c", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c" + ], + deps = ["@softfloat"], +) + +cc_library( + name = "softblas_x86_64", + visibility = ["//visibility:public"], + hdrs = ["include/softblas.h"], + includes = ["include"], + srcs = [ + "include/softblas.h", + "src/softblas_state.c", + "src/blas/level1/sasum.c", + "src/blas/level1/dasum.c", + "src/blas/level1/hasum.c", + "src/blas/level1/qasum.c", + "src/blas/level1/saxpy.c", + "src/blas/level1/daxpy.c", + "src/blas/level1/haxpy.c", + "src/blas/level1/qaxpy.c", + "src/blas/level1/scopy.c", + "src/blas/level1/dcopy.c", + "src/blas/level1/hcopy.c", + "src/blas/level1/qcopy.c", + "src/blas/level1/sdot.c", + "src/blas/level1/ddot.c", + "src/blas/level1/hdot.c", + "src/blas/level1/qdot.c", + "src/blas/level1/snrm2.c", + "src/blas/level1/dnrm2.c", + "src/blas/level1/hnrm2.c", + "src/blas/level1/qnrm2.c", + "src/blas/level1/sscal.c", + "src/blas/level1/dscal.c", + "src/blas/level1/hscal.c", + "src/blas/level1/qscal.c", + "src/blas/level1/sswap.c", + "src/blas/level1/dswap.c", + "src/blas/level1/hswap.c", + "src/blas/level1/qswap.c", + "src/blas/level1/isamax.c", + "src/blas/level1/idamax.c", + "src/blas/level1/ihamax.c", + "src/blas/level1/iqamax.c", + "src/blas/level2/sgemv.c", + "src/blas/level2/dgemv.c", + "src/blas/level2/hgemv.c", + "src/blas/level2/qgemv.c", + "src/blas/level3/sgemm.c", + "src/blas/level3/dgemm.c", + "src/blas/level3/hgemm.c", + "src/blas/level3/qgemm.c" + ], + deps = ["@softfloat"], +) diff --git a/pkg/c3/motes.h b/pkg/c3/motes.h index 975554e94b..8ed8c81389 100644 --- a/pkg/c3/motes.h +++ b/pkg/c3/motes.h @@ -258,6 +258,7 @@ # define c3__corp c3_s4('c','o','r','p') # define c3__corp c3_s4('c','o','r','p') # define c3__cow c3_s3('c','o','w') +# define c3__cplx c3_s3('c','p','l','x') # define c3__cpu c3_s3('c','p','u') # define c3__crad c3_s4('c','r','a','d') # define c3__cram c3_s4('c','r','a','m') @@ -430,6 +431,7 @@ # define c3__fit c3_s3('f','i','t') # define c3__fits c3_s4('f','i','t','s') # define c3__fix c3_s3('f','i','x') +# define c3__fixp c3_s3('f','i','x','p') # define c3__fl c3_s2('f','l') # define c3__flac c3_s4('f','l','a','c') # define c3__flag c3_s4('f','l','a','g') @@ -602,6 +604,7 @@ # define c3__info c3_s4('i','n','f','o') # define c3__init c3_s4('i','n','i','t') # define c3__ins c3_s3('i','n','s') +# define c3__int2 c3_s4('i','n','t','2') # define c3__into c3_s4('i','n','t','o') # define c3__intr c3_s4('i','n','t','r') # define c3__inuk c3_s4('i','n','u','k') @@ -610,6 +613,7 @@ # define c3__is c3_s2('i','s') # define c3__item c3_s4('i','t','e','m') # define c3__ix c3_s2('i','x') +# define c3__i754 c3_s4('i','7','5','4') # define c3__j c3_s1('j') # define c3__jack c3_s4('j','a','c','k') # define c3__jam c3_s3('j','a','m') @@ -971,6 +975,7 @@ # define c3__rasp c3_s4('r','a','s','p') # define c3__raw c3_s3('r','a','w') # define c3__read c3_s4('r','e','a','d') +# define c3__real c3_s4('r','e','a','l') # define c3__reck c3_s4('r','e','c','k') # define c3__reef c3_s4('r','e','e','f') # define c3__resd c3_s4('r','e','s','d') @@ -1232,11 +1237,13 @@ # define c3__ubin c3_s4('u','b','i','n') # define c3__ubit c3_s4('u','b','i','t') # define c3__ud c3_s2('u','d') +# define c3__uint c3_s4('u','i','n','t') # define c3__ulib c3_s4('u','l','i','b') # define c3__un c3_s2('u','n') # define c3__uniq c3_s4('u','n','i','q') # define c3__unix c3_s4('u','n','i','x') # define c3__unt c3_s3('u','n','t') +# define c3__unum c3_s3('u','n','u','m') # define c3__up c3_s2('u','p') # define c3__url c3_s3('u','r','l') # define c3__urth c3_s4('u','r','t','h') diff --git a/pkg/noun/BUILD.bazel b/pkg/noun/BUILD.bazel index fd6963cb82..444bf0be37 100644 --- a/pkg/noun/BUILD.bazel +++ b/pkg/noun/BUILD.bazel @@ -38,6 +38,7 @@ vere_library( "@openssl", "@pdjson", "@sigsegv", + "@softblas", "@softfloat", "@urcrypt", "@zlib", diff --git a/pkg/noun/jets/i/lagoon.c b/pkg/noun/jets/i/lagoon.c new file mode 100644 index 0000000000..13c0f2a138 --- /dev/null +++ b/pkg/noun/jets/i/lagoon.c @@ -0,0 +1,3314 @@ +/// @file + +#include "jets/q.h" +#include "jets/w.h" + +#include "noun.h" +#include "softfloat.h" +#include "softblas.h" + +#include // for pow() +#include + +#define f16_ceil(a) f16_roundToInt( a, softfloat_round_max, false ) +#define f32_ceil(a) f32_roundToInt( a, softfloat_round_max, false ) +#define f64_ceil(a) f64_roundToInt( a, softfloat_round_max, false ) +#define f128M_ceil(a, b) f128M_roundToInt( a, softfloat_round_max, false, b ) + + union half { + float16_t h; + c3_w c; + }; + + union sing { + float32_t s; + c3_w c; + }; + + union doub { + float64_t d; + c3_d c; + }; + + union quad { + float128_t q; + c3_d c[2]; + }; + + // $?(%n %u %d %z %a) + static inline void + _set_rounding(c3_w a) + { + // We could use SoftBLAS set_rounding() to set the SoftFloat + // mode as well, but it's more explicit to do it here since + // we may use SoftFloat in any given Lagoon jet and we want + // you, dear developer, to see it set here. + switch ( a ) + { + default: + u3m_bail(c3__fail); + break; + // %n - near + case c3__n: + softfloat_roundingMode = softfloat_round_near_even; + softblas_roundingMode = 'n'; + break; + // %z - zero + case c3__z: + softfloat_roundingMode = softfloat_round_minMag; + softblas_roundingMode = 'z'; + break; + // %u - up + case c3__u: + softfloat_roundingMode = softfloat_round_max; + softblas_roundingMode = 'u'; + break; + // %d - down + case c3__d: + softfloat_roundingMode = softfloat_round_min; + softblas_roundingMode = 'd'; + break; + // %a - away + case c3__a: + softfloat_roundingMode = softfloat_round_near_maxMag; + softblas_roundingMode = 'a'; + break; + } + } + +/* length of shape = x * y * z * w * ... +*/ + static inline c3_d _get_length(u3_noun shape) + { + c3_d len = 1; + while (u3_nul != shape) { + len = len * u3x_atom(u3h(shape)); + shape = u3t(shape); + } + return len; + } + +/* get dims from shape as array [x y z w ...] +*/ + static inline c3_d* _get_dims(u3_noun shape) + { + u3_atom len = u3qb_lent(shape); + c3_d len_d = u3r_chub(0, len); + c3_d* dims = (c3_d*)u3a_malloc(len_d*sizeof(c3_d)); + for (c3_d i = 0; i < len_d; i++) { + dims[i] = u3r_chub(0, u3x_atom(u3h(shape))); + shape = u3t(shape); + } + u3z(len); + return dims; + } + +/* check consistency of array shape and bloq size + |= =ray + ^- ? + .= (roll shape.meta.ray ^mul) + (dec (met bloq.meta.ray data.ray)) +*/ + static inline c3_o _check(u3_noun ray) + { + // Calculate expected size. + u3_atom shp = u3h(u3h(ray)); // (reported) shape of ray, +4 + u3_atom blq = u3h(u3t(u3h(ray))); // block size of ray, +10 + u3_atom sin = _get_length(shp); // calculated length of ray + + // Calculate actual size. + u3_atom len = u3r_met(blq, u3t(ray)); // length of ray + u3_atom dex = u3qa_dec(len); // decrement length b/c of pinned 1 + + return __(sin == dex); + } + +/* add - axpy = 1*x+y +*/ + u3_noun + u3qi_la_add_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq + ) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; + + case 5: + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; + + case 6: + daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; + + case 7: + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* sub - axpy = -1*y+x +*/ + u3_noun + u3qi_la_sub_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq + ) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; + + case 5: + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; + + case 6: + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; + + case 7: + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + + +/* mul - x.*y + elementwise multiplication +*/ + u3_noun + u3qi_la_mul_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = f16_mul(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = f32_mul(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = f64_mul(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + f128M_mul(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* div - x/y + elementwise division +*/ + u3_noun + u3qi_la_div_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = f16_div(((float16_t*)x_bytes)[i], ((float16_t*)y_bytes)[i]); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = f32_div(((float32_t*)x_bytes)[i], ((float32_t*)y_bytes)[i]); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = f64_div(((float64_t*)x_bytes)[i], ((float64_t*)y_bytes)[i]); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + f128M_div(&(((float128_t*)y_bytes)[i]), &(((float128_t*)x_bytes)[i]), &(((float128_t*)y_bytes)[i])); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* mod - x % y = x - r*floor(x/r) + remainder after division +*/ + u3_noun + u3qi_la_mod_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + // Perform division x/n + float16_t div_result16 = f16_div(x_val16, y_val16); + // Compute floor of the division result + c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); + float16_t floor_float16 = i64_to_f16(floor_result16); + // Multiply n by floor(x/n) + float16_t mult_result16 = f16_mul(y_val16, floor_float16); + // Compute remainder: x - n * floor(x/n) + ((float16_t*)y_bytes)[i] = f16_sub(x_val16, mult_result16); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + // Perform division x/n + float32_t div_result32 = f32_div(x_val32, y_val32); + // Compute floor of the division result + c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); + float32_t floor_float32 = i64_to_f32(floor_result32); + // Multiply n by floor(x/n) + float32_t mult_result32 = f32_mul(y_val32, floor_float32); + // Compute remainder: x - n * floor(x/n) + ((float32_t*)y_bytes)[i] = f32_sub(x_val32, mult_result32); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + // Perform division x/n + float64_t div_result64 = f64_div(x_val64, y_val64); + // Compute floor of the division result + c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); + float64_t floor_float64 = i64_to_f64(floor_result64); + // Multiply n by floor(x/n) + float64_t mult_result64 = f64_mul(y_val64, floor_float64); + // Compute remainder: x - n * floor(x/n) + ((float64_t*)y_bytes)[i] = f64_sub(x_val64, mult_result64); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + // Perform division x/n + float128_t div_result128; + f128M_div((float128_t*)&x_val128, (float128_t*)&y_val128, (float128_t*)&div_result128); + // Compute floor of the division result + c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); + float128_t floor_float128; + i64_to_f128M(floor_result128, &floor_float128); + // Multiply n by floor(x/n) + float128_t mult_result128; + f128M_mul(((float128_t*)&y_val128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); + // Compute remainder: x - n * floor(x/n) + f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)y_bytes)[i])); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* cumsum - x[0] + x[1] + ... x[n] +*/ + u3_noun + u3qi_la_cumsum_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // y_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t sum16[2]; + sum16[0] = (float16_t){SB_REAL16_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + sum16[0] = f16_add(sum16[0], ((float16_t*)x_bytes)[i-1]); + } + sum16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)sum16); + break;} + + case 5: { + float32_t sum32[2]; + sum32[0] = (float32_t){SB_REAL32_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + sum32[0] = f32_add(sum32[0], ((float32_t*)x_bytes)[i-1]); + } + sum32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)sum32); + break;} + + case 6: { + float64_t sum64[2]; + sum64[0] = (float64_t){SB_REAL64_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + sum64[0] = f64_add(sum64[0], ((float64_t*)x_bytes)[i-1]); + } + sum64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)sum64); + break;} + + case 7: { + float128_t sum128[2]; + sum128[0] = (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + for (c3_d i = len_x; i > 0; i--) { + f128M_add(&(sum128[0]), &(((float128_t*)x_bytes)[i-1]), &(sum128[0])); + } + sum128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)sum128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* argmin - argmin(x) +*/ + u3_noun + u3qi_la_argmin_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1, which doesn't matter here) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + c3_w min_idx = 0; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t min_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f16_lt(((float16_t*)x_bytes)[i], min_val16)) { + min_val16 = ((float16_t*)x_bytes)[i]; + min_idx = (len_x - i - 1); + } + } + break;} + + case 5: { + float32_t min_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f32_lt(((float32_t*)x_bytes)[i], min_val32)) { + min_val32 = ((float32_t*)x_bytes)[i]; + min_idx = (len_x - i - 1); + } + } + break;} + + case 6: { + float64_t min_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f64_lt(((float64_t*)x_bytes)[i], min_val64)) { + min_val64 = ((float64_t*)x_bytes)[i]; + min_idx = (len_x - i - 1); + } + } + break;} + + case 7: { + float128_t min_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f128M_lt(&(((float128_t*)x_bytes)[i]), &min_val128)) { + min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); + min_idx = (len_x - i - 1); + } + } + break;} + } + + u3_noun r_data = u3i_chub(min_idx); + + return r_data; + } + +/* argmax - argmax(x) +*/ + u3_noun + u3qi_la_argmax_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1, which doesn't matter here) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + c3_w max_idx = 0; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t max_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f16_gt(((float16_t*)x_bytes)[i], max_val16)) { + max_val16 = ((float16_t*)x_bytes)[i]; + max_idx = (len_x - i - 1); + } + } + break;} + + case 5: { + float32_t max_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f32_gt(((float32_t*)x_bytes)[i], max_val32)) { + max_val32 = ((float32_t*)x_bytes)[i]; + max_idx = (len_x - i - 1); + } + } + break;} + + case 6: { + float64_t max_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f64_gt(((float64_t*)x_bytes)[i], max_val64)) { + max_val64 = ((float64_t*)x_bytes)[i]; + max_idx = (len_x - i - 1); + } + } + break;} + + case 7: { + float128_t max_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + if(f128M_gt(&(((float128_t*)x_bytes)[i]), &max_val128)) { + max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); + max_idx = (len_x - i - 1); + } + } + break;} + } + + u3_noun r_data = u3i_chub(max_idx); + + return r_data; + } + +/* ravel - x -> ~[x[0], x[1], ... x[n]] + entire nd-array busted out as a linear list +*/ + u3_noun + u3qi_la_ravel_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // r_data is the result noun of [data] + u3_noun r_data = u3_nul; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + r_data = u3nc(u3i_word(x_val16.v), r_data); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + r_data = u3nc(u3i_word(x_val32.v), r_data); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + r_data = u3nc(u3i_chub(x_val64.v), r_data); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + r_data = u3nc(u3i_chubs(2, (c3_d*)&(x_val128.v)), r_data); + } + break; + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* min - min(x,y) +*/ + u3_noun + u3qi_la_min_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t min_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val16 = f16_min(min_val16, ((float16_t*)x_bytes)[i]); + } + float16_t r16[2]; + r16[0] = min_val16; + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break;} + + case 5: { + float32_t min_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val32 = f32_min(min_val32, ((float32_t*)x_bytes)[i]); + } + float32_t r32[2]; + r32[0] = min_val32; + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break;} + + case 6: { + float64_t min_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val64 = f64_min(min_val64, ((float64_t*)x_bytes)[i]); + } + float64_t r64[2]; + r64[0] = min_val64; + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break;} + + case 7: { + float128_t min_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + min_val128 = *f128M_min(&min_val128, &((float128_t*)x_bytes)[i]); + } + float128_t r128[2]; + r128[0] = min_val128; + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* max - max(x,y) +*/ + u3_noun + u3qi_la_max_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + u3_noun r_data; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t max_val16 = ((float16_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val16 = f16_max(max_val16, ((float16_t*)x_bytes)[i]); + } + float16_t r16[2]; + r16[0] = max_val16; + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break;} + + case 5: { + float32_t max_val32 = ((float32_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val32 = f32_max(max_val32, ((float32_t*)x_bytes)[i]); + } + float32_t r32[2]; + r32[0] = max_val32; + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break;} + + case 6: { + float64_t max_val64 = ((float64_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val64 = f64_max(max_val64, ((float64_t*)x_bytes)[i]); + } + float64_t r64[2]; + r64[0] = max_val64; + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break;} + + case 7: { + float128_t max_val128 = ((float128_t*)x_bytes)[0]; + for (c3_d i = 0; i < len_x; i++) { + max_val128 = *f128M_max(&max_val128, &((float128_t*)x_bytes)[i]); + } + float128_t r128[2]; + r128[0] = max_val128; + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* abs - |x| +*/ + u3_noun + u3qi_la_abs_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by for range) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)x_bytes)[i] = f16_abs(((float16_t*)x_bytes)[i]); + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)x_bytes)[i] = f32_abs(((float32_t*)x_bytes)[i]); + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)x_bytes)[i] = f64_abs(((float64_t*)x_bytes)[i]); + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + ((float128_t*)x_bytes)[i] = f128_abs(((float128_t*)x_bytes)[i]); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* gth - x > y +*/ + u3_noun + u3qi_la_gth_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_gt(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_gt(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_gt(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_gt(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* gte - x > y +*/ + u3_noun + u3qi_la_gte_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_ge(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_ge(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_ge(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_ge(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* lth - x > y +*/ + u3_noun + u3qi_la_lth_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_lt(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_lt(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_lt(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_lt(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* lte - x > y +*/ + u3_noun + u3qi_la_lte_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + float16_t y_val16 = ((float16_t*)y_bytes)[i]; + ((float16_t*)y_bytes)[i] = f16_le(x_val16, y_val16) ? (float16_t){SB_REAL16_ONE} : (float16_t){SB_REAL16_ZERO}; + } + break; + + case 5: + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + float32_t y_val32 = ((float32_t*)y_bytes)[i]; + ((float32_t*)y_bytes)[i] = f32_le(x_val32, y_val32) ? (float32_t){SB_REAL32_ONE} : (float32_t){SB_REAL32_ZERO}; + } + break; + + case 6: + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + float64_t y_val64 = ((float64_t*)y_bytes)[i]; + ((float64_t*)y_bytes)[i] = f64_le(x_val64, y_val64) ? (float64_t){SB_REAL64_ONE} : (float64_t){SB_REAL64_ZERO}; + } + break; + + case 7: + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + float128_t y_val128 = ((float128_t*)y_bytes)[i]; + ((float128_t*)y_bytes)[i] = f128M_le(((float128_t*)&x_val128), ((float128_t*)&y_val128)) ? (float128_t){SB_REAL128L_ONE, SB_REAL128U_ONE} : (float128_t){SB_REAL128L_ZERO, SB_REAL128U_ZERO}; + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* adds - axpy = 1*x+[n] +*/ + u3_noun + u3qi_la_adds_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. + switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = n16; + } + haxpy(len_x, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = n32; + } + saxpy(len_x, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = n64; + } + daxpy(len_x, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + break; + + case 7: + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; + } + qaxpy(len_x, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + break; + } + + // r_data is the result noun of [data] + y_bytes[syz_x] = 0x1; // pin head + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* subs - axpy = -1*[n]+x +*/ + u3_noun + u3qi_la_subs_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/o leading 0x1) + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + // Switch on the block size. We assume that n fits in the target block size; Hoon typecheck should prevent. + switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float16_t*)y_bytes)[i] = n16; + } + haxpy(len_x, (float16_t){SB_REAL16_NEGONE}, (float16_t*)y_bytes, 1, (float16_t*)x_bytes, 1); + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float32_t*)y_bytes)[i] = n32; + } + saxpy(len_x, (float32_t){SB_REAL32_NEGONE}, (float32_t*)y_bytes, 1, (float32_t*)x_bytes, 1); + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float64_t*)y_bytes)[i] = n64; + } + daxpy(len_x, (float64_t){SB_REAL64_NEGONE}, (float64_t*)y_bytes, 1, (float64_t*)x_bytes, 1); + break; + + case 7: + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + // set y to [n] + for (c3_d i = 0; i < len_x; i++) { + ((float128_t*)y_bytes)[i] = (float128_t){n128.v[0], n128.v[1]}; + } + qaxpy(len_x, (float128_t){SB_REAL128L_NEGONE,SB_REAL128U_NEGONE}, (float128_t*)y_bytes, 1, (float128_t*)x_bytes, 1); + break; + } + + // r_data is the result noun of [data] + x_bytes[syz_x] = 0x1; // pin head + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* muls - ?scal n * x + elementwise multiplication +*/ + u3_noun + u3qi_la_muls_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + x_bytes[syz_x] = 0x1; // pin head + + float16_t n16; + float32_t n32; + float64_t n64; + float128_t n128; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + hscal(len_x, n16, (float16_t*)x_bytes, 1); + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + sscal(len_x, n32, (float32_t*)x_bytes, 1); + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + dscal(len_x, n64, (float64_t*)x_bytes, 1); + break; + + case 7: + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + qscal(len_x, n128, (float128_t*)x_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* divs - ?scal 1/n * x + elementwise division +*/ + u3_noun + u3qi_la_divs_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + x_bytes[syz_x] = 0x1; // pin head + + float16_t in16; + float32_t in32; + float64_t in64; + float128_t in128; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + // XX note that in16 is doing double duty here + u3r_bytes(0, 2, (c3_y*)&(in16.v), n); + in16 = f16_div((float16_t){SB_REAL16_ONE}, in16); + hscal(len_x, in16, (float16_t*)x_bytes, 1); + break; + + case 5: + // XX note that in32 is doing double duty here + u3r_bytes(0, 4, (c3_y*)&(in32.v), n); + in32 = f32_div((float32_t){SB_REAL32_ONE}, in32); + sscal(len_x, in32, (float32_t*)x_bytes, 1); + break; + + case 6: + // XX note that in64 is doing double duty here + u3r_bytes(0, 8, (c3_y*)&(in64.v), n); + in64 = f64_div((float64_t){SB_REAL64_ONE}, in64); + dscal(len_x, in64, (float64_t*)x_bytes, 1); + break; + + case 7: + // XX note that in128 is doing double duty here + u3r_bytes(0, 16, (c3_y*)&(in128.v[0]), n); + f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}), &in128, &in128); + qscal(len_x, in128, (float128_t*)x_bytes, 1); + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* mods - x % [n] = x - r*floor(x/r) + remainder after scalar division +*/ + u3_noun + u3qi_la_mods_i754(u3_noun x_data, + u3_noun n, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + // we reuse it for results for parsimony + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + + float16_t n16, in16; + float32_t n32, in32; + float64_t n64, in64; + float128_t n128, in128; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + u3r_bytes(0, 2, (c3_y*)&(n16.v), n); + in16 = f16_div((float16_t){SB_REAL16_ONE}, n16); + + for (c3_d i = 0; i < len_x; i++) { + float16_t x_val16 = ((float16_t*)x_bytes)[i]; + // Perform division x/n + float16_t div_result16 = f16_mul(in16, x_val16); + // Compute floor of the division result + c3_ds floor_result16 = f16_to_i64(div_result16, softfloat_round_minMag, false); + float16_t floor_float16 = i64_to_f16(floor_result16); + // Multiply n by floor(x/n) + float16_t mult_result16 = f16_mul(n16, floor_float16); + // Compute remainder: x - n * floor(x/n) + ((float16_t*)x_bytes)[i] = f16_sub(x_val16, mult_result16); + } + break; + + case 5: + u3r_bytes(0, 4, (c3_y*)&(n32.v), n); + in32 = f32_div((float32_t){SB_REAL32_ONE}, n32); + + for (c3_d i = 0; i < len_x; i++) { + float32_t x_val32 = ((float32_t*)x_bytes)[i]; + // Perform division x/n + float32_t div_result32 = f32_mul((float32_t)in32, (float32_t)x_val32); + // Compute floor of the division result + c3_ds floor_result32 = f32_to_i64(div_result32, softfloat_round_minMag, false); + float32_t floor_float32 = i64_to_f32(floor_result32); + // Multiply n by floor(x/n) + float32_t mult_result32 = f32_mul(n32, floor_float32); + // Compute remainder: x - n * floor(x/n) + ((float32_t*)x_bytes)[i] = f32_sub(x_val32, mult_result32); + } + break; + + case 6: + u3r_bytes(0, 8, (c3_y*)&(n64.v), n); + in64 = f64_div((float64_t){SB_REAL64_ONE}, n64); + + for (c3_d i = 0; i < len_x; i++) { + float64_t x_val64 = ((float64_t*)x_bytes)[i]; + // Perform division x/n + float64_t div_result64 = f64_mul(in64, x_val64); + // Compute floor of the division result + c3_ds floor_result64 = f64_to_i64(div_result64, softfloat_round_minMag, false); + float64_t floor_float64 = i64_to_f64(floor_result64); + // Multiply n by floor(x/n) + float64_t mult_result64 = f64_mul(n64, floor_float64); + // Compute remainder: x - n * floor(x/n) + ((float64_t*)x_bytes)[i] = f64_sub(x_val64, mult_result64); + } + break; + + case 7: + u3r_bytes(0, 16, (c3_y*)&(n128.v[0]), n); + f128M_div(&((float128_t){SB_REAL128L_ONE,SB_REAL128U_ZERO}), &n128, &in128); + + for (c3_d i = 0; i < len_x; i++) { + float128_t x_val128 = ((float128_t*)x_bytes)[i]; + // Perform division x/n + float128_t div_result128; + f128M_mul((float128_t*)&in128, (float128_t*)&x_val128, (float128_t*)&div_result128); + // Compute floor of the division result + c3_ds floor_result128 = f128M_to_i64(&div_result128, softfloat_round_minMag, false); + float128_t floor_float128; + i64_to_f128M(floor_result128, &floor_float128); + // Multiply n by floor(x/n) + float128_t mult_result128; + f128M_mul(((float128_t*)&n128), ((float128_t*)&floor_float128), ((float128_t*)&mult_result128)); + // Compute remainder: x - n * floor(x/n) + f128M_sub(((float128_t*)&x_val128), ((float128_t*)&mult_result128), &(((float128_t*)x_bytes)[i])); + } + break; + } + + // r_data is the result noun of [data] + u3_noun r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), x_bytes); + + // Clean up and return. + u3a_free(x_bytes); + + return r_data; + } + +/* dot - ?dot = x ยท y +*/ + u3_noun + u3qi_la_dot_i754(u3_noun x_data, + u3_noun y_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(shape); + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // y_bytes is the data array (w/ leading 0x1, skipped by ?axpy) + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, y_bytes, y_data); + + u3_noun r_data; + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: { + float16_t r16[2]; + r16[0] = hdot(len_x, (float16_t*)x_bytes, 1, (float16_t*)y_bytes, 1); + r16[1].v = 0x1; + r_data = u3i_bytes((2+1)*sizeof(c3_y), (c3_y*)r16); + break;} + + case 5: { + float32_t r32[2]; + r32[0] = sdot(len_x, (float32_t*)x_bytes, 1, (float32_t*)y_bytes, 1); + r32[1].v = 0x1; + r_data = u3i_bytes((4+1)*sizeof(c3_y), (c3_y*)r32); + break;} + + case 6: { + float64_t r64[2]; + r64[0] = ddot(len_x, (float64_t*)x_bytes, 1, (float64_t*)y_bytes, 1); + r64[1].v = 0x1; + r_data = u3i_bytes((8+1)*sizeof(c3_y), (c3_y*)r64); + break;} + + case 7: { + float128_t r128[2]; + r128[0] = qdot(len_x, (float128_t*)x_bytes, 1, (float128_t*)y_bytes, 1); + r128[1] = (float128_t){0x1, 0x0}; + r_data = u3i_bytes((16+1)*sizeof(c3_y), (c3_y*)r128); + break;} + } + + // Clean up and return. + u3a_free(x_bytes); + u3a_free(y_bytes); + + return r_data; + } + +/* diag - diag(x) +*/ + u3_noun + u3qi_la_diag(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + // Assert length of dims is 2. + if (u3qb_lent(shape) != 2) { + return u3m_bail(c3__exit); + } + // Unpack shape into an array of dimensions. + c3_d *dims = _get_dims(shape); + if (dims[0] != dims[1]) { + return u3m_bail(c3__exit); + } + + // Unpack the data as a byte array. We assume total length < 2**64. + c3_d len_x = _get_length(shape); + c3_d syz_x = len_x * pow(2, bloq - 3); + c3_d wyd = pow(2, bloq - 3); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + c3_d syz_y = wyd * dims[1]; + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_y+1)*sizeof(c3_y)); + + u3_noun r_data; + + // Grab the index at i*n_x+j in bytes; put it at j. + for (c3_d i = 0; i < dims[1]; i++) { + // Scan across whole field width. + for (c3_y k = 0; k < wyd; k++) { + y_bytes[i*wyd+k] = x_bytes[(i*dims[0]+i)*wyd+k]; + } + } + y_bytes[syz_y] = 0x1; // pin head + + // Unpack the result back into a noun. + r_data = u3i_bytes((syz_y+1)*sizeof(c3_y), y_bytes); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(dims); + + return r_data; + } + +/* transpose - x' +*/ + u3_noun + u3qi_la_transpose(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + // Assert length of dims is 2. + if (u3qb_lent(shape) != 2) { + return u3m_bail(c3__exit); + } + // Unpack shape into an array of dimensions. + c3_d *dims = _get_dims(shape); + + // Unpack the data as a byte array. We assume total length < 2**64. + c3_d len_x = _get_length(shape); + c3_d syz_x = len_x * pow(2, bloq - 3); + c3_d wyd = pow(2, bloq - 3); + c3_y* x_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + u3r_bytes(0, syz_x+1, x_bytes, x_data); + c3_y* y_bytes = (c3_y*)u3a_malloc((syz_x+1)*sizeof(c3_y)); + + u3_noun r_data; + + // Grab the index at i*n_x+j in bytes; put it at j. + for (c3_d i = 0; i < dims[1]; i++) { + for (c3_d j = 0; j < dims[0]; j++) { + // Scan across whole field width. + for (c3_y k = 0; k < wyd; k++) { + y_bytes[(j*dims[1]+i)*wyd+k] = x_bytes[(i*dims[0]+j)*wyd+k]; + } + } + } + y_bytes[syz_x] = 0x1; // pin head + + // Unpack the result back into a noun. + r_data = u3i_bytes((syz_x+1)*sizeof(c3_y), y_bytes); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(dims); + + return r_data; + } + +/* linspace - [a a+(b-a)/n ... b] +*/ + u3_noun + u3qi_la_linspace_i754(u3_noun a, + u3_noun b, + u3_noun n, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + u3_noun r_data; + + switch (u3x_atom(bloq)) { + case 4: { + float16_t a16, b16; + u3r_bytes(0, 2, (c3_y*)&(a16.v), a); + u3r_bytes(0, 2, (c3_y*)&(b16.v), b); + float16_t span16 = f16_sub(b16, a16); + float16_t interval16 = f16_div(span16, i32_to_f16(n-1)); + c3_y* x_bytes16 = (c3_y*)u3a_malloc((n*2+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float16_t*)x_bytes16)[i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + // Assign in reverse order so that n=1 case is correctly left-hand bound. + ((float16_t*)x_bytes16)[n-1] = b16; + ((float16_t*)x_bytes16)[0] = a16; + x_bytes16[n*2] = 0x1; // pin head + r_data = u3i_bytes((n*2+1)*sizeof(c3_y), x_bytes16); + u3a_free(x_bytes16); + break;} + + case 5: { + float32_t a32, b32; + u3r_bytes(0, 4, (c3_y*)&(a32.v), a); + u3r_bytes(0, 4, (c3_y*)&(b32.v), b); + float32_t span32 = f32_sub(b32, a32); + float32_t interval32 = f32_div(span32, i32_to_f32(n-1)); + c3_y* x_bytes32 = (c3_y*)u3a_malloc((n*4+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float32_t*)x_bytes32)[i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[n-1] = b32; + ((float32_t*)x_bytes32)[0] = a32; + x_bytes32[n*4] = 0x1; // pin head + r_data = u3i_bytes((n*4+1)*sizeof(c3_y), x_bytes32); + u3a_free(x_bytes32); + break;} + + case 6: { + float64_t a64, b64; + u3r_bytes(0, 8, (c3_y*)&(a64.v), a); + u3r_bytes(0, 8, (c3_y*)&(b64.v), b); + float64_t span64 = f64_sub(b64, a64); + float64_t interval64 = f64_div(span64, i32_to_f64(n-1)); + c3_y* x_bytes64 = (c3_y*)u3a_malloc((n*8+1)*sizeof(c3_y)); + for (c3_d i = 1; i < n-1; i++) { + ((float64_t*)x_bytes64)[i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + } + ((float64_t*)x_bytes64)[n-1] = b64; + ((float64_t*)x_bytes64)[0] = a64; + x_bytes64[n*8] = 0x1; // pin head + r_data = u3i_bytes((n*8+1)*sizeof(c3_y), x_bytes64); + u3a_free(x_bytes64); + break;} + + case 7: { + float128_t a128, b128; + u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); + u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); + float128_t span128; + f128M_sub(&b128, &a128, &span128); + float128_t interval128; + float128_t n128; + i32_to_f128M(n-1, &n128); + f128M_div(&span128, &n128, &interval128); + c3_y* x_bytes128 = (c3_y*)u3a_malloc((n*16+1)*sizeof(c3_y)); + float128_t i128; + for (c3_d i = 1; i < n-1; i++) { + i32_to_f128M(i, &i128); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[i], &((float128_t*)x_bytes128)[i]); + } + ((float128_t*)x_bytes128)[n-1] = b128; + ((float128_t*)x_bytes128)[0] = a128; + x_bytes128[n*16] = 0x1; // pin head + r_data = u3i_bytes((n*16+1)*sizeof(c3_y), x_bytes128); + u3a_free(x_bytes128); + break;} + } + + return r_data; + } + +/* range - [a a+d ... b] +*/ + u3_noun + u3qi_la_range_i754(u3_noun a, + u3_noun b, + u3_noun d, + u3_noun bloq) + { + // Fence on valid bloq size. + if (bloq < 4 || bloq > 7) { + return u3_none; + } + + u3_noun r_data; + + switch (u3x_atom(bloq)) { + case 4: { + float16_t a16, b16, interval16; + u3r_bytes(0, 2, (c3_y*)&(a16.v), a); + u3r_bytes(0, 2, (c3_y*)&(b16.v), b); + u3r_bytes(0, 2, (c3_y*)&(interval16.v), d); + c3_d n16 = f16_to_i64(f16_ceil(f16_div(f16_sub(b16, a16), interval16)), softfloat_round_minMag, false); + c3_y* x_bytes16 = (c3_y*)u3a_malloc(((n16+1)*2)*sizeof(c3_y)); + ((float16_t*)x_bytes16)[0] = a16; + for (c3_d i = 1; i < n16; i++) { + ((float16_t*)x_bytes16)[i] = f16_add(a16, f16_mul(i32_to_f16(i), interval16)); + } + ((float16_t*)x_bytes16)[n16].v = 0x1; // pin head + r_data = u3i_bytes(((n16+1)*2)*sizeof(c3_y), x_bytes16); + u3a_free(x_bytes16); + break;} + + case 5: { + float32_t a32, b32, interval32; + u3r_bytes(0, 4, (c3_y*)&(a32.v), a); + u3r_bytes(0, 4, (c3_y*)&(b32.v), b); + u3r_bytes(0, 4, (c3_y*)&(interval32.v), d); + c3_d n32 = f32_to_i64(f32_ceil(f32_div(f32_sub(b32, a32), interval32)), softfloat_round_minMag, false); + c3_y* x_bytes32 = (c3_y*)u3a_malloc(((n32+1)*4)*sizeof(c3_y)); + ((float32_t*)x_bytes32)[0] = a32; + for (c3_d i = 1; i < n32; i++) { + ((float32_t*)x_bytes32)[i] = f32_add(a32, f32_mul(i32_to_f32(i), interval32)); + } + ((float32_t*)x_bytes32)[n32].v = 0x1; // pin head + r_data = u3i_bytes(((n32+1)*4)*sizeof(c3_y), x_bytes32); + u3a_free(x_bytes32); + break;} + + case 6: { + float64_t a64, b64, interval64; + u3r_bytes(0, 8, (c3_y*)&(a64.v), a); + u3r_bytes(0, 8, (c3_y*)&(b64.v), b); + u3r_bytes(0, 8, (c3_y*)&(interval64.v), d); + c3_d n64 = f64_to_i64(f64_ceil(f64_div(f64_sub(b64, a64), interval64)), softfloat_round_minMag, false); + c3_y* x_bytes64 = (c3_y*)u3a_malloc(((n64+1)*8)*sizeof(c3_y)); + ((float64_t*)x_bytes64)[0] = a64; + for (c3_d i = 1; i < n64; i++) { + ((float64_t*)x_bytes64)[i] = f64_add(a64, f64_mul(i32_to_f64(i), interval64)); + } + ((float64_t*)x_bytes64)[n64].v = 0x1; // pin head + r_data = u3i_bytes(((n64+1)*8)*sizeof(c3_y), x_bytes64); + u3a_free(x_bytes64); + break;} + + case 7: { + float128_t a128, b128, interval128; + u3r_bytes(0, 16, (c3_y*)&(a128.v[0]), a); + u3r_bytes(0, 16, (c3_y*)&(b128.v[0]), b); + u3r_bytes(0, 16, (c3_y*)&(interval128.v[0]), d); + float128_t tmp; + f128M_sub(&b128, &a128, &tmp); + f128M_div(&tmp, &interval128, &tmp); + f128M_ceil(&tmp, &tmp); + c3_d n128 = f128M_to_i64(&tmp, softfloat_round_minMag, false); + c3_y* x_bytes128 = (c3_y*)u3a_malloc(((n128+1)*16)*sizeof(c3_y)); + float128_t i128; + ((float128_t*)x_bytes128)[0] = a128; + for (c3_d i = 1; i < n128; i++) { + i32_to_f128M(i, &i128); + f128M_mul(&i128, &interval128, &((float128_t*)x_bytes128)[i]); + f128M_add(&a128, &((float128_t*)x_bytes128)[i], &((float128_t*)x_bytes128)[i]); + } + ((float128_t*)x_bytes128)[n128].v[0] = 0x1; // pin head + ((float128_t*)x_bytes128)[n128].v[1] = 0x0; // pin head + r_data = u3i_bytes(((n128+1)*16)*sizeof(c3_y), x_bytes128); + u3a_free(x_bytes128); + break;} + } + + return r_data; + } + +/* trace - tr(x) +*/ + u3_noun + u3qi_la_trace_i754(u3_noun x_data, + u3_noun shape, + u3_noun bloq) + { + u3_noun d_data = u3qi_la_diag(x_data, shape, bloq); + c3_d len_x0 = _get_dims(shape)[0]; + u3_noun r_data = u3qi_la_dot_i754(d_data, d_data, u3nt(len_x0, 0x1, u3_nul), u3k(bloq)); + return r_data; + } + +/* mmul +*/ + u3_noun + u3qi_la_mmul_i754(u3_noun x_data, + u3_noun y_data, + u3_noun x_shape, + u3_noun y_shape, + u3_noun bloq) + { + // Unpack the data as a byte array. We assume total length < 2**64. + c3_d M = u3x_atom(u3h(x_shape)); + c3_d Na= u3x_atom(u3h(u3t(x_shape))); + c3_d Nb= u3x_atom(u3h(y_shape)); + c3_d P = u3x_atom(u3h(u3t(y_shape))); + + if ((u3_nul != u3t(u3t(x_shape))) || + (u3_nul != u3t(u3t(y_shape))) || + (Na != Nb)) { + return u3m_bail(c3__exit); + } + c3_d N = Na; + + // Unpack the data as a byte array. We assume total length < 2**64. + // len_x is length in base units + c3_d len_x = _get_length(x_shape); // M*N + + // syz_x is length in bytes + c3_d syz_x = len_x * pow(2, bloq-3); // M*N + + // x_bytes is the data array (w/o leading 0x1) + c3_y* x_bytes = (c3_y*)u3a_malloc(syz_x*sizeof(c3_y)); + u3r_bytes(0, syz_x, x_bytes, x_data); + + // len_x is length in base units + c3_d len_y = _get_length(y_shape); // N*P + + // syz_x is length in bytes + c3_d syz_y = len_y * pow(2, bloq-3); // N*P + + // y_bytes is the data array (w/o leading 0x1) + c3_y* y_bytes = (c3_y*)u3a_malloc(syz_y*sizeof(c3_y)); + u3r_bytes(0, syz_y, y_bytes, y_data); + + // len_r is length in base units + c3_d len_r = M*P; // M*P + + // syz_r is length in bytes + c3_d syz_r = len_r * pow(2, bloq-3); // M*P + + // r_bytes is the result array + c3_y* r_bytes = (c3_y*)u3a_malloc((syz_r+1)*sizeof(c3_y)); + r_bytes[syz_r] = 0x1; // pin head + // initialize with 0x0s + for (c3_d i = 0; i < syz_r; i++) { + r_bytes[i] = 0x0; + } + + // Switch on the block size. + switch (u3x_atom(bloq)) { + case 4: + hgemm('N', 'N', M, N, P, (float16_t){SB_REAL16_ONE}, (float16_t*)x_bytes, N, (float16_t*)y_bytes, P, (float16_t){SB_REAL16_ZERO}, (float16_t*)r_bytes, P); + break; + + case 5: + sgemm('N', 'N', M, N, P, (float32_t){SB_REAL32_ONE}, (float32_t*)x_bytes, N, (float32_t*)y_bytes, P, (float32_t){SB_REAL32_ZERO}, (float32_t*)r_bytes, P); + break; + + case 6: + dgemm('N', 'N', M, N, P, (float64_t){SB_REAL64_ONE}, (float64_t*)x_bytes, N, (float64_t*)y_bytes, P, (float64_t){SB_REAL64_ZERO}, (float64_t*)r_bytes, P); + break; + + case 7: + qgemm('N', 'N', M, N, P, (float128_t){SB_REAL128L_ONE,SB_REAL128U_ONE}, (float128_t*)x_bytes, N, (float128_t*)y_bytes, P, (float128_t){SB_REAL128L_ZERO,SB_REAL128U_ZERO}, (float128_t*)r_bytes, P); + break; + } + + // Unpack the result back into a noun. + u3_noun r_data = u3i_bytes(syz_r+1, r_bytes); + u3_noun M_ = u3i_chub(M); + u3_noun P_ = u3i_chub(P); + + u3a_free(x_bytes); + u3a_free(y_bytes); + u3a_free(r_bytes); + + return u3nc(u3nq(u3nt(M_, P_, u3_nul), u3k(bloq), c3__i754, u3_nul), r_data); + } + + u3_noun + u3wi_la_add(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_add_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_sub(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_sub_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_mul(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_mul_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_div(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_div_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_mod(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(rnd) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_mod_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_cumsum(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_cumsum_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nc(0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_argmin(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_argmin_i754(x_data, x_shape, x_bloq); + // bare atom (@ index) + return r_data;} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_ravel(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_ravel_i754(x_data, x_shape, x_bloq); + // (list @) + return r_data;} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_argmax(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_argmax_i754(x_data, x_shape, x_bloq); + // bare atom (@ index) + return r_data;} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_min(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_min_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_max(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(u3nc(x_meta, x_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_max_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_abs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_abs_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_gth(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_gth_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_gte(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_gte_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_lth(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_lth_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_lte(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_lte_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3k(x_meta), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_adds(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_adds_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_subs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_subs_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_muls(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_muls_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_divs(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_divs_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_mods(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, n; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_3, &n, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(n) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_mods_i754(x_data, n, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + + u3_noun + u3wi_la_dot(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3r_sing(x_meta, y_meta) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_dot_i754(x_data, y_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + c3_d len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_transpose(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(cor) + ) + { + return u3m_bail(c3__exit); + } else { + u3_noun r_data = u3qi_la_transpose(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(u3k(u3h(x_shape)), u3k(u3h(u3t(x_shape))), u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + } + } + } + + u3_noun + u3wi_la_linspace(u3_noun cor) + { + u3_noun x_meta, a, b, n, rnd; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_12, &a, + u3x_sam_13, &b, + u3x_sam_7, &n, + 0)) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == u3ud(n) || + (n < 1) // crash on zero size + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_linspace_i754(a, b, n, x_bloq); + if (r_data == u3_none) { return u3_none; } + x_shape = u3nc(u3x_atom(n), u3_nul); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_range(u3_noun cor) + { + u3_noun x_meta, a, b, d, rnd; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_12, &a, + u3x_sam_13, &b, + u3x_sam_7, &d, + 0)) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_range_i754(a, b, d, x_bloq); + if (r_data == u3_none) { return u3_none; } + c3_d a_, b_, d_; + c3_ds n_; + switch (x_bloq) { + case 4: + u3r_bytes(0, 2, (c3_y*)&a_, a); + u3r_bytes(0, 2, (c3_y*)&b_, b); + u3r_bytes(0, 2, (c3_y*)&d_, d); + n_ = f16_to_i64(f16_ceil(f16_div(f16_sub((float16_t){b_}, (float16_t){a_}), (float16_t){d_})), softfloat_round_minMag, false) - 1; + break; + case 5: + u3r_bytes(0, 4, (c3_y*)&a_, a); + u3r_bytes(0, 4, (c3_y*)&b_, b); + u3r_bytes(0, 4, (c3_y*)&d_, d); + n_ = f32_to_i64(f32_ceil(f32_div(f32_sub((float32_t){b_}, (float32_t){a_}), (float32_t){d_})), softfloat_round_minMag, false) - 1; + break; + case 6: + u3r_bytes(0, 8, (c3_y*)&a_, a); + u3r_bytes(0, 8, (c3_y*)&b_, b); + u3r_bytes(0, 8, (c3_y*)&d_, d); + n_ = f64_to_i64(f64_ceil(f64_div(f64_sub((float64_t){b_}, (float64_t){a_}), (float64_t){d_})), softfloat_round_minMag, false) - 1; + break; + case 7: { + c3_d a__[2], b__[2], d__[2]; + u3r_bytes(0, 16, (c3_y*)&a__, a); + u3r_bytes(0, 16, (c3_y*)&b__, b); + u3r_bytes(0, 16, (c3_y*)&d__, d); + float128_t tmp; + f128M_sub((float128_t*)&b__, (float128_t*)&a__, &tmp); + f128M_div(&tmp, (float128_t*)&d__, &tmp); + f128M_ceil(&tmp, &tmp); + n_ = f128M_to_i64(&tmp, softfloat_round_minMag, false) - 1; + break;} + } + u3_noun n = u3i_chub(n_+1); + x_shape = u3nc(u3k(n), u3_nul); + return u3nc(u3nq(u3k(x_shape), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_diag(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + if ( c3n == u3ud(x_bloq) || + c3n == u3ud(x_kind) || + c3n == _check(cor) + ) + { + return u3m_bail(c3__exit); + } else { + u3_noun r_data = u3qi_la_diag(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + c3_d len_x0 = _get_dims(x_shape)[0]; + return u3nc(u3nq(u3nt(len_x0, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data); + } + } + } + + u3_noun + u3wi_la_trace(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_2, &x_meta, + u3x_sam_3, &x_data, + 0) || + c3n == u3ud(x_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail; + if ( c3n == u3r_mean(x_meta, + 2, &x_shape, + 6, &x_bloq, + 14, &x_kind, + 15, &x_tail, + 0) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: { + u3_noun r_data = u3qi_la_trace_i754(x_data, x_shape, x_bloq); + if (r_data == u3_none) { return u3_none; } + return u3nc(u3nq(u3nt(0x1, 0x1, u3_nul), u3k(x_bloq), u3k(x_kind), u3k(x_tail)), r_data);} + + default: + return u3_none; + } + } + } + } + + u3_noun + u3wi_la_mmul(u3_noun cor) + { + // Each argument is a ray, [=meta data=@ux] + u3_noun x_meta, x_data, + y_meta, y_data; + + if ( c3n == u3r_mean(cor, + u3x_sam_4, &x_meta, + u3x_sam_5, &x_data, + u3x_sam_6, &y_meta, + u3x_sam_7, &y_data, + 0) || + c3n == u3ud(x_data) || + c3n == u3ud(y_data) ) + { + return u3m_bail(c3__exit); + } else { + u3_noun x_shape, x_bloq, x_kind, x_tail, + y_shape, + rnd; + x_shape = u3h(x_meta); // 2 + x_bloq = u3h(u3t(x_meta)); // 6 + x_kind = u3h(u3t(u3t(x_meta))); // 14 + x_tail = u3t(u3t(u3t(x_meta))); // 15 + y_shape = u3h(y_meta); // 2 + rnd = u3h(u3t(u3t(u3t(cor)))); // 30 + if ( c3n == _check(u3nc(x_meta, x_data)) || + c3n == _check(u3nc(y_meta, y_data)) + ) + { + return u3m_bail(c3__exit); + } else { + switch (x_kind) { + case c3__i754: + _set_rounding(rnd); + u3_noun r_data = u3qi_la_mmul_i754(x_data, y_data, x_shape, y_shape, x_bloq); + // result is already [meta data] + return r_data; + + default: + return u3_none; + } + } + } + } diff --git a/pkg/noun/jets/q.h b/pkg/noun/jets/q.h index fd7c5981fb..720cd8a2de 100644 --- a/pkg/noun/jets/q.h +++ b/pkg/noun/jets/q.h @@ -245,6 +245,35 @@ u3_noun u3qfp_nepo(u3_noun, u3_noun); u3_noun u3qfp_rake(u3_noun); + u3_noun u3qi_la_add_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_sub_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mul_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_div_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mod_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_adds_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_subs_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_muls_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_divs_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mods_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_dot_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_diag(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_transpose(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_cumsum_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmin_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_argmax_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_ravel_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_min_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_max_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_linspace_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_range_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_abs_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gth_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_gte_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lth_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_lte_i754(u3_noun, u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_trace_i754(u3_noun, u3_noun, u3_noun); + u3_noun u3qi_la_mmul_i754(u3_noun, u3_noun, u3_noun, u3_noun, u3_noun); + # define u3qfu_van_fan 28 # define u3qfu_van_rib 58 # define u3qfu_van_vet 59 diff --git a/pkg/noun/jets/tree.c b/pkg/noun/jets/tree.c index e4fc71643c..f4bb68ae17 100644 --- a/pkg/noun/jets/tree.c +++ b/pkg/noun/jets/tree.c @@ -2126,8 +2126,77 @@ static u3j_core _139_hex_json_d[] = {} }; +/* /lib jets in non core +*/ +static u3j_harm _139_non__lagoon_add_a[] = {{".2", u3wi_la_add}, {}}; +static u3j_harm _139_non__lagoon_sub_a[] = {{".2", u3wi_la_sub}, {}}; +static u3j_harm _139_non__lagoon_mul_a[] = {{".2", u3wi_la_mul}, {}}; +static u3j_harm _139_non__lagoon_div_a[] = {{".2", u3wi_la_div}, {}}; +static u3j_harm _139_non__lagoon_mod_a[] = {{".2", u3wi_la_mod}, {}}; +static u3j_harm _139_non__lagoon_adds_a[] = {{".2", u3wi_la_adds}, {}}; +static u3j_harm _139_non__lagoon_subs_a[] = {{".2", u3wi_la_subs}, {}}; +static u3j_harm _139_non__lagoon_muls_a[] = {{".2", u3wi_la_muls}, {}}; +static u3j_harm _139_non__lagoon_divs_a[] = {{".2", u3wi_la_divs}, {}}; +static u3j_harm _139_non__lagoon_mods_a[] = {{".2", u3wi_la_mods}, {}}; +static u3j_harm _139_non__lagoon_dot_a[] = {{".2", u3wi_la_dot}, {}}; +static u3j_harm _139_non__lagoon_trans_a[] ={{".2", u3wi_la_transpose}, {}}; +static u3j_harm _139_non__lagoon_cumsum_a[]={{".2", u3wi_la_cumsum}, {}}; +static u3j_harm _139_non__lagoon_argmin_a[]={{".2", u3wi_la_argmin}, {}}; +static u3j_harm _139_non__lagoon_argmax_a[]={{".2", u3wi_la_argmax}, {}}; +static u3j_harm _139_non__lagoon_ravel_a[]={{".2", u3wi_la_ravel}, {}}; +static u3j_harm _139_non__lagoon_min_a[] = {{".2", u3wi_la_min}, {}}; +static u3j_harm _139_non__lagoon_max_a[] = {{".2", u3wi_la_max}, {}}; +static u3j_harm _139_non__lagoon_linspace_a[]={{".2", u3wi_la_linspace}, {}}; +static u3j_harm _139_non__lagoon_range_a[]= {{".2", u3wi_la_range}, {}}; +static u3j_harm _139_non__lagoon_abs_a[] = {{".2", u3wi_la_abs}, {}}; +static u3j_harm _139_non__lagoon_gth_a[] = {{".2", u3wi_la_gth}, {}}; +static u3j_harm _139_non__lagoon_gte_a[] = {{".2", u3wi_la_gte}, {}}; +static u3j_harm _139_non__lagoon_lth_a[] = {{".2", u3wi_la_lth}, {}}; +static u3j_harm _139_non__lagoon_lte_a[] = {{".2", u3wi_la_lte}, {}}; +static u3j_harm _139_non__lagoon_diag_a[] = {{".2", u3wi_la_diag}, {}}; +static u3j_harm _139_non__lagoon_trace_a[]= {{".2", u3wi_la_trace}, {}}; +static u3j_harm _139_non__lagoon_mmul_a[] = {{".2", u3wi_la_mmul}, {}}; +static u3j_core _139_non__la_core_d[] = + { { "add-rays", 7, _139_non__lagoon_add_a, 0, no_hashes }, + { "sub-rays", 7, _139_non__lagoon_sub_a, 0, no_hashes }, + { "mul-rays", 7, _139_non__lagoon_mul_a, 0, no_hashes }, + { "div-rays", 7, _139_non__lagoon_div_a, 0, no_hashes }, + { "mod-rays", 7, _139_non__lagoon_mod_a, 0, no_hashes }, + { "add-scal", 7, _139_non__lagoon_adds_a, 0, no_hashes }, + { "sub-scal", 7, _139_non__lagoon_subs_a, 0, no_hashes }, + { "mul-scal", 7, _139_non__lagoon_muls_a, 0, no_hashes }, + { "div-scal", 7, _139_non__lagoon_divs_a, 0, no_hashes }, + { "mod-scal", 7, _139_non__lagoon_mods_a, 0, no_hashes }, + { "dot", 7, _139_non__lagoon_dot_a, 0, no_hashes }, + { "transpose",7, _139_non__lagoon_trans_a, 0, no_hashes }, + { "cumsum", 7, _139_non__lagoon_cumsum_a, 0, no_hashes }, + { "argmin", 7, _139_non__lagoon_argmin_a, 0, no_hashes }, + { "argmax", 7, _139_non__lagoon_argmax_a, 0, no_hashes }, + { "ravel", 7, _139_non__lagoon_ravel_a, 0, no_hashes }, + { "min", 7, _139_non__lagoon_min_a, 0, no_hashes }, + { "max", 7, _139_non__lagoon_max_a, 0, no_hashes }, + { "linspace", 7, _139_non__lagoon_linspace_a, 0, no_hashes }, + { "range", 7, _139_non__lagoon_range_a, 0, no_hashes }, + { "abs", 7, _139_non__lagoon_abs_a, 0, no_hashes }, + { "gth", 7, _139_non__lagoon_gth_a, 0, no_hashes }, + { "gte", 7, _139_non__lagoon_gte_a, 0, no_hashes }, + { "lth", 7, _139_non__lagoon_lth_a, 0, no_hashes }, + { "lte", 7, _139_non__lagoon_lte_a, 0, no_hashes }, + { "diag", 7, _139_non__lagoon_diag_a, 0, no_hashes }, + { "trace", 7, _139_non__lagoon_trace_a,0, no_hashes }, + { "mmul", 7, _139_non__lagoon_mmul_a, 0, no_hashes }, + {} + }; + +static u3j_core _139_non_d[] = + { { "lagoon", 7, 0, _139_non__la_core_d, no_hashes }, + {} + }; + static u3j_core _139_hex_d[] = -{ { "lore", 63, _140_hex_lore_a, 0, no_hashes }, +{ { "non", 7, 0, _139_non_d, no_hashes }, + + { "lore", 63, _140_hex_lore_a, 0, no_hashes }, { "leer", 63, _140_hex_leer_a, 0, no_hashes }, { "loss", 63, _140_hex_loss_a, 0, no_hashes }, { "lune", 127, _140_hex_lune_a, 0, no_hashes }, @@ -2144,6 +2213,7 @@ static u3j_core _139_hex_d[] = { "secp", 6, 0, _140_hex_secp_d, no_hashes }, { "mimes", 31, 0, _140_hex_mimes_d, no_hashes }, { "json", 31, 0, _139_hex_json_d, no_hashes }, + {} }; diff --git a/pkg/noun/jets/w.h b/pkg/noun/jets/w.h index 07e8498344..379723d0a9 100644 --- a/pkg/noun/jets/w.h +++ b/pkg/noun/jets/w.h @@ -337,5 +337,35 @@ u3_noun u3wfu_repo(u3_noun); u3_noun u3wfu_rest(u3_noun); + u3_noun u3wi_la_add(u3_noun); + u3_noun u3wi_la_sub(u3_noun); + u3_noun u3wi_la_mul(u3_noun); + u3_noun u3wi_la_div(u3_noun); + u3_noun u3wi_la_mod(u3_noun); + u3_noun u3wi_la_adds(u3_noun); + u3_noun u3wi_la_subs(u3_noun); + u3_noun u3wi_la_muls(u3_noun); + u3_noun u3wi_la_divs(u3_noun); + u3_noun u3wi_la_mods(u3_noun); + u3_noun u3wi_la_dot(u3_noun); + u3_noun u3wi_la_diag(u3_noun); + u3_noun u3wi_la_transpose(u3_noun); + u3_noun u3wi_la_cumsum(u3_noun); + u3_noun u3wi_la_argmin(u3_noun); + u3_noun u3wi_la_argmax(u3_noun); + u3_noun u3wi_la_ravel(u3_noun); + u3_noun u3wi_la_min(u3_noun); + u3_noun u3wi_la_max(u3_noun); + u3_noun u3wi_la_linspace(u3_noun); + u3_noun u3wi_la_range(u3_noun); + u3_noun u3wi_la_abs(u3_noun); + u3_noun u3wi_la_gth(u3_noun); + u3_noun u3wi_la_gte(u3_noun); + u3_noun u3wi_la_lth(u3_noun); + u3_noun u3wi_la_lte(u3_noun); + + u3_noun u3wi_la_trace(u3_noun); + u3_noun u3wi_la_mmul(u3_noun); + #endif /* ifndef U3_JETS_W_H */