diff --git a/js/bin/integration.js b/js/bin/integration.js index fe32433d3845a..2aeb14d0e3425 100755 --- a/js/bin/integration.js +++ b/js/bin/integration.js @@ -17,6 +17,8 @@ // specific language governing permissions and limitations // under the License. +var fs = require('fs'); +var glob = require('glob'); var path = require('path'); var gulp = require.resolve(path.join(`..`, `node_modules/gulp/bin/gulp.js`)); var child_process = require(`child_process`); @@ -29,12 +31,14 @@ var optionList = [ { type: String, name: 'arrow', alias: 'a', - description: 'The Arrow file to read/write' + multiple: true, defaultValue: [], + description: 'The Arrow file[s] to read/write' }, { type: String, name: 'json', alias: 'j', - description: 'The JSON file to read/write' + multiple: true, defaultValue: [], + description: 'The JSON file[s] to read/write' } ]; @@ -66,20 +70,60 @@ function print_usage() { process.exit(1); } -if (!argv.arrow || !argv.json || !argv.mode) { +let jsonPaths = argv.json; +let arrowPaths = argv.arrow; + +if (!argv.mode) { + return print_usage(); +} + +let mode = argv.mode.toUpperCase(); +if (mode === 'VALIDATE' && !jsonPaths.length) { + jsonPaths = glob.sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)); + if (!arrowPaths.length) { + [jsonPaths, arrowPaths] = jsonPaths.reduce(([jsonPaths, arrowPaths], jsonPath) => { + const { name } = path.parse(jsonPath); + for (const source of ['cpp', 'java']) { + for (const format of ['file', 'stream']) { + const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); + if (fs.existsSync(arrowPath)) { + jsonPaths.push(jsonPath); + arrowPaths.push(arrowPath); + console.log('-j', jsonPath, '-a', arrowPath, '\\'); + } + } + } + return [jsonPaths, arrowPaths]; + }, [[], []]); + } +} else if (!jsonPaths.length) { return print_usage(); } -switch (argv.mode.toUpperCase()) { +switch (mode) { case 'VALIDATE': + const args = [`test`, `-i`].concat(argv._unknown || []); + 
jsonPaths.forEach((p, i) => { + args.push('-j', p, '-a', arrowPaths[i]); + }); child_process.spawnSync( - gulp, - [`test`, `-i`].concat(process.argv.slice(2)), + gulp, args, { cwd: path.resolve(__dirname, '..'), stdio: ['ignore', 'inherit', 'inherit'] } ); + // for (let i = -1, n = jsonPaths.length; ++i < n;) { + // const jsonPath = jsonPaths[i]; + // const arrowPath = arrowPaths[i]; + // child_process.spawnSync( + // gulp, args.concat(['-j', jsonPath, '-a', arrowPath]), + // { + // cwd: path.resolve(__dirname, '..'), + // stdio: ['ignore', 'inherit', 'inherit'] + // } + // ); + // } break; default: print_usage(); diff --git a/js/gulp/argv.js b/js/gulp/argv.js index 6f80912e97e52..8a83820c1fe59 100644 --- a/js/gulp/argv.js +++ b/js/gulp/argv.js @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. +const fs = require('fs'); +const glob = require('glob'); +const path = require('path'); + const argv = require(`command-line-args`)([ { name: `all`, type: Boolean }, { name: 'update', alias: 'u', type: Boolean }, @@ -22,13 +26,11 @@ const argv = require(`command-line-args`)([ { name: `target`, type: String, defaultValue: `` }, { name: `module`, type: String, defaultValue: `` }, { name: `coverage`, type: Boolean, defaultValue: false }, - { name: `json_file`, alias: `j`, type: String, defaultValue: null }, - { name: `arrow_file`, alias: `a`, type: String, defaultValue: null }, { name: `integration`, alias: `i`, type: Boolean, defaultValue: false }, { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] }, - { name: `sources`, alias: `s`, type: String, multiple: true, defaultValue: [`cpp`, `java`] }, - { name: `formats`, alias: `f`, type: String, multiple: true, defaultValue: [`file`, `stream`] }, + { name: `json_files`, alias: `j`, type: String, multiple: true, defaultValue: [] }, + { name: `arrow_files`, alias: `a`, type: 
String, multiple: true, defaultValue: [] }, ], { partial: true }); const { targets, modules } = argv; @@ -38,4 +40,25 @@ argv.module && !modules.length && modules.push(argv.module); (argv.all || !targets.length) && targets.push(`all`); (argv.all || !modules.length) && modules.push(`all`); +if (argv.coverage && (!argv.json_files || !argv.json_files.length)) { + + let [jsonPaths, arrowPaths] = glob + .sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)) + .reduce((paths, jsonPath) => { + const { name } = path.parse(jsonPath); + const [jsonPaths, arrowPaths] = paths; + ['cpp', 'java'].forEach((source) => ['file', 'stream'].forEach((format) => { + const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); + if (fs.existsSync(arrowPath)) { + jsonPaths.push(jsonPath); + arrowPaths.push(arrowPath); + } + })); + return paths; + }, [[], []]); + + argv.json_files = jsonPaths; + argv.arrow_files = arrowPaths; +} + module.exports = { argv, targets, modules }; diff --git a/js/gulp/closure-task.js b/js/gulp/closure-task.js index 1bd872fd3044a..0b2ef1b846b81 100644 --- a/js/gulp/closure-task.js +++ b/js/gulp/closure-task.js @@ -36,7 +36,7 @@ const closureTask = ((cache) => memoizeTask(cache, function closure(target, form const src = targetDir(target, `cls`); const out = targetDir(target, format); const entry = path.join(src, mainExport); - const externs = path.join(src, `${mainExport}.externs`); + const externs = path.join(`src/Arrow.externs.js`); return observableFromStreams( gulp.src([ /* external libs first --> */ `node_modules/tslib/package.json`, @@ -46,7 +46,6 @@ const closureTask = ((cache) => memoizeTask(cache, function closure(target, form `node_modules/text-encoding-utf-8/package.json`, `node_modules/text-encoding-utf-8/src/encoding.js`, /* then sources globs --> */ `${src}/**/*.js`, -/* and exclusions last --> */ `!${src}/Arrow.externs.js`, ], { base: `./` }), sourcemaps.init(), closureCompiler(createClosureArgs(entry, 
externs)), @@ -60,14 +59,15 @@ const closureTask = ((cache) => memoizeTask(cache, function closure(target, form }))({}); const createClosureArgs = (entry, externs) => ({ + externs, third_party: true, warning_level: `QUIET`, dependency_mode: `STRICT`, rewrite_polyfills: false, - externs: `${externs}.js`, entry_point: `${entry}.js`, module_resolution: `NODE`, - // formatting: `PRETTY_PRINT`, debug: true, + // formatting: `PRETTY_PRINT`, + // debug: true, compilation_level: `ADVANCED`, allow_method_call_decomposing: true, package_json_entry_names: `module,jsnext:main,main`, diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js index 2976d0ad45d09..c42b3fc323321 100644 --- a/js/gulp/package-task.js +++ b/js/gulp/package-task.js @@ -45,10 +45,11 @@ const createMainPackageJson = (target, format) => (orig) => ({ ...createTypeScriptPackageJson(target, format)(orig), name: npmPkgName, main: mainExport, + types: `${mainExport}.d.ts`, module: `${mainExport}.mjs`, dist: `${mainExport}.es5.min.js`, [`dist:es2015`]: `${mainExport}.es2015.min.js`, - [`@std/esm`]: { esm: `mjs` } + [`@std/esm`]: { esm: `mjs`, warnings: false, sourceMap: true } }); const createTypeScriptPackageJson = (target, format) => (orig) => ({ @@ -63,18 +64,20 @@ const createTypeScriptPackageJson = (target, format) => (orig) => ({ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => conditionallyAddStandardESMEntry(target, format)( - packageJSONFields.reduce( - (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), - { name: `${npmOrgName}/${packageName(target, format)}`, - version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, - dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined } - ) + packageJSONFields.reduce( + (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), + { + name: `${npmOrgName}/${packageName(target, format)}`, + version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, + dist: undefined, 
[`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined + } + ) ) ); const conditionallyAddStandardESMEntry = (target, format) => (packageJSON) => ( - format !== `esm` - ? packageJSON - : { ...packageJSON, [`@std/esm`]: { esm: `js` } } + format !== `esm` && format !== `cls` + ? packageJSON + : { ...packageJSON, [`@std/esm`]: { esm: `js`, warnings: false, sourceMap: true } } ); \ No newline at end of file diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js index ab280b092635c..7f655548eb8ef 100644 --- a/js/gulp/test-task.js +++ b/js/gulp/test-task.js @@ -44,15 +44,15 @@ const testOptions = { const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format, debug = false) { const opts = { ...testOptions }; const args = !debug ? [...execArgv] : [...debugArgv, ...execArgv]; - args.push(`test/${argv.integration ? `integration/*` : `unit/*`}`); + if (!argv.coverage) { + args.push(`test/${argv.integration ? `integration/*` : `unit/*`}`); + } opts.env = { ...opts.env, TEST_TARGET: target, TEST_MODULE: format, - JSON_PATH: argv.json_file, - ARROW_PATH: argv.arrow_file, TEST_TS_SOURCE: !!argv.coverage, - TEST_SOURCES: JSON.stringify(Array.isArray(argv.sources) ? argv.sources : [argv.sources]), - TEST_FORMATS: JSON.stringify(Array.isArray(argv.formats) ? argv.formats : [argv.formats]), + JSON_PATHS: JSON.stringify(Array.isArray(argv.json_files) ? argv.json_files : [argv.json_files]), + ARROW_PATHS: JSON.stringify(Array.isArray(argv.arrow_files) ? argv.arrow_files : [argv.arrow_files]), }; return !debug ? 
child_process.spawn(jest, args, opts) : diff --git a/js/gulp/typescript-task.js b/js/gulp/typescript-task.js index 8b755cf7f1624..c42357adb2f75 100644 --- a/js/gulp/typescript-task.js +++ b/js/gulp/typescript-task.js @@ -34,7 +34,7 @@ const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target const tsProject = ts.createProject(path.join(`tsconfig`, tsconfigFile), { typescript: require(`typescript`) }); const { stream: { js, dts } } = observableFromStreams( tsProject.src(), sourcemaps.init(), - tsProject(ts.reporter.fullReporter(true)) + tsProject(ts.reporter.defaultReporter()) ); const writeDTypes = observableFromStreams(dts, gulp.dest(out)); const writeJS = observableFromStreams(js, sourcemaps.write(), gulp.dest(out)); @@ -52,12 +52,12 @@ function maybeCopyRawJSArrowFormatFiles(target, format) { return Observable.empty(); } return Observable.defer(async () => { - const outFormatDir = path.join(targetDir(target, format), `format`, `fb`); + const outFormatDir = path.join(targetDir(target, format), `fb`); await del(path.join(outFormatDir, '*.js')); await observableFromStreams( - gulp.src(path.join(`src`, `format`, `fb`, `*_generated.js`)), + gulp.src(path.join(`src`, `fb`, `*_generated.js`)), gulpRename((p) => { p.basename = p.basename.replace(`_generated`, ``); }), gulp.dest(outFormatDir) ).toPromise(); }); -} \ No newline at end of file +} diff --git a/js/gulp/uglify-task.js b/js/gulp/uglify-task.js index 5c605cb7882bd..9ba3e41a16f41 100644 --- a/js/gulp/uglify-task.js +++ b/js/gulp/uglify-task.js @@ -29,7 +29,7 @@ const webpack = require(`webpack`); const { memoizeTask } = require('./memoize-task'); const { Observable, ReplaySubject } = require('rxjs'); const UglifyJSPlugin = require(`uglifyjs-webpack-plugin`); -const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js` }); +const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js`, warnings: false }); const uglifyTask = ((cache, commonConfig) => memoizeTask(cache, 
function uglifyJS(target, format) { @@ -84,11 +84,20 @@ module.exports = uglifyTask; module.exports.uglifyTask = uglifyTask; const reservePublicNames = ((ESKeywords) => function reservePublicNames(target, format) { - const publicModulePath = `../${targetDir(target, format)}/${mainExport}.js`; - return [ - ...ESKeywords, - ...reserveExportedNames(esmRequire(publicModulePath)) + const src = targetDir(target, format); + const publicModulePaths = [ + `../${src}/data.js`, + `../${src}/type.js`, + `../${src}/table.js`, + `../${src}/vector.js`, + `../${src}/util/int.js`, + `../${src}/predicate.js`, + `../${src}/recordbatch.js`, + `../${src}/${mainExport}.js`, ]; + return publicModulePaths.reduce((keywords, publicModulePath) => [ + ...keywords, ...reserveExportedNames(esmRequire(publicModulePath, { warnings: false })) + ], [...ESKeywords]); })(ESKeywords); // Reflect on the Arrow modules to come up with a list of keys to save from Uglify's @@ -104,8 +113,8 @@ const reserveExportedNames = (entryModule) => ( .map((name) => [name, entryModule[name]]) .reduce((reserved, [name, value]) => { const fn = function() {}; - const ownKeys = value && Object.getOwnPropertyNames(value) || []; - const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype) || []; + const ownKeys = value && typeof value === 'object' && Object.getOwnPropertyNames(value) || []; + const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype || {}) || []; const publicNames = [...ownKeys, ...protoKeys].filter((x) => x !== `default` && x !== `undefined` && !(x in fn)); return [...reserved, name, ...publicNames]; }, [] diff --git a/js/gulp/util.js b/js/gulp/util.js index ba6ebece51bba..f35a447e70830 100644 --- a/js/gulp/util.js +++ b/js/gulp/util.js @@ -87,7 +87,7 @@ const ESKeywords = [ // EventTarget `addListener`, `removeListener`, `addEventListener`, `removeEventListener`, // Arrow properties - `low`, `high`, `data`, `index`, `field`, `validity`, 
`columns`, `fieldNode`, `subarray`, + `low`, `high`, `data`, `index`, `field`, `columns`, 'numCols', 'numRows', `values`, `valueOffsets`, `nullBitmap`, `subarray` ]; function taskName(target, format) { @@ -108,14 +108,13 @@ function targetDir(target, format) { function logAndDie(e) { if (e) { - console.error(e); process.exit(1); } } function observableFromStreams(...streams) { - const pumped = streams.length <= 1 ? streams[0] - : pump(...streams, logAndDie); + if (streams.length <= 0) { return Observable.empty(); } + const pumped = streams.length <= 1 ? streams[0] : pump(...streams, logAndDie); const fromEvent = Observable.fromEvent.bind(null, pumped); const streamObs = fromEvent(`data`) .merge(fromEvent(`error`).flatMap((e) => Observable.throw(e))) diff --git a/js/package.json b/js/package.json index d68e7a6279e61..1c8b23604ab85 100644 --- a/js/package.json +++ b/js/package.json @@ -3,7 +3,7 @@ "name": "apache-arrow", "description": "Apache Arrow columnar in-memory format", "bin": { - "arrow2csv": "bin/arrow2csv" + "arrow2csv": "bin/arrow2csv.js" }, "scripts": { "lerna": "lerna", @@ -12,6 +12,8 @@ "clean": "gulp clean", "debug": "gulp debug", "perf": "node ./perf/index.js", + "test:integration": "node ./bin/integration.js --mode validate", + "create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow", "release": "./npm-release.sh", "clean:all": "run-p clean clean:testdata", "clean:testdata": "gulp clean:testdata", @@ -51,18 +53,18 @@ ], "dependencies": { "@types/text-encoding-utf-8": "1.0.1", - "command-line-args": "4.0.7", - "command-line-usage": "4.0.2", + "command-line-args": "5.0.1", + "command-line-usage": "4.1.0", "flatbuffers": "trxcllnt/flatbuffers-esm", "json-bignum": "0.0.3", "text-encoding-utf-8": "^1.0.2", - "tslib": "1.8.1" + "tslib": "1.9.0" }, "devDependencies": { - "@std/esm": "0.19.1", + "@std/esm": "0.19.7", "@types/flatbuffers": "1.6.5", - "@types/glob": "5.0.34", - "@types/jest": "22.0.1", + "@types/glob": 
"5.0.35", + "@types/jest": "22.1.0", "@types/node": "9.3.0", "ast-types": "0.10.1", "benchmark": "2.1.4", @@ -77,13 +79,13 @@ "gulp-rename": "1.2.2", "gulp-sourcemaps": "2.6.3", "gulp-transform-js-ast": "1.0.2", - "gulp-typescript": "3.2.3", + "gulp-typescript": "3.2.4", "ix": "2.3.4", - "jest": "22.0.5", + "jest": "22.1.4", "jest-environment-node-debug": "2.0.0", "json": "9.0.6", - "lerna": "2.6.0", - "lint-staged": "6.0.0", + "lerna": "2.7.1", + "lint-staged": "6.0.1", "merge2": "1.2.1", "mkdirp": "0.5.1", "npm-run-all": "4.1.2", @@ -100,6 +102,9 @@ "webpack": "3.10.0", "xml2js": "0.4.19" }, + "@std/esm": { + "warnings": false + }, "lint-staged": { "*.@(ts)": [ "tslint --fix", @@ -126,7 +131,7 @@ "lcov" ], "coveragePathIgnorePatterns": [ - "format\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", + "fb\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", "test\\/.*\\.(ts|tsx|js)$", "/node_modules/" ], @@ -135,7 +140,8 @@ ".(js|jsx)": "./node_modules/babel-jest/build/index.js" }, "transformIgnorePatterns": [ - "/node_modules/", "/(es2015|esnext)\/umd/" + "/node_modules/", + "/(es2015|esnext)/umd/" ], "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$" } diff --git a/js/perf/index.js b/js/perf/index.js index 9eac40e64ac71..42cb6abe29cb7 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,29 +16,41 @@ // under the License. 
// Use the ES5 UMD target as perf baseline -// const { Table, readVectors } = require('../targets/es5/umd'); -// const { Table, readVectors } = require('../targets/es5/cjs'); -const { Table, readVectors } = require('../targets/es2015/umd'); -// const { Table, readVectors } = require('../targets/es2015/cjs'); +// const { predicate, Table, read: readBatches } = require('../targets/es5/umd'); +// const { predicate, Table, read: readBatches } = require('../targets/es5/cjs'); +// const { predicate, Table, read: readBatches } = require('../targets/es2015/umd'); +const { predicate, Table, read: readBatches } = require('../targets/es2015/cjs'); +const { col } = predicate; -const config = require('./config'); const Benchmark = require('benchmark'); const suites = []; -for (let { name, buffers} of config) { - const parseSuite = new Benchmark.Suite(`Parse ${name}`, { async: true }); - const sliceSuite = new Benchmark.Suite(`Slice ${name} vectors`, { async: true }); - const iterateSuite = new Benchmark.Suite(`Iterate ${name} vectors`, { async: true }); - const getByIndexSuite = new Benchmark.Suite(`Get ${name} values by index`, { async: true }); - parseSuite.add(createFromTableTest(name, buffers)); - parseSuite.add(createReadVectorsTest(name, buffers)); - for (const vector of Table.from(buffers).columns) { - sliceSuite.add(createSliceTest(vector)); - iterateSuite.add(createIterateTest(vector)); - getByIndexSuite.add(createGetByIndexTest(vector)); - } - suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); +for (let { name, buffers } of require('./table_config')) { + const parseSuiteName = `Parse "${name}"`; + const sliceSuiteName = `Slice "${name}" vectors`; + const iterateSuiteName = `Iterate "${name}" vectors`; + const getByIndexSuiteName = `Get "${name}" values by index`; + const sliceToArraySuiteName = `Slice toArray "${name}" vectors`; + suites.push(createTestSuite(parseSuiteName, createFromTableTest(name, buffers))); + 
suites.push(createTestSuite(parseSuiteName, createReadBatchesTest(name, buffers))); + const table = Table.from(buffers), schema = table.schema; + suites.push(...schema.fields.map((f, i) => createTestSuite(getByIndexSuiteName, createGetByIndexTest(table.getColumnAt(i), f.name)))); + suites.push(...schema.fields.map((f, i) => createTestSuite(iterateSuiteName, createIterateTest(table.getColumnAt(i), f.name)))); + suites.push(...schema.fields.map((f, i) => createTestSuite(sliceToArraySuiteName, createSliceToArrayTest(table.getColumnAt(i), f.name)))); + suites.push(...schema.fields.map((f, i) => createTestSuite(sliceSuiteName, createSliceTest(table.getColumnAt(i), f.name)))); +} + +for (let {name, buffers, countBys, counts} of require('./table_config')) { + const table = Table.from(buffers); + + const dfCountBySuiteName = `DataFrame Count By "${name}"`; + const dfFilterCountSuiteName = `DataFrame Filter-Scan Count "${name}"`; + const dfDirectCountSuiteName = `DataFrame Direct Count "${name}"`; + + suites.push(...countBys.map((countBy) => createTestSuite(dfCountBySuiteName, createDataFrameCountByTest(table, countBy)))); + suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfFilterCountSuiteName, createDataFrameFilterCountTest(table, col, test, value)))); + suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfDirectCountSuiteName, createDataFrameDirectCountTest(table, col, test, value)))); } console.log('Running apache-arrow performance tests...\n'); @@ -52,7 +64,7 @@ function run() { var str = x.toString(); var meanMsPerOp = Math.round(x.stats.mean * 100000)/100; var sliceOf60FPS = Math.round((meanMsPerOp / (1000/60)) * 100000)/1000; - return `${str} (avg: ${meanMsPerOp}ms, or ${sliceOf60FPS}% of a frame @ 60FPS) ${x.suffix || ''}`; + return `${str}\n avg: ${meanMsPerOp}ms\n ${sliceOf60FPS}% of a frame @ 60FPS ${x.suffix || ''}`; }).join('\n') + '\n'); if (suites.length > 0) { setTimeout(run, 1000); @@ -61,47 +73,60 @@ function run() { 
.run({ async: true }); } +function createTestSuite(name, test) { + return new Benchmark.Suite(name, { async: true }).add(test); +} + function createFromTableTest(name, buffers) { let table; return { async: true, - name: `Table.from`, + name: `Table.from\n`, fn() { table = Table.from(buffers); } }; } -function createReadVectorsTest(name, buffers) { - let vectors; +function createReadBatchesTest(name, buffers) { + let recordBatch; return { async: true, - name: `readVectors`, - fn() { for (vectors of readVectors(buffers)) {} } + name: `readBatches\n`, + fn() { for (recordBatch of readBatches(buffers)) {} } }; } -function createSliceTest(vector) { +function createSliceTest(vector, name) { let xs; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { xs = vector.slice(); } }; } -function createIterateTest(vector) { +function createSliceToArrayTest(vector, name) { + let xs; + return { + async: true, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, + fn() { xs = vector.slice().toArray(); } + }; +} + +function createIterateTest(vector, name) { let value; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { for (value of vector) {} } }; } -function createGetByIndexTest(vector) { +function createGetByIndexTest(vector, name) { let value; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { for (let i = -1, n = vector.length; ++i < n;) { value = vector.get(i); @@ -109,3 +134,80 @@ function createGetByIndexTest(vector) { } }; } + +function createDataFrameDirectCountTest(table, column, test, value) { + let sum, colidx = 
table.schema.fields.findIndex((c)=>c.name === column); + + if (test == 'gteq') { + op = function () { + sum = 0; + let batches = table.batches; + let numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const vector = batch.getChildAt(colidx); + // yield all indices + for (let index = -1; ++index < batch.length;) { + sum += (vector.get(index) >= value); + } + } + } + } else if (test == 'eq') { + op = function() { + sum = 0; + let batches = table.batches; + let numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { + // load batches + const batch = batches[batchIndex]; + const vector = batch.getChildAt(colidx); + // yield all indices + for (let index = -1; ++index < batch.length;) { + sum += (vector.get(index) === value); + } + } + } + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${column}', length: ${table.length}, type: ${table.getColumnAt(colidx).type}, test: ${test}, value: ${value}\n`, + fn: op + }; +} + +function createDataFrameCountByTest(table, column) { + let colidx = table.schema.fields.findIndex((c)=> c.name === column); + + return { + async: true, + name: `name: '${column}', length: ${table.length}, type: ${table.getColumnAt(colidx).type}\n`, + fn() { + table.countBy(column); + } + }; +} + +function createDataFrameFilterCountTest(table, column, test, value) { + let colidx = table.schema.fields.findIndex((c)=> c.name === column); + let df; + + if (test == 'gteq') { + df = table.filter(col(column).gteq(value)); + } else if (test == 'eq') { + df = table.filter(col(column).eq(value)); + } else { + throw new Error(`Unrecognized test "${test}"`); + } + + return { + async: true, + name: `name: '${column}', length: ${table.length}, type: ${table.getColumnAt(colidx).type}, test: ${test}, value: ${value}\n`, + fn() { + df.count(); + } + }; +} diff --git a/js/src/vector/utf8.ts 
b/js/perf/table_config.js similarity index 51% rename from js/src/vector/utf8.ts rename to js/perf/table_config.js index ba875cf333fe7..e3c332c870f38 100644 --- a/js/src/vector/utf8.ts +++ b/js/perf/table_config.js @@ -15,26 +15,34 @@ // specific language governing permissions and limitations // under the License. -import { Vector } from './vector'; -import { VirtualVector } from './virtual'; -import { TextDecoder } from 'text-encoding-utf-8'; +const fs = require('fs'); +const path = require('path'); +const glob = require('glob'); -const decoder = new TextDecoder('utf-8'); +const config = []; +const filenames = glob.sync(path.resolve(__dirname, `../test/data/tables/`, `*.arrow`)); -export class Utf8Vector extends Vector { - readonly values: Vector; - constructor(argv: { values: Vector }) { - super(); - this.values = argv.values; - } - get(index: number) { - const chars = this.getCodePoints(index); - return chars ? decoder.decode(chars) : null; - } - getCodePoints(index: number) { - return this.values.get(index); - } - concat(...vectors: Vector[]): Vector { - return new VirtualVector(Array, this, ...vectors); +countBys = { + "tracks": ['origin', 'destination'] +} +counts = { + "tracks": [ + {col: 'lat', test: 'gteq', value: 0 }, + {col: 'lng', test: 'gteq', value: 0 }, + {col: 'origin', test: 'eq', value: 'Seattle'}, + ] +} + +for (const filename of filenames) { + const { name } = path.parse(filename); + if (name in counts) { + config.push({ + name, + buffers: [fs.readFileSync(filename)], + countBys: countBys[name], + counts: counts[name], + }); } } + +module.exports = config; diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js new file mode 100644 index 0000000000000..438ac8b736cac --- /dev/null +++ b/js/src/Arrow.externs.js @@ -0,0 +1,680 @@ +// @ts-nocheck +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* tslint:disable */ + +/** + * @fileoverview Closure Compiler externs for Arrow + * @externs + * @suppress {duplicate,checkTypes} + */ +/** @type {symbol} */ +Symbol.iterator; +/** @type {symbol} */ +Symbol.asyncIterator; + +var Table = function() {}; +/** @type {?} */ +Table.from = function() {}; +/** @type {?} */ +Table.fromAsync = function() {}; +/** @type {?} */ +Table.empty = function() {}; +/** @type {?} */ +Table.prototype.schema; +/** @type {?} */ +Table.prototype.length; +/** @type {?} */ +Table.prototype.numCols; +/** @type {?} */ +Table.prototype.get; +/** @type {?} */ +Table.prototype.getColumn; +/** @type {?} */ +Table.prototype.getColumnAt; +/** @type {?} */ +Table.prototype.getColumnIndex; +/** @type {?} */ +Table.prototype.toArray; +/** @type {?} */ +Table.prototype.select; +/** @type {?} */ +Table.prototype.rowsToString; +/** @type {?} */ +Table.prototype.batchesUnion; +/** @type {?} */ +Table.prototype.batches; +/** @type {?} */ +Table.prototype.countBy; +/** @type {?} */ +Table.prototype.scan; + +var CountByResult = function() {}; +/** @type {?} */ +CountByResult.prototype.asJSON; + +var col = function () {}; +var lit = function () {}; + +var Value = function() {}; +/** @type {?} */ +Value.prototype.gteq; +/** @type {?} */ +Value.prototype.lteq; +/** 
@type {?} */ +Value.prototype.eq; + +var Col = function() {}; +/** @type {?} */ +Col.prototype.bind; +var Or = function() {}; +var And = function() {}; +var GTeq = function () {}; +/** @type {?} */ +GTeq.prototype.and; +/** @type {?} */ +GTeq.prototype.or; +var LTeq = function () {}; +/** @type {?} */ +LTeq.prototype.and; +/** @type {?} */ +LTeq.prototype.or; +var Equals = function () {}; +/** @type {?} */ +Equals.prototype.and; +/** @type {?} */ +Equals.prototype.or; +var Predicate = function() {}; +/** @type {?} */ +Predicate.prototype.bind; +/** @type {?} */ +Predicate.prototype.and; +/** @type {?} */ +Predicate.prototype.or; +/** @type {?} */ +Predicate.prototype.ands; +var Literal = function() {}; + +var TableToStringIterator = function() {}; +/** @type {?} */ +TableToStringIterator.prototype.pipe; + +var RecordBatch = function() {}; +/** @type {?} */ +RecordBatch.from = function() {}; +/** @type {?} */ +RecordBatch.prototype.numCols; +/** @type {?} */ +RecordBatch.prototype.length; +/** @type {?} */ +RecordBatch.prototype.schema; +/** @type {?} */ +RecordBatch.prototype.columns; +/** @type {?} */ +RecordBatch.prototype.select; + +var Vector = function() {}; +/** @type {?} */ +Vector.create = function() {}; +/** @type {?} */ +Vector.prototype.data; +/** @type {?} */ +Vector.prototype.type; +/** @type {?} */ +Vector.prototype.length; +/** @type {?} */ +Vector.prototype.nullCount; +/** @type {?} */ +Vector.prototype.nullBitmap; +/** @type {?} */ +Vector.prototype.isValid; +/** @type {?} */ +Vector.prototype.get; +/** @type {?} */ +Vector.prototype.set; +/** @type {?} */ +Vector.prototype.toArray; +/** @type {?} */ +Vector.prototype.concat; +/** @type {?} */ +Vector.prototype.slice; +/** @type {?} */ +Vector.prototype.acceptTypeVisitor; + +var BaseInt64 = function() {}; +/** @type {?} */ +BaseInt64.prototype.lessThan; +/** @type {?} */ +BaseInt64.prototype.equals; +/** @type {?} */ +BaseInt64.prototype.greaterThan; +/** @type {?} */ +BaseInt64.prototype.hex; + 
+var Uint64 = function() {}; +/** @type {?} */ +Uint64.add = function() {}; +/** @type {?} */ +Uint64.multiply = function() {}; +/** @type {?} */ +Uint64.prototype.times; +/** @type {?} */ +Uint64.prototype.plus + +var Int64 = function() {}; +/** @type {?} */ +Int64.add = function() {}; +/** @type {?} */ +Int64.multiply = function() {}; +/** @type {?} */ +Int64.fromString = function() {}; +/** @type {?} */ +Int64.prototype.negate +/** @type {?} */ +Int64.prototype.times +/** @type {?} */ +Int64.prototype.plus +/** @type {?} */ +Int64.prototype.lessThan + +var Int128 = function() {}; +/** @type {?} */ +Int128.add = function() {}; +/** @type {?} */ +Int128.multiply = function() {}; +/** @type {?} */ +Int128.fromString = function() {}; +/** @type {?} */ +Int128.prototype.negate +/** @type {?} */ +Int128.prototype.times +/** @type {?} */ +Int128.prototype.plus +/** @type {?} */ +Int128.prototype.hex + +var Type = function() {}; +/** @type {?} */ +Type.NONE = function() {}; +/** @type {?} */ +Type.Null = function() {}; +/** @type {?} */ +Type.Int = function() {}; +/** @type {?} */ +Type.Float = function() {}; +/** @type {?} */ +Type.Binary = function() {}; +/** @type {?} */ +Type.Utf8 = function() {}; +/** @type {?} */ +Type.Bool = function() {}; +/** @type {?} */ +Type.Decimal = function() {}; +/** @type {?} */ +Type.Date = function() {}; +/** @type {?} */ +Type.Time = function() {}; +/** @type {?} */ +Type.Timestamp = function() {}; +/** @type {?} */ +Type.Interval = function() {}; +/** @type {?} */ +Type.List = function() {}; +/** @type {?} */ +Type.Struct = function() {}; +/** @type {?} */ +Type.Union = function() {}; +/** @type {?} */ +Type.FixedSizeBinary = function() {}; +/** @type {?} */ +Type.FixedSizeList = function() {}; +/** @type {?} */ +Type.Map = function() {}; +/** @type {?} */ +Type.Dictionary = function() {}; +/** @type {?} */ +Type.DenseUnion = function() {}; +/** @type {?} */ +Type.SparseUnion = function() {}; + +var DataType = function() {}; +/** 
@type {?} */ +DataType.isNull = function() {}; +/** @type {?} */ +DataType.isInt = function() {}; +/** @type {?} */ +DataType.isFloat = function() {}; +/** @type {?} */ +DataType.isBinary = function() {}; +/** @type {?} */ +DataType.isUtf8 = function() {}; +/** @type {?} */ +DataType.isBool = function() {}; +/** @type {?} */ +DataType.isDecimal = function() {}; +/** @type {?} */ +DataType.isDate = function() {}; +/** @type {?} */ +DataType.isTime = function() {}; +/** @type {?} */ +DataType.isTimestamp = function() {}; +/** @type {?} */ +DataType.isInterval = function() {}; +/** @type {?} */ +DataType.isList = function() {}; +/** @type {?} */ +DataType.isStruct = function() {}; +/** @type {?} */ +DataType.isUnion = function() {}; +/** @type {?} */ +DataType.isDenseUnion = function() {}; +/** @type {?} */ +DataType.isSparseUnion = function() {}; +/** @type {?} */ +DataType.isFixedSizeBinary = function() {}; +/** @type {?} */ +DataType.isFixedSizeList = function() {}; +/** @type {?} */ +DataType.isMap = function() {}; +/** @type {?} */ +DataType.isDictionary = function() {}; +/** @type {?} */ +DataType.prototype.ArrayType; + +var Schema = function() {}; +/** @type {?} */ +Schema.from = function() {}; +/** @type {?} */ +Schema.prototype.fields; +/** @type {?} */ +Schema.prototype.version; +/** @type {?} */ +Schema.prototype.metadata; +/** @type {?} */ +Schema.prototype.dictionaries; +/** @type {?} */ +Schema.prototype.select; +var Field = function() {}; +/** @type {?} */ +Field.prototype.name; +/** @type {?} */ +Field.prototype.type; +/** @type {?} */ +Field.prototype.nullable; +/** @type {?} */ +Field.prototype.metadata; +var Null = function() {}; +var Int8 = function() {}; +var Int16 = function() {}; +var Int32 = function() {}; +var Int64 = function() {}; +var Uint8 = function() {}; +var Uint16 = function() {}; +var Uint32 = function() {}; +var Uint64 = function() {}; +var Float16 = function() {}; +var Float32 = function() {}; +var Float64 = function() {}; +var 
Binary = function() {}; +var Utf8 = function() {}; +var Bool = function() {}; +var Decimal = function() {}; +var Date_ = function() {}; +var Time = function() {}; +var Timestamp = function() {}; +var Interval = function() {}; +var List = function() {}; +var Struct = function() {}; +var Union = function() {}; +var DenseUnion = function() {}; +var SparseUnion = function() {}; +var FixedSizeBinary = function() {}; +var FixedSizeList = function() {}; +var Map_ = function() {}; +var Dictionary = function() {}; + +var BaseData = function() {}; +/** @type {?} */ +BaseData.prototype.type; +/** @type {?} */ +BaseData.prototype.clone; +/** @type {?} */ +BaseData.prototype.slice; +/** @type {?} */ +BaseData.prototype.length; +/** @type {?} */ +BaseData.prototype.offset; +/** @type {?} */ +BaseData.prototype.typeId; +/** @type {?} */ +BaseData.prototype.childData; +/** @type {?} */ +BaseData.prototype.nullBitmap; +/** @type {?} */ +BaseData.prototype.nullCount; + +var BoolData = function() {}; +var NestedData = function() {}; +var SparseUnionData = function() {}; +var ChunkedData = function() {}; + +var FlatData = function() {}; +/** @type {?} */ +FlatData.prototype.values; + +var FlatListData = function() {}; +/** @type {?} */ +FlatListData.prototype.values; +/** @type {?} */ +FlatListData.prototype.valueOffsets; + +var DictionaryData = function() {}; +/** @type {?} */ +DictionaryData.prototype.indicies; +/** @type {?} */ +DictionaryData.prototype.dictionary; + +var ListData = function() {}; +/** @type {?} */ +ListData.prototype.values; +/** @type {?} */ +ListData.prototype.valueOffsets; + +var UnionData = function() {}; +/** @type {?} */ +UnionData.prototype.typeIds; + +var DenseUnionData = function() {}; +/** @type {?} */ +DenseUnionData.prototype.valueOffsets; + +var ChunkedData = function() {}; +/** @type {?} */ +ChunkedData.computeOffsets = function() {}; + +var FlatVector = function() {}; +/** @type {?} */ +FlatVector.prototype.values; +/** @type {?} */ 
+FlatVector.prototype.lows; +/** @type {?} */ +FlatVector.prototype.highs; +/** @type {?} */ +FlatVector.prototype.asInt32; + +var ListVectorBase = function() {}; +/** @type {?} */ +ListVectorBase.prototype.values; +/** @type {?} */ +ListVectorBase.prototype.valueOffsets; +/** @type {?} */ +ListVectorBase.prototype.getValueOffset; +/** @type {?} */ +ListVectorBase.prototype.getValueLength; + +var NestedVector = function() {}; +/** @type {?} */ +NestedVector.prototype.childData; +/** @type {?} */ +NestedVector.prototype.getChildAt; + +var NullVector = function() {}; +var BoolVector = function() {}; +/** @type {?} */ +BoolVector.from = function() {}; +/** @type {?} */ +BoolVector.prototype.values; +var IntVector = function() {}; +/** @type {?} */ +IntVector.from = function() {}; + +var FloatVector = function() {}; +/** @type {?} */ +FloatVector.from = function() {}; + +var DateVector = function() {}; +var DecimalVector = function() {}; +var TimeVector = function() {}; +var TimestampVector = function() {}; +var IntervalVector = function() {}; +var BinaryVector = function() {}; +var FixedSizeBinaryVector = function() {}; +var Utf8Vector = function() {}; +var ListVector = function() {}; +var FixedSizeListVector = function() {}; +var MapVector = function() {}; +var StructVector = function() {}; +var UnionVector = function() {}; + +var DictionaryVector = function() {}; +/** @type {?} */ +DictionaryVector.prototype.getKey; +/** @type {?} */ +DictionaryVector.prototype.getValue; + +var FlatView = function() {}; +/** @type {?} */ +FlatView.prototype.get; +/** @type {?} */ +FlatView.prototype.clone; +/** @type {?} */ +FlatView.prototype.isValid; +/** @type {?} */ +FlatView.prototype.toArray; +/** @type {?} */ +FlatView.prototype.set; + +var PrimitiveView = function() {}; +/** @type {?} */ +PrimitiveView.prototype.size; +/** @type {?} */ +PrimitiveView.prototype.clone; + +var NullView = function() {}; +/** @type {?} */ +NullView.prototype.get; +/** @type {?} */ 
+NullView.prototype.clone; +/** @type {?} */ +NullView.prototype.isValid; +/** @type {?} */ +NullView.prototype.toArray; +/** @type {?} */ +NullView.prototype.set; + +var BoolView = function() {}; +/** @type {?} */ +BoolView.prototype.get; +/** @type {?} */ +BoolView.prototype.clone; +/** @type {?} */ +BoolView.prototype.isValid; +/** @type {?} */ +BoolView.prototype.toArray; +/** @type {?} */ +BoolView.prototype.set; + +var ValidityView = function() {}; +/** @type {?} */ +ValidityView.prototype.get; +/** @type {?} */ +ValidityView.prototype.clone; +/** @type {?} */ +ValidityView.prototype.isValid; +/** @type {?} */ +ValidityView.prototype.toArray; +/** @type {?} */ +ValidityView.prototype.set; + +var DictionaryView = function() {}; +/** @type {?} */ +DictionaryView.prototype.get; +/** @type {?} */ +DictionaryView.prototype.clone; +/** @type {?} */ +DictionaryView.prototype.isValid; +/** @type {?} */ +DictionaryView.prototype.toArray; +/** @type {?} */ +DictionaryView.prototype.set; + +var ListViewBase = function() {}; +/** @type {?} */ +ListViewBase.prototype.get; +/** @type {?} */ +ListViewBase.prototype.clone; +/** @type {?} */ +ListViewBase.prototype.isValid; +/** @type {?} */ +ListViewBase.prototype.toArray; +/** @type {?} */ +ListViewBase.prototype.set; + +var NestedView = function() {}; +/** @type {?} */ +NestedView.prototype.get; +/** @type {?} */ +NestedView.prototype.clone; +/** @type {?} */ +NestedView.prototype.isValid; +/** @type {?} */ +NestedView.prototype.toArray; +/** @type {?} */ +NestedView.prototype.set; + +var ChunkedView = function() {}; +/** @type {?} */ +ChunkedView.prototype.get; +/** @type {?} */ +ChunkedView.prototype.clone; +/** @type {?} */ +ChunkedView.prototype.isValid; +/** @type {?} */ +ChunkedView.prototype.toArray; +/** @type {?} */ +ChunkedView.prototype.set; + +var ListView = function() {}; +var FixedSizeListView = function() {}; +var BinaryView = function() {}; +var Utf8View = function() {}; +var UnionView = function() {}; +var 
DenseUnionView = function() {}; +var StructView = function() {}; +var MapView = function() {}; +var NullView = function() {}; +var FixedSizeView = function() {}; +var Float16View = function() {}; +var DateDayView = function() {}; +var DateMillisecondView = function() {}; +var TimestampDayView = function() {}; +var TimestampSecondView = function() {}; +var TimestampMillisecondView = function() {}; +var TimestampMicrosecondView = function() {}; +var TimestampNanosecondView = function() {}; +var IntervalYearMonthView = function() {}; +var IntervalYearView = function() {}; +var IntervalMonthView = function() {}; + +var TypeVisitor = function() {}; +/** @type {?} */ +TypeVisitor.visitTypeInline = function() {}; +/** @type {?} */ +TypeVisitor.prototype.visit; +/** @type {?} */ +TypeVisitor.prototype.visitMany; +/** @type {?} */ +TypeVisitor.prototype.visitNull; +/** @type {?} */ +TypeVisitor.prototype.visitBool; +/** @type {?} */ +TypeVisitor.prototype.visitInt; +/** @type {?} */ +TypeVisitor.prototype.visitFloat; +/** @type {?} */ +TypeVisitor.prototype.visitUtf8; +/** @type {?} */ +TypeVisitor.prototype.visitBinary; +/** @type {?} */ +TypeVisitor.prototype.visitFixedSizeBinary; +/** @type {?} */ +TypeVisitor.prototype.visitDate; +/** @type {?} */ +TypeVisitor.prototype.visitTimestamp; +/** @type {?} */ +TypeVisitor.prototype.visitTime; +/** @type {?} */ +TypeVisitor.prototype.visitDecimal; +/** @type {?} */ +TypeVisitor.prototype.visitList; +/** @type {?} */ +TypeVisitor.prototype.visitStruct; +/** @type {?} */ +TypeVisitor.prototype.visitUnion; +/** @type {?} */ +TypeVisitor.prototype.visitDictionary; +/** @type {?} */ +TypeVisitor.prototype.visitInterval; +/** @type {?} */ +TypeVisitor.prototype.visitFixedSizeList; +/** @type {?} */ +TypeVisitor.prototype.visitMap; + +var VectorVisitor = function() {}; +/** @type {?} */ +VectorVisitor.visitTypeInline = function() {}; +/** @type {?} */ +VectorVisitor.prototype.visit; +/** @type {?} */ 
+VectorVisitor.prototype.visitMany; +/** @type {?} */ +VectorVisitor.prototype.visitNullVector; +/** @type {?} */ +VectorVisitor.prototype.visitBoolVector; +/** @type {?} */ +VectorVisitor.prototype.visitIntVector; +/** @type {?} */ +VectorVisitor.prototype.visitFloatVector; +/** @type {?} */ +VectorVisitor.prototype.visitUtf8Vector; +/** @type {?} */ +VectorVisitor.prototype.visitBinaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitFixedSizeBinaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitDateVector; +/** @type {?} */ +VectorVisitor.prototype.visitTimestampVector; +/** @type {?} */ +VectorVisitor.prototype.visitTimeVector; +/** @type {?} */ +VectorVisitor.prototype.visitDecimalVector; +/** @type {?} */ +VectorVisitor.prototype.visitListVector; +/** @type {?} */ +VectorVisitor.prototype.visitStructVector; +/** @type {?} */ +VectorVisitor.prototype.visitUnionVector; +/** @type {?} */ +VectorVisitor.prototype.visitDictionaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitIntervalVector; +/** @type {?} */ +VectorVisitor.prototype.visitFixedSizeListVector; +/** @type {?} */ +VectorVisitor.prototype.visitMapVector; \ No newline at end of file diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts deleted file mode 100644 index c23930271183d..0000000000000 --- a/js/src/Arrow.externs.ts +++ /dev/null @@ -1,84 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/** - * @fileoverview Closure Compiler externs for Arrow - * @externs - * @suppress {duplicate,checkTypes} - */ -/** @type {symbol} */ -Symbol.iterator; -/** @type {symbol} */ -Symbol.asyncIterator; - -let RowVector = function() {}; -/** @type {?} */ -RowVector.prototype.toJSON; -/** @type {?} */ -RowVector.prototype.toArray; -/** @type {?} */ -RowVector.prototype.toObject; -/** @type {?} */ -RowVector.prototype.toString; - -let Table = function() {}; -/** @type {?} */ -( Table).from; -/** @type {?} */ -Table.prototype.columns; -/** @type {?} */ -Table.prototype.length; -/** @type {?} */ -Table.prototype.col; -/** @type {?} */ -Table.prototype.key; -/** @type {?} */ -Table.prototype.select; -/** @type {?} */ -Table.prototype.toString; - -let Vector = function() {}; -/** @type {?} */ -Vector.prototype.length; -/** @type {?} */ -Vector.prototype.name; -/** @type {?} */ -Vector.prototype.type; -/** @type {?} */ -Vector.prototype.get; -/** @type {?} */ -Vector.prototype.concat; -/** @type {?} */ -Vector.prototype.slice; -/** @type {?} */ -Vector.prototype.metadata; -/** @type {?} */ -Vector.prototype.nullable; -/** @type {?} */ -Vector.prototype.nullCount; - -let BoolVector = function() {}; -/** @type {?} */ -( BoolVector).pack; -/** @type {?} */ -BoolVector.prototype.set; - -let DictionaryVector = function() {}; -/** @type {?} */ -DictionaryVector.prototype.getKey; -/** @type {?} */ -DictionaryVector.prototype.getValue; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 3a8943434eece..e58aa69865bb3 100644 --- a/js/src/Arrow.ts +++ 
b/js/src/Arrow.ts @@ -15,109 +15,258 @@ // specific language governing permissions and limitations // under the License. -import { Table } from './vector/table'; -import { Vector } from './vector/vector'; -import { Utf8Vector } from './vector/utf8'; -import { DictionaryVector } from './vector/dictionary'; -import { StructVector, StructRow } from './vector/struct'; -import { read, readAsync } from './reader/arrow'; -import { Uint64, Int64, Int128 } from './util/int'; -import { ListVector, BinaryVector, FixedSizeListVector } from './vector/list'; - -import { - BoolVector, - Int8Vector, - Int16Vector, - Int32Vector, - Int64Vector, - Uint8Vector, - Uint16Vector, - Uint32Vector, - Uint64Vector, - Float16Vector, - Float32Vector, - Float64Vector, - Date32Vector, - Date64Vector, - Time32Vector, - Time64Vector, - DecimalVector, - TimestampVector, -} from './vector/numeric'; - -// closure compiler always erases static method names: -// https://github.com/google/closure-compiler/issues/1776 -// set them via string indexers to save them from the mangler -Table['from'] = Table.from; -Table['fromAsync'] = Table.fromAsync; -BoolVector['pack'] = BoolVector.pack; +import * as type_ from './type'; +import * as data_ from './data'; +import * as vector_ from './vector'; +import * as util_ from './util/int'; +import * as visitor_ from './visitor'; +import * as view_ from './vector/view'; +import * as predicate_ from './predicate'; +import { Vector } from './vector'; +import { RecordBatch } from './recordbatch'; +import { Schema, Field, Type } from './type'; +import { Table, DataFrame, NextFunc, BindFunc, CountByResult } from './table'; +import { read, readAsync } from './ipc/reader/arrow'; + +export import View = vector_.View; +export import VectorLike = vector_.VectorLike; +export import TypedArray = type_.TypedArray; +export import IntBitWidth = type_.IntBitWidth; +export import TimeBitWidth = type_.TimeBitWidth; +export import TypedArrayConstructor = type_.TypedArrayConstructor; 
export { read, readAsync }; -export { Table, Vector, StructRow }; -export { Uint64, Int64, Int128 }; -export { NumericVectorConstructor } from './vector/numeric'; -export { List, TypedArray, TypedArrayConstructor } from './vector/types'; -export { - BoolVector, - ListVector, - Utf8Vector, - Int8Vector, - Int16Vector, - Int32Vector, - Int64Vector, - Uint8Vector, - Uint16Vector, - Uint32Vector, - Uint64Vector, - Date32Vector, - Date64Vector, - Time32Vector, - Time64Vector, - BinaryVector, - StructVector, - Float16Vector, - Float32Vector, - Float64Vector, - DecimalVector, - TimestampVector, - DictionaryVector, - FixedSizeListVector, -}; - -/* These exports are needed for the closure umd targets */ +export { Table, DataFrame, NextFunc, BindFunc, CountByResult }; +export { Field, Schema, RecordBatch, Vector, Type }; + +export namespace util { + export import Uint64 = util_.Uint64; + export import Int64 = util_.Int64; + export import Int128 = util_.Int128; +} + +export namespace data { + export import BaseData = data_.BaseData; + export import FlatData = data_.FlatData; + export import BoolData = data_.BoolData; + export import FlatListData = data_.FlatListData; + export import DictionaryData = data_.DictionaryData; + export import NestedData = data_.NestedData; + export import ListData = data_.ListData; + export import UnionData = data_.UnionData; + export import SparseUnionData = data_.SparseUnionData; + export import DenseUnionData = data_.DenseUnionData; + export import ChunkedData = data_.ChunkedData; +} + +export namespace type { + export import Schema = type_.Schema; + export import Field = type_.Field; + export import Null = type_.Null; + export import Int = type_.Int; + export import Int8 = type_.Int8; + export import Int16 = type_.Int16; + export import Int32 = type_.Int32; + export import Int64 = type_.Int64; + export import Uint8 = type_.Uint8; + export import Uint16 = type_.Uint16; + export import Uint32 = type_.Uint32; + export import Uint64 = type_.Uint64; 
+ export import Float = type_.Float; + export import Float16 = type_.Float16; + export import Float32 = type_.Float32; + export import Float64 = type_.Float64; + export import Binary = type_.Binary; + export import Utf8 = type_.Utf8; + export import Bool = type_.Bool; + export import Decimal = type_.Decimal; + export import Date_ = type_.Date_; + export import Time = type_.Time; + export import Timestamp = type_.Timestamp; + export import Interval = type_.Interval; + export import List = type_.List; + export import Struct = type_.Struct; + export import Union = type_.Union; + export import DenseUnion = type_.DenseUnion; + export import SparseUnion = type_.SparseUnion; + export import FixedSizeBinary = type_.FixedSizeBinary; + export import FixedSizeList = type_.FixedSizeList; + export import Map_ = type_.Map_; + export import Dictionary = type_.Dictionary; +} + +export namespace vector { + export import Vector = vector_.Vector; + export import NullVector = vector_.NullVector; + export import BoolVector = vector_.BoolVector; + export import IntVector = vector_.IntVector; + export import FloatVector = vector_.FloatVector; + export import DateVector = vector_.DateVector; + export import DecimalVector = vector_.DecimalVector; + export import TimeVector = vector_.TimeVector; + export import TimestampVector = vector_.TimestampVector; + export import IntervalVector = vector_.IntervalVector; + export import BinaryVector = vector_.BinaryVector; + export import FixedSizeBinaryVector = vector_.FixedSizeBinaryVector; + export import Utf8Vector = vector_.Utf8Vector; + export import ListVector = vector_.ListVector; + export import FixedSizeListVector = vector_.FixedSizeListVector; + export import MapVector = vector_.MapVector; + export import StructVector = vector_.StructVector; + export import UnionVector = vector_.UnionVector; + export import DictionaryVector = vector_.DictionaryVector; +} + +export namespace visitor { + export import TypeVisitor = visitor_.TypeVisitor; + 
export import VectorVisitor = visitor_.VectorVisitor; +} + +export namespace view { + export import ChunkedView = view_.ChunkedView; + export import DictionaryView = view_.DictionaryView; + export import ListView = view_.ListView; + export import FixedSizeListView = view_.FixedSizeListView; + export import BinaryView = view_.BinaryView; + export import Utf8View = view_.Utf8View; + export import UnionView = view_.UnionView; + export import DenseUnionView = view_.DenseUnionView; + export import NestedView = view_.NestedView; + export import StructView = view_.StructView; + export import MapView = view_.MapView; + export import FlatView = view_.FlatView; + export import NullView = view_.NullView; + export import BoolView = view_.BoolView; + export import ValidityView = view_.ValidityView; + export import PrimitiveView = view_.PrimitiveView; + export import FixedSizeView = view_.FixedSizeView; + export import Float16View = view_.Float16View; + export import DateDayView = view_.DateDayView; + export import DateMillisecondView = view_.DateMillisecondView; + export import TimestampDayView = view_.TimestampDayView; + export import TimestampSecondView = view_.TimestampSecondView; + export import TimestampMillisecondView = view_.TimestampMillisecondView; + export import TimestampMicrosecondView = view_.TimestampMicrosecondView; + export import TimestampNanosecondView = view_.TimestampNanosecondView; + export import IntervalYearMonthView = view_.IntervalYearMonthView; + export import IntervalYearView = view_.IntervalYearView; + export import IntervalMonthView = view_.IntervalMonthView; +} + +export namespace predicate { + export import col = predicate_.col; + export import lit = predicate_.lit; + + export import Or = predicate_.Or; + export import Col = predicate_.Col; + export import And = predicate_.And; + export import GTeq = predicate_.GTeq; + export import LTeq = predicate_.LTeq; + export import Value = predicate_.Value; + export import Equals = predicate_.Equals; + 
export import Literal = predicate_.Literal; + export import Predicate = predicate_.Predicate; + + export import PredicateFunc = predicate_.PredicateFunc; +} + +/* These exports are needed for the closure and uglify umd targets */ try { - const Arrow = eval('exports'); - if (typeof Arrow === 'object') { - // string indexers tell closure compiler not to rename these properties + let Arrow: any = eval('exports'); + if (Arrow && typeof Arrow === 'object') { + // string indexers tell closure and uglify not to rename these properties + Arrow['data'] = data; + Arrow['type'] = type; + Arrow['util'] = util; + Arrow['view'] = view; + Arrow['vector'] = vector; + Arrow['visitor'] = visitor; + Arrow['predicate'] = predicate; + Arrow['read'] = read; Arrow['readAsync'] = readAsync; - Arrow['Table'] = Table; + + Arrow['Type'] = Type; + Arrow['Field'] = Field; + Arrow['Schema'] = Schema; Arrow['Vector'] = Vector; - Arrow['StructRow'] = StructRow; - Arrow['BoolVector'] = BoolVector; - Arrow['ListVector'] = ListVector; - Arrow['Utf8Vector'] = Utf8Vector; - Arrow['Int8Vector'] = Int8Vector; - Arrow['Int16Vector'] = Int16Vector; - Arrow['Int32Vector'] = Int32Vector; - Arrow['Int64Vector'] = Int64Vector; - Arrow['Uint8Vector'] = Uint8Vector; - Arrow['Uint16Vector'] = Uint16Vector; - Arrow['Uint32Vector'] = Uint32Vector; - Arrow['Uint64Vector'] = Uint64Vector; - Arrow['Date32Vector'] = Date32Vector; - Arrow['Date64Vector'] = Date64Vector; - Arrow['Time32Vector'] = Time32Vector; - Arrow['Time64Vector'] = Time64Vector; - Arrow['BinaryVector'] = BinaryVector; - Arrow['StructVector'] = StructVector; - Arrow['Float16Vector'] = Float16Vector; - Arrow['Float32Vector'] = Float32Vector; - Arrow['Float64Vector'] = Float64Vector; - Arrow['DecimalVector'] = DecimalVector; - Arrow['TimestampVector'] = TimestampVector; - Arrow['DictionaryVector'] = DictionaryVector; - Arrow['FixedSizeListVector'] = FixedSizeListVector; + Arrow['RecordBatch'] = RecordBatch; + + Arrow['Table'] = Table; + 
Arrow['CountByResult'] = CountByResult; } } catch (e) { /* not the UMD bundle */ } -/* end closure exports */ +/* end umd exports */ + +// closure compiler erases static properties/methods: +// https://github.com/google/closure-compiler/issues/1776 +// set them via string indexers to save them from the mangler +Schema['from'] = Schema.from; +Table['from'] = Table.from; +Table['fromAsync'] = Table.fromAsync; +Table['empty'] = Table.empty; +Vector['create'] = Vector.create; +RecordBatch['from'] = RecordBatch.from; + +util_.Uint64['add'] = util_.Uint64.add; +util_.Uint64['multiply'] = util_.Uint64.multiply; + +util_.Int64['add'] = util_.Int64.add; +util_.Int64['multiply'] = util_.Int64.multiply; +util_.Int64['fromString'] = util_.Int64.fromString; + +util_.Int128['add'] = util_.Int128.add; +util_.Int128['multiply'] = util_.Int128.multiply; +util_.Int128['fromString'] = util_.Int128.fromString; + +data_.ChunkedData['computeOffsets'] = data_.ChunkedData.computeOffsets; + +(type_.Type as any)['NONE'] = type_.Type.NONE; +(type_.Type as any)['Null'] = type_.Type.Null; +(type_.Type as any)['Int'] = type_.Type.Int; +(type_.Type as any)['Float'] = type_.Type.Float; +(type_.Type as any)['Binary'] = type_.Type.Binary; +(type_.Type as any)['Utf8'] = type_.Type.Utf8; +(type_.Type as any)['Bool'] = type_.Type.Bool; +(type_.Type as any)['Decimal'] = type_.Type.Decimal; +(type_.Type as any)['Date'] = type_.Type.Date; +(type_.Type as any)['Time'] = type_.Type.Time; +(type_.Type as any)['Timestamp'] = type_.Type.Timestamp; +(type_.Type as any)['Interval'] = type_.Type.Interval; +(type_.Type as any)['List'] = type_.Type.List; +(type_.Type as any)['Struct'] = type_.Type.Struct; +(type_.Type as any)['Union'] = type_.Type.Union; +(type_.Type as any)['FixedSizeBinary'] = type_.Type.FixedSizeBinary; +(type_.Type as any)['FixedSizeList'] = type_.Type.FixedSizeList; +(type_.Type as any)['Map'] = type_.Type.Map; +(type_.Type as any)['Dictionary'] = type_.Type.Dictionary; +(type_.Type as 
any)['DenseUnion'] = type_.Type.DenseUnion; +(type_.Type as any)['SparseUnion'] = type_.Type.SparseUnion; + +type_.DataType['isNull'] = type_.DataType.isNull; +type_.DataType['isInt'] = type_.DataType.isInt; +type_.DataType['isFloat'] = type_.DataType.isFloat; +type_.DataType['isBinary'] = type_.DataType.isBinary; +type_.DataType['isUtf8'] = type_.DataType.isUtf8; +type_.DataType['isBool'] = type_.DataType.isBool; +type_.DataType['isDecimal'] = type_.DataType.isDecimal; +type_.DataType['isDate'] = type_.DataType.isDate; +type_.DataType['isTime'] = type_.DataType.isTime; +type_.DataType['isTimestamp'] = type_.DataType.isTimestamp; +type_.DataType['isInterval'] = type_.DataType.isInterval; +type_.DataType['isList'] = type_.DataType.isList; +type_.DataType['isStruct'] = type_.DataType.isStruct; +type_.DataType['isUnion'] = type_.DataType.isUnion; +type_.DataType['isDenseUnion'] = type_.DataType.isDenseUnion; +type_.DataType['isSparseUnion'] = type_.DataType.isSparseUnion; +type_.DataType['isFixedSizeBinary'] = type_.DataType.isFixedSizeBinary; +type_.DataType['isFixedSizeList'] = type_.DataType.isFixedSizeList; +type_.DataType['isMap'] = type_.DataType.isMap; +type_.DataType['isDictionary'] = type_.DataType.isDictionary; + +vector_.BoolVector['from'] = vector_.BoolVector.from; +vector_.IntVector['from'] = vector_.IntVector.from; +vector_.FloatVector['from'] = vector_.FloatVector.from; + +visitor_.TypeVisitor['visitTypeInline'] = visitor_.TypeVisitor.visitTypeInline; +visitor_.VectorVisitor['visitTypeInline'] = visitor_.VectorVisitor.visitTypeInline; \ No newline at end of file diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 01ef0b848ce75..ee9561323785b 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -1,4 +1,4 @@ -// #! /usr/bin/env node +#! /usr/bin/env node // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file @@ -19,11 +19,9 @@ /* tslint:disable */ +import * as fs from 'fs'; import * as Arrow from '../Arrow'; -(function() { - -const fs = require('fs'); const { parse } = require('json-bignum'); const optionList = [ { @@ -36,12 +34,13 @@ const optionList = [ { type: String, name: 'file', alias: 'f', + optional: false, multiple: true, description: 'The Arrow file to read' } ]; const argv = require(`command-line-args`)(optionList, { partial: true }); -const files = [argv.file, ...(argv._unknown || [])].filter(Boolean); +const files = [...argv.file, ...(argv._unknown || [])].filter(Boolean); if (!files.length) { console.log(require('command-line-usage')([ @@ -85,51 +84,16 @@ if (!files.length) { } files.forEach((source) => { - let table: any, input = fs.readFileSync(source); + debugger; + let table: Arrow.Table, input = fs.readFileSync(source); try { - table = Arrow.Table.from([input]); + table = Arrow.Table.from(input); } catch (e) { + debugger; table = Arrow.Table.from(parse(input + '')); } if (argv.schema && argv.schema.length) { table = table.select(...argv.schema); } - printTable(table); + table.rowsToString().pipe(process.stdout); }); - -function printTable(table: Arrow.Table) { - let header = [...table.columns.map((_, i) => table.key(i))].map(stringify); - let maxColumnWidths = header.map(x => x.length); - // Pass one to convert to strings and count max column widths - for (let i = -1, n = table.length - 1; ++i < n;) { - let val, - row = [i, ...table.get(i)]; - for (let j = -1, k = row.length; ++j < k; ) { - val = stringify(row[j]); - maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); - } - } - console.log(header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(' | ')); - // Pass two to pad each one to max column width - for (let i = -1, n = table.length; ++i < n; ) { - console.log( - [...table.get(i)] - .map(stringify) - .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) - .join(' | ') - ); - } -} - -function leftPad(str: 
string, fill: string, n: number) { - return (new Array(n + 1).join(fill) + str).slice(-1 * n); -} - -function stringify(x: any) { - return typeof x === 'string' ? `"${x}"` - : Array.isArray(x) ? JSON.stringify(x) - : ArrayBuffer.isView(x) ? `[${x}]` - : `${x}`; -} - -})(); \ No newline at end of file diff --git a/js/src/data.ts b/js/src/data.ts new file mode 100644 index 0000000000000..81d19a3cf63cd --- /dev/null +++ b/js/src/data.ts @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { popcnt_bit_range } from './util/bit'; +import { VectorLike, Vector } from './vector'; +import { VectorType, TypedArray, TypedArrayConstructor, Dictionary } from './type'; +import { Int, Bool, FlatListType, List, FixedSizeList, Struct, Map_ } from './type'; +import { DataType, FlatType, ListType, NestedType, SingleNestedType, DenseUnion, SparseUnion } from './type'; + +export function toTypedArray(ArrayType: TypedArrayConstructor, values?: T | ArrayLike | Iterable | null): T { + if (!ArrayType && ArrayBuffer.isView(values)) { return values; } + return values instanceof ArrayType ? values + : !values || !ArrayBuffer.isView(values) ? 
ArrayType.from(values || []) + : new ArrayType(values.buffer, values.byteOffset, values.byteLength / ArrayType.BYTES_PER_ELEMENT); +} + +export type Data = DataTypes[T['TType']] & BaseData; +export interface DataTypes { +/* [Type.NONE]*/ 0: BaseData; +/* [Type.Null]*/ 1: FlatData; +/* [Type.Int]*/ 2: FlatData; +/* [Type.Float]*/ 3: FlatData; +/* [Type.Binary]*/ 4: FlatListData; +/* [Type.Utf8]*/ 5: FlatListData; +/* [Type.Bool]*/ 6: BoolData; +/* [Type.Decimal]*/ 7: FlatData; +/* [Type.Date]*/ 8: FlatData; +/* [Type.Time]*/ 9: FlatData; +/* [Type.Timestamp]*/ 10: FlatData; +/* [Type.Interval]*/ 11: FlatData; +/* [Type.List]*/ 12: ListData>; +/* [Type.Struct]*/ 13: NestedData; +/* [Type.Union]*/ 14: UnionData; +/* [Type.FixedSizeBinary]*/ 15: FlatData; +/* [Type.FixedSizeList]*/ 16: SingleNestedData>; +/* [Type.Map]*/ 17: NestedData; +/* [Type.DenseUnion]*/ DenseUnion: DenseUnionData; +/*[Type.SparseUnion]*/ SparseUnion: SparseUnionData; +/*[ Type.Dictionary]*/ Dictionary: DictionaryData; +} +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Array::null_count is called the +// first time, the null count will be computed. 
See ARROW-33 +export type kUnknownNullCount = -1; +export const kUnknownNullCount = -1; + +export class BaseData implements VectorLike { + public type: T; + public length: number; + public offset: number; + // @ts-ignore + public childData: Data[]; + protected _nullCount: number | kUnknownNullCount; + protected /* [VectorType.OFFSET]:*/ 0?: Int32Array; + protected /* [VectorType.DATA]:*/ 1?: T['TArray']; + protected /*[VectorType.VALIDITY]:*/ 2?: Uint8Array; + protected /* [VectorType.TYPE]:*/ 3?: Int8Array; + constructor(type: T, length: number, offset?: number, nullCount?: number) { + this.type = type; + this.length = Math.floor(Math.max(length || 0, 0)); + this.offset = Math.floor(Math.max(offset || 0, 0)); + this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); + } + public get typeId() { return this.type.TType; } + public get nullBitmap() { return this[VectorType.VALIDITY]; } + public get nullCount() { + let nullCount = this._nullCount; + let nullBitmap: Uint8Array | undefined; + if (nullCount === -1 && (nullBitmap = this[VectorType.VALIDITY])) { + this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); + } + return nullCount; + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new BaseData(type, length, offset, nullCount); + } + public slice(offset: number, length: number) { + return length <= 0 ? 
this : this.sliceInternal(this.clone( + this.type, length, this.offset + offset, +(this._nullCount === 0) - 1 + ) as any, offset, length); + } + protected sliceInternal(clone: this, offset: number, length: number) { + let arr: any; + // If typeIds exist, slice the typeIds buffer + (arr = this[VectorType.TYPE]) && (clone[VectorType.TYPE] = this.sliceData(arr, offset, length)); + // If offsets exist, only slice the offsets buffer + (arr = this[VectorType.OFFSET]) && (clone[VectorType.OFFSET] = this.sliceOffsets(arr, offset, length)) || + // Otherwise if no offsets, slice the data buffer + (arr = this[VectorType.DATA]) && (clone[VectorType.DATA] = this.sliceData(arr, offset, length)); + return clone; + } + protected sliceData(data: T['TArray'] & TypedArray, offset: number, length: number) { + return data.subarray(offset, offset + length); + } + protected sliceOffsets(valueOffsets: Int32Array, offset: number, length: number) { + return valueOffsets.subarray(offset, offset + length + 1); + } +} + +export class FlatData extends BaseData { + public /* [VectorType.DATA]:*/ 1: T['TArray']; + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + public get values() { return this[VectorType.DATA]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, data: Iterable, offset?: number, nullCount?: number) { + super(type, length, offset, nullCount); + this[VectorType.DATA] = toTypedArray(this.ArrayType, data); + this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); + } + public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new (this.constructor as any)(type, length, this[VectorType.VALIDITY], this[VectorType.DATA], offset, nullCount) as FlatData; + } +} + +export class BoolData extends FlatData { + protected sliceData(data: Uint8Array) { return data; } +} + +export class FlatListData extends FlatData { + public /* 
[VectorType.OFFSET]:*/ 0: Int32Array; + public /* [VectorType.DATA]:*/ 1: T['TArray']; + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + public get values() { return this[VectorType.DATA]; } + public get valueOffsets() { return this[VectorType.OFFSET]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, data: T['TArray'], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, data, offset, nullCount); + this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new FlatListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this[VectorType.DATA], offset, nullCount); + } +} + +export class DictionaryData extends BaseData> { + protected _dictionary: Vector; + protected _indicies: Data>; + public get indicies() { return this._indicies; } + public get dictionary() { return this._dictionary; } + constructor(type: Dictionary, dictionary: Vector, indicies: Data>) { + super(type, indicies.length, (indicies as any)._nullCount); + this._indicies = indicies; + this._dictionary = dictionary; + this.length = this._indicies.length; + } + public get nullCount() { return this._indicies.nullCount; } + public clone>(type: R, length = this.length, offset = this.offset) { + const data = this._dictionary.data.clone(type.dictionary as any); + return new DictionaryData( + this.type as any, + this._dictionary.clone(data) as any, + this._indicies.slice(offset - this.offset, length) + ) as any; + } + protected sliceInternal(clone: this, _offset: number, _length: number) { + clone.length = clone._indicies.length; + clone._nullCount = (clone._indicies as any)._nullCount; + return clone; + } +} + +export class NestedData extends BaseData { + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, childData: 
Data[], offset?: number, nullCount?: number) { + super(type, length, offset, nullCount); + this.childData = childData; + this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new NestedData(type, length, this[VectorType.VALIDITY], this.childData, offset, nullCount); + } + protected sliceInternal(clone: this, offset: number, length: number) { + if (!this[VectorType.OFFSET]) { + clone.childData = this.childData.map((child) => child.slice(offset, length)); + } + return super.sliceInternal(clone, offset, length); + } +} + +export class SingleNestedData extends NestedData { + protected _valuesData: Data; + public get values() { return this._valuesData; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueChildData: Data, offset?: number, nullCount?: number) { + super(type, length, nullBitmap, [valueChildData], offset, nullCount); + this._valuesData = valueChildData; + } +} + +export class ListData extends SingleNestedData { + public /* [VectorType.OFFSET]:*/ 0: Int32Array; + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + public get valueOffsets() { return this[VectorType.OFFSET]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, valueChildData: Data, offset?: number, nullCount?: number) { + super(type, length, nullBitmap, valueChildData, offset, nullCount); + this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new ListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this._valuesData as any, offset, nullCount); + } +} + +export class UnionData extends NestedData { + public /* [VectorType.TYPE]:*/ 3: T['TArray']; + public get typeIds() { return this[VectorType.TYPE]; } + constructor(type: T, length: number, 
nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, childData, offset, nullCount); + this[VectorType.TYPE] = toTypedArray(Int8Array, typeIds); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new UnionData(type, length, this[VectorType.VALIDITY], this[VectorType.TYPE], this.childData, offset, nullCount); + } +} + +export class SparseUnionData extends UnionData { + constructor(type: SparseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, typeIds, childData, offset, nullCount); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new SparseUnionData( + type, + length, + this[VectorType.VALIDITY], + this[VectorType.TYPE], + this.childData, + offset, nullCount + ) as any as UnionData; + } +} + +export class DenseUnionData extends UnionData { + public /* [VectorType.OFFSET]:*/ 0: Int32Array; + public get valueOffsets() { return this[VectorType.OFFSET]; } + constructor(type: DenseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, valueOffsets: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, typeIds, childData, offset, nullCount); + this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new DenseUnionData( + type, + length, + this[VectorType.VALIDITY], + this[VectorType.TYPE], + this[VectorType.OFFSET], + this.childData, + offset, nullCount + ) as any as UnionData; + } +} + +export class ChunkedData extends BaseData { + // @ts-ignore + protected _chunkData: Data[]; + protected _chunkVectors: Vector[]; + protected 
_chunkOffsets: Uint32Array; + public get chunkVectors() { return this._chunkVectors; } + public get chunkOffsets() { return this._chunkOffsets; } + public get chunkData() { + return this._chunkData || ( + this._chunkData = this._chunkVectors.map(({ data }) => data)); + } + constructor(type: T, length: number, chunkVectors: Vector[], offset?: number, nullCount?: number, chunkOffsets?: Uint32Array) { + super(type, length, offset, nullCount); + this._chunkVectors = chunkVectors; + this._chunkOffsets = chunkOffsets || ChunkedData.computeOffsets(chunkVectors); + } + public get nullCount() { + let nullCount = this._nullCount; + if (nullCount === -1) { + this._nullCount = nullCount = this._chunkVectors.reduce((x, c) => x + c.nullCount, 0); + } + return nullCount; + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new ChunkedData( + type, length, + this._chunkVectors.map((vec) => vec.clone(vec.data.clone(type))) as any, + offset, nullCount, this._chunkOffsets + ); + } + protected sliceInternal(clone: this, offset: number, length: number) { + const chunks = this._chunkVectors; + const offsets = this._chunkOffsets; + const chunkSlices: Vector[] = []; + for (let childIndex = -1, numChildren = chunks.length; ++childIndex < numChildren;) { + const child = chunks[childIndex]; + const childLength = child.length; + const childOffset = offsets[childIndex]; + // If the child is to the right of the slice boundary, exclude + if (childOffset >= offset + length) { continue; } + // If the child is to the left of of the slice boundary, exclude + if (offset >= childOffset + childLength) { continue; } + // If the child is between both left and right boundaries, include w/o slicing + if (childOffset >= offset && (childOffset + childLength) <= offset + length) { + chunkSlices.push(child); + continue; + } + // If the child overlaps one of the slice boundaries, include that slice + const begin = Math.max(0, offset - childOffset); + 
const end = begin + Math.min(childLength - begin, (offset + length) - childOffset); + chunkSlices.push(child.slice(begin, end)); + } + clone._chunkVectors = chunkSlices; + clone._chunkOffsets = ChunkedData.computeOffsets(chunkSlices); + return clone; + } + static computeOffsets(childVectors: Vector[]) { + const childOffsets = new Uint32Array(childVectors.length + 1); + for (let index = 0, length = childOffsets.length, childOffset = childOffsets[0] = 0; ++index < length;) { + childOffsets[index] = (childOffset += childVectors[index - 1].length); + } + return childOffsets; + } +} diff --git a/js/src/format/fb/File.ts b/js/src/fb/File.ts similarity index 99% rename from js/src/format/fb/File.ts rename to js/src/fb/File.ts index 56f50ed20e936..f4ba865ff040b 100644 --- a/js/src/format/fb/File.ts +++ b/js/src/fb/File.ts @@ -14,6 +14,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -175,6 +176,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/format/fb/File_generated.js b/js/src/fb/File_generated.js similarity index 100% rename from js/src/format/fb/File_generated.js rename to js/src/fb/File_generated.js diff --git a/js/src/format/fb/Message.ts b/js/src/fb/Message.ts similarity index 99% rename from js/src/format/fb/Message.ts rename to js/src/fb/Message.ts index 4610fbef2e1c8..537c65d1f8c93 100644 --- a/js/src/format/fb/Message.ts +++ b/js/src/fb/Message.ts @@ -45,6 +45,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -110,6 +111,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -265,6 +267,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: 
flatbuffers.ByteBuffer; /** @@ -369,6 +372,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/format/fb/Message_generated.js b/js/src/fb/Message_generated.js similarity index 100% rename from js/src/format/fb/Message_generated.js rename to js/src/fb/Message_generated.js diff --git a/js/src/format/fb/Schema.ts b/js/src/fb/Schema.ts similarity index 99% rename from js/src/format/fb/Schema.ts rename to js/src/fb/Schema.ts index d9b45ed20089c..4a4aeb65599be 100644 --- a/js/src/format/fb/Schema.ts +++ b/js/src/fb/Schema.ts @@ -165,6 +165,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -221,6 +222,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -273,6 +275,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -325,6 +328,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -420,6 +424,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -495,6 +500,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -617,6 +623,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -701,6 +708,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -771,6 +779,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** 
@@ -823,6 +832,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -875,6 +885,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -945,6 +956,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -997,6 +1009,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1092,6 +1105,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1164,6 +1178,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1255,6 +1270,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1363,6 +1379,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1435,6 +1452,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1527,6 +1545,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1620,6 +1639,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1741,6 +1761,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -2026,6 +2047,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ 
-2089,6 +2111,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/format/fb/Schema_generated.js b/js/src/fb/Schema_generated.js similarity index 100% rename from js/src/format/fb/Schema_generated.js rename to js/src/fb/Schema_generated.js diff --git a/js/src/format/arrow.ts b/js/src/format/arrow.ts deleted file mode 100644 index 14adf9040a47f..0000000000000 --- a/js/src/format/arrow.ts +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import { footerFromByteBuffer, messageFromByteBuffer } from './fb'; -import { schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from './json'; -import { - IntBitWidth, TimeBitWidth, - VisitorNode, Visitor, Footer, Block, Message, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from './types'; - -export { - IntBitWidth, TimeBitWidth, - footerFromByteBuffer, messageFromByteBuffer, - schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON, - VisitorNode, Visitor, Footer, Block, Message, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_ as Map, -}; diff --git a/js/src/format/fb.ts b/js/src/format/fb.ts deleted file mode 100644 index fdf7f7b0ed99a..0000000000000 --- a/js/src/format/fb.ts +++ /dev/null @@ -1,234 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import * as File_ from './fb/File'; -import * as Schema_ from './fb/Schema'; -import * as Message_ from './fb/Message'; -import { flatbuffers } from 'flatbuffers'; -import ByteBuffer = flatbuffers.ByteBuffer; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import _Footer = File_.org.apache.arrow.flatbuf.Footer; -import _Block = File_.org.apache.arrow.flatbuf.Block; -import _Message = Message_.org.apache.arrow.flatbuf.Message; -import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; -import _Field = Schema_.org.apache.arrow.flatbuf.Field; -import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; -import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; -import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; -import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; -import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; -import _Null = Schema_.org.apache.arrow.flatbuf.Null; -import _Int = Schema_.org.apache.arrow.flatbuf.Int; -import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; -import _Binary = Schema_.org.apache.arrow.flatbuf.Binary; -import _Bool = Schema_.org.apache.arrow.flatbuf.Bool; -import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8; -import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal; -import _Date = Schema_.org.apache.arrow.flatbuf.Date; -import _Time = Schema_.org.apache.arrow.flatbuf.Time; -import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp; -import _Interval = Schema_.org.apache.arrow.flatbuf.Interval; -import _List = Schema_.org.apache.arrow.flatbuf.List; -import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_; -import _Union = Schema_.org.apache.arrow.flatbuf.Union; -import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary; -import _FixedSizeList = 
Schema_.org.apache.arrow.flatbuf.FixedSizeList; -import _Map = Schema_.org.apache.arrow.flatbuf.Map; - -import { - IntBitWidth, TimeBitWidth, - Footer, Block, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from './types'; - -export function footerFromByteBuffer(bb: ByteBuffer) { - const f = _Footer.getRootAsFooter(bb), s = f.schema()!; - return new Footer( - dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f), - new Schema(f.version(), fieldsFromSchema(s), customMetadata(s), s.endianness()) - ); -} - -export function messageFromByteBuffer(bb: ByteBuffer) { - const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version(); - switch (type) { - case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!); - case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m.header(new _RecordBatch())!); - case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m.header(new _DictionaryBatch())!); - } - return null; - // throw new Error(`Unrecognized Message type '${type}'`); -} - -function schemaFromMessage(version: MetadataVersion, s: _Schema) { - return new Schema(version, fieldsFromSchema(s), customMetadata(s), s.endianness()); -} - -function recordBatchFromMessage(version: MetadataVersion, b: _RecordBatch) { - return new RecordBatch(version, b.length(), fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version)); -} - -function dictionaryBatchFromMessage(version: MetadataVersion, d: _DictionaryBatch) { - return new DictionaryBatch(version, recordBatchFromMessage(version, d.data()!), d.id(), d.isDelta()); -} - -function dictionaryBatchesFromFooter(f: _Footer) { - const blocks = [] as Block[]; - for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) { - if (b = 
f.dictionaries(i)!) { - blocks.push(new Block(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function recordBatchesFromFooter(f: _Footer) { - const blocks = [] as Block[]; - for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) { - if (b = f.recordBatches(i)!) { - blocks.push(new Block(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function fieldsFromSchema(s: _Schema) { - const fields = [] as Field[]; - for (let i = -1, n = s && s.fieldsLength(); ++i < n;) { - fields.push(field(s.fields(i)!)); - } - return fields; -} - -function fieldsFromField(f: _Field) { - const fields = [] as Field[]; - for (let i = -1, n = f && f.childrenLength(); ++i < n;) { - fields.push(field(f.children(i)!)); - } - return fields; -} - -function fieldNodesFromRecordBatch(b: _RecordBatch) { - const fieldNodes = [] as FieldNode[]; - for (let i = -1, n = b.nodesLength(); ++i < n;) { - fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!)); - } - return fieldNodes; -} - -function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) { - const buffers = [] as Buffer[]; - for (let i = -1, n = b.buffersLength(); ++i < n;) { - let buffer = b.buffers(i)!; - // If this Arrow buffer was written before version 4, - // advance the buffer's bb_pos 8 bytes to skip past - // the now-removed page id field. 
- if (version < MetadataVersion.V4) { - buffer.bb_pos += (8 * (i + 1)); - } - buffers.push(bufferFromRecordBatch(buffer)); - } - return buffers; -} - -function field(f: _Field) { - return new Field( - f.name()!, - typeFromField(f), - f.typeType(), - f.nullable(), - fieldsFromField(f), - customMetadata(f), - dictionaryEncodingFromField(f) - ); -} - -function dictionaryEncodingFromField(f: _Field) { - let t: _Int | null; - let e: _DictionaryEncoding | null; - if (e = f.dictionary()) { - if (t = e.indexType()) { - return new DictionaryEncoding(new Int(t.isSigned(), t.bitWidth() as IntBitWidth), e.id(), e.isOrdered()); - } - return new DictionaryEncoding(null, e.id(), e.isOrdered()); - } - return undefined; -} - -function customMetadata(parent?: _Schema | _Field | null) { - const data = new Map(); - if (parent) { - for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { - if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { - data.set(key, entry.value()!); - } - } - } - return data; -} - -function fieldNodeFromRecordBatch(f: _FieldNode) { - return new FieldNode(f.length(), f.nullCount()); -} - -function bufferFromRecordBatch(b: _Buffer) { - return new Buffer(b.offset(), b.length()); -} - -function typeFromField(f: _Field) { - switch (f.typeType()) { - case Type.NONE: return nullFromField(f.type(new _Null())!); - case Type.Null: return nullFromField(f.type(new _Null())!); - case Type.Int: return intFromField(f.type(new _Int())!); - case Type.FloatingPoint: return floatingPointFromField(f.type(new _FloatingPoint())!); - case Type.Binary: return binaryFromField(f.type(new _Binary())!); - case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); - case Type.Bool: return boolFromField(f.type(new _Bool())!); - case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); - case Type.Date: return dateFromField(f.type(new _Date())!); - case Type.Time: return timeFromField(f.type(new _Time())!); - case Type.Timestamp: return 
timestampFromField(f.type(new _Timestamp())!); - case Type.Interval: return intervalFromField(f.type(new _Interval())!); - case Type.List: return listFromField(f.type(new _List())!); - case Type.Struct_: return structFromField(f.type(new _Struct())!); - case Type.Union: return unionFromField(f.type(new _Union())!); - case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); - case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!); - case Type.Map: return mapFromField(f.type(new _Map())!); - } - throw new Error(`Unrecognized type ${f.typeType()}`); -} - -function nullFromField(_type: _Null) { return new Null(); } -function intFromField(_type: _Int) { return new Int(_type.isSigned(), _type.bitWidth() as IntBitWidth); } -function floatingPointFromField(_type: _FloatingPoint) { return new FloatingPoint(_type.precision()); } -function binaryFromField(_type: _Binary) { return new Binary(); } -function utf8FromField(_type: _Utf8) { return new Utf8(); } -function boolFromField(_type: _Bool) { return new Bool(); } -function decimalFromField(_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } -function dateFromField(_type: _Date) { return new Date(_type.unit()); } -function timeFromField(_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } -function timestampFromField(_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } -function intervalFromField(_type: _Interval) { return new Interval(_type.unit()); } -function listFromField(_type: _List) { return new List(); } -function structFromField(_type: _Struct) { return new Struct(); } -function unionFromField(_type: _Union) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[]); } -function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } -function fixedSizeListFromField(_type: _FixedSizeList) { return new 
FixedSizeList(_type.listSize()); } -function mapFromField(_type: _Map) { return new Map_(_type.keysSorted()); } diff --git a/js/src/format/json.ts b/js/src/format/json.ts deleted file mode 100644 index 3da3db6d5fea3..0000000000000 --- a/js/src/format/json.ts +++ /dev/null @@ -1,173 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import * as Schema_ from './fb/Schema'; -import { flatbuffers } from 'flatbuffers'; -import Long = flatbuffers.Long; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; -import { - IntBitWidth, TimeBitWidth, - Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from './types'; - -export function schemaFromJSON(s: any): Schema { - // todo: metadataFromJSON - return new Schema( - MetadataVersion.V4, - fieldsFromJSON(s['fields']), - customMetadata(s['customMetadata']) - ); -} - -export function recordBatchFromJSON(b: any): RecordBatch { - return new RecordBatch( - MetadataVersion.V4, - new Long(b['count'], 0), - fieldNodesFromJSON(b['columns']), - buffersFromJSON(b['columns']) - ); -} - -export function dictionaryBatchFromJSON(b: any): DictionaryBatch { - return new DictionaryBatch( - MetadataVersion.V4, - recordBatchFromJSON(b['data']), - new Long(b['id'], 0), b['isDelta'] - ); -} - -function fieldsFromJSON(fs: any[]): Field[] { - return (fs || []).map(fieldFromJSON); -} - -function fieldNodesFromJSON(xs: any[]): FieldNode[] { - return (xs || []).reduce((fieldNodes, column: any) => [ - ...fieldNodes, - new FieldNode( - new Long(column['count'], 0), - new Long(nullCountFromJSON(column['VALIDITY']), 0) - ), - ...fieldNodesFromJSON(column['children']) - ], [] as FieldNode[]); -} - -function buffersFromJSON(xs: any[], buffers: Buffer[] = []): Buffer[] { - for (let i = -1, n = (xs || []).length; ++i < n;) { - const column = xs[i]; - column['VALIDITY'] && 
buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0))); - column['OFFSET'] && buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0))); - column['DATA'] && buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['DATA'].length, 0))); - buffers = buffersFromJSON(column['children'], buffers); - } - return buffers; -} - -function nullCountFromJSON(validity: number[]) { - return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); -} - -function fieldFromJSON(f: any) { - return new Field( - f['name'], - typeFromJSON(f['type']), - namesToTypeMap[f['type']['name']], - f.nullable, - fieldsFromJSON(f['children']), - customMetadata(f['customMetadata']), - dictionaryEncodingFromJSON(f['dictionary']) - ); -} - -function dictionaryEncodingFromJSON(d: any) { - return !d ? null : new DictionaryEncoding( - d.indexType ? intFromJSON(d.indexType) : null, - new Long(d.id, 0), d.isOrdered - ); -} - -function customMetadata(metadata?: any) { - return new Map(Object.entries(metadata || {})); -} - -const namesToTypeMap: { [n: string]: Type } = { - 'NONE': Type.NONE, - 'null': Type.Null, - 'int': Type.Int, - 'floatingpoint': Type.FloatingPoint, - 'binary': Type.Binary, - 'bool': Type.Bool, - 'utf8': Type.Utf8, - 'decimal': Type.Decimal, - 'date': Type.Date, - 'time': Type.Time, - 'timestamp': Type.Timestamp, - 'interval': Type.Interval, - 'list': Type.List, - 'struct': Type.Struct_, - 'union': Type.Union, - 'fixedsizebinary': Type.FixedSizeBinary, - 'fixedsizelist': Type.FixedSizeList, - 'map': Type.Map, -}; - -function typeFromJSON(t: any) { - switch (namesToTypeMap[t['name']]) { - case Type.NONE: return nullFromJSON(t); - case Type.Null: return nullFromJSON(t); - case Type.Int: return intFromJSON(t); - case Type.FloatingPoint: return floatingPointFromJSON(t); - case Type.Binary: return binaryFromJSON(t); - case Type.Utf8: return utf8FromJSON(t); - case Type.Bool: return boolFromJSON(t); - 
case Type.Decimal: return decimalFromJSON(t); - case Type.Date: return dateFromJSON(t); - case Type.Time: return timeFromJSON(t); - case Type.Timestamp: return timestampFromJSON(t); - case Type.Interval: return intervalFromJSON(t); - case Type.List: return listFromJSON(t); - case Type.Struct_: return structFromJSON(t); - case Type.Union: return unionFromJSON(t); - case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); - case Type.FixedSizeList: return fixedSizeListFromJSON(t); - case Type.Map: return mapFromJSON(t); - } - throw new Error(`Unrecognized type ${t['name']}`); -} - -function nullFromJSON(_type: any) { return new Null(); } -function intFromJSON(_type: any) { return new Int(_type['isSigned'], _type['bitWidth'] as IntBitWidth); } -function floatingPointFromJSON(_type: any) { return new FloatingPoint(Precision[_type['precision']] as any); } -function binaryFromJSON(_type: any) { return new Binary(); } -function utf8FromJSON(_type: any) { return new Utf8(); } -function boolFromJSON(_type: any) { return new Bool(); } -function decimalFromJSON(_type: any) { return new Decimal(_type['scale'], _type['precision']); } -function dateFromJSON(_type: any) { return new Date(DateUnit[_type['unit']] as any); } -function timeFromJSON(_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } -function timestampFromJSON(_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } -function intervalFromJSON(_type: any) { return new Interval(IntervalUnit[_type['unit']] as any); } -function listFromJSON(_type: any) { return new List(); } -function structFromJSON(_type: any) { return new Struct(); } -function unionFromJSON(_type: any) { return new Union(_type['mode'], (_type['typeIdsArray'] || []) as Type[]); } -function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } -function fixedSizeListFromJSON(_type: any) { return new FixedSizeList(_type['listSize']); } 
-function mapFromJSON(_type: any) { return new Map_(_type['keysSorted']); } diff --git a/js/src/format/types.ts b/js/src/format/types.ts deleted file mode 100644 index 09df8ccbbdf7c..0000000000000 --- a/js/src/format/types.ts +++ /dev/null @@ -1,393 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -/* tslint:disable:class-name */ - -import { align } from '../util/layout'; -import * as Schema_ from './fb/Schema'; -import * as Message_ from './fb/Message'; -import { flatbuffers } from 'flatbuffers'; -import Long = flatbuffers.Long; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; -import Endianness = Schema_.org.apache.arrow.flatbuf.Endianness; -import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; - -export type IntBitWidth = 8 | 16 | 32 | 64; -export type TimeBitWidth = IntBitWidth | 128; - -export interface VisitorNode { - accept(visitor: Visitor): any; -} - -export abstract class Visitor { - visit(node: VisitorNode): T { - return node.accept(this); - } - visitMany(nodes: VisitorNode[]): T[] { - return nodes.map((node) => this.visit(node)); - } - abstract visitFooter(node: Footer): any; - abstract visitBlock(node: Block): any; - abstract visitMessage(node: Message): any; - abstract visitSchema(node: Schema): any; - abstract visitField(node: Field): any; - abstract visitBuffer(node: Buffer): any; - abstract visitFieldNode(node: FieldNode): any; - abstract visitRecordBatch(node: RecordBatch): any; - abstract visitDictionaryBatch(node: DictionaryBatch): any; - abstract visitDictionaryEncoding(node: DictionaryEncoding): any; - abstract visitNullFieldType(node: Null): any; - abstract visitIntFieldType(node: Int): any; - abstract visitFloatingPointFieldType(node: FloatingPoint): any; - abstract visitBinaryFieldType(node: Binary): any; - abstract visitBoolFieldType(node: Bool): any; - abstract visitUtf8FieldType(node: Utf8): any; - abstract 
visitDecimalFieldType(node: Decimal): any; - abstract visitDateFieldType(node: Date): any; - abstract visitTimeFieldType(node: Time): any; - abstract visitTimestampFieldType(node: Timestamp): any; - abstract visitIntervalFieldType(node: Interval): any; - abstract visitListFieldType(node: List): any; - abstract visitStructFieldType(node: Struct): any; - abstract visitUnionFieldType(node: Union): any; - abstract visitFixedSizeBinaryFieldType(node: FixedSizeBinary): any; - abstract visitFixedSizeListFieldType(node: FixedSizeList): any; - abstract visitMapFieldType(node: Map_): any; -} - -export class Footer implements VisitorNode { - constructor(public dictionaryBatches: Block[], public recordBatches: Block[], public schema: Schema) {} - accept(visitor: Visitor): any { - return visitor.visitFooter(this); - } -} - -export class Block implements VisitorNode { - constructor(public metaDataLength: number, public bodyLength: Long, public offset: Long) {} - accept(visitor: Visitor): any { - return visitor.visitBlock(this); - } -} - -export class Message implements VisitorNode { - constructor(public version: MetadataVersion, public bodyLength: Long, public headerType: MessageHeader) {} - isSchema(): this is Schema { return this.headerType === MessageHeader.Schema; } - isRecordBatch(): this is RecordBatch { return this.headerType === MessageHeader.RecordBatch; } - isDictionaryBatch(): this is DictionaryBatch { return this.headerType === MessageHeader.DictionaryBatch; } - accept(visitor: Visitor): any { - visitor.visitMessage(this); - } -} - -export class Schema extends Message { - public dictionaries: Map; - constructor(version: MetadataVersion, public fields: Field[], public customMetadata?: Map, public endianness = Endianness.Little) { - super(version, Long.ZERO, MessageHeader.Schema); - const dictionaries = [] as Field[]; - for (let f: Field, i = -1, n = fields.length; ++i < n;) { - if ((f = fields[i])) { - f.dictionary && dictionaries.push(f); - 
dictionaries.push(...f.dictionaries); - } - } - this.dictionaries = new Map(dictionaries.map<[string, Field]>((f) => [ - f.dictionary!.dictionaryId.toFloat64().toString(), f - ])); - } - accept(visitor: Visitor): any { - return visitor.visitSchema(this); - } -} - -export class RecordBatch extends Message { - constructor(version: MetadataVersion, public length: Long, public fieldNodes: FieldNode[], public buffers: Buffer[]) { - super(version, new Long(buffers.reduce((s, b) => align(s + b.length.low + (b.offset.low - s), 8), 0), 0), MessageHeader.RecordBatch); - } - accept(visitor: Visitor) { - return visitor.visitRecordBatch(this); - } -} - -export class DictionaryBatch extends Message { - constructor(version: MetadataVersion, public dictionary: RecordBatch, public dictionaryId: Long, public isDelta: boolean) { - super(version, dictionary.bodyLength, MessageHeader.DictionaryBatch); - } - get fieldNodes(): FieldNode[] { return this.dictionary.fieldNodes; } - get buffers(): Buffer[] { return this.dictionary.buffers; } - accept(visitor: Visitor) { - return visitor.visitDictionaryBatch(this); - } - static atomicDictionaryId = 0; -} - -export class Field implements VisitorNode { - public dictionaries: Field[]; - constructor(public name: string, - public type: FieldType, - public typeType: Type, - public nullable = false, - public children: Field[] = [], - public metadata?: Map | null, - public dictionary?: DictionaryEncoding | null) { - const dictionaries = [] as Field[]; - for (let f: Field, i = -1, n = children.length; ++i < n;) { - if ((f = children[i])) { - f.dictionary && dictionaries.push(f); - dictionaries.push(...f.dictionaries); - } - } - this.dictionaries = dictionaries; - } - accept(visitor: Visitor): any { - return visitor.visitField(this); - } - indexField() { - return !this.dictionary ? 
this : new Field( - this.name, - this.dictionary.indexType, this.dictionary.indexType.type, - this.nullable, this.children, this.metadata, this.dictionary - ); - } - toString() { return `Field name[${this.name}], nullable[${this.nullable}], type[${this.type.toString()}]`; } -} - -export class Buffer implements VisitorNode { - constructor(public offset: Long, public length: Long) {} - accept(visitor: Visitor) { - return visitor.visitBuffer(this); - } -} - -export class FieldNode implements VisitorNode { - constructor(public length: Long, public nullCount: Long) {} - accept(visitor: Visitor) { - return visitor.visitFieldNode(this); - } -} - -export class DictionaryEncoding implements VisitorNode { - public isOrdered: boolean; - public dictionaryId: Long; - public indexType: Int; - constructor(indexType?: Int | null, dictionaryId?: Long | null, isOrdered?: boolean | null) { - this.isOrdered = isOrdered || false; - /* a dictionary index defaults to signed 32 bit int if unspecified */ - this.indexType = indexType || new Int(true, 32); - this.dictionaryId = dictionaryId || new Long(DictionaryBatch.atomicDictionaryId++, 0); - } - accept(visitor: Visitor): any { - return visitor.visitDictionaryEncoding(this); - } -} - -export abstract class FieldType implements VisitorNode { - constructor(public type: Type) {} - abstract accept(visitor: Visitor): any; - isNull(): this is Null { return this.type === Type.Null; } - isInt(): this is Int { return this.type === Type.Int; } - isFloatingPoint(): this is FloatingPoint { return this.type === Type.FloatingPoint; } - isBinary(): this is Binary { return this.type === Type.Binary; } - isUtf8(): this is Utf8 { return this.type === Type.Utf8; } - isBool(): this is Bool { return this.type === Type.Bool; } - isDecimal(): this is Decimal { return this.type === Type.Decimal; } - isDate(): this is Date { return this.type === Type.Date; } - isTime(): this is Time { return this.type === Type.Time; } - isTimestamp(): this is Timestamp { return 
this.type === Type.Timestamp; } - isInterval(): this is Interval { return this.type === Type.Interval; } - isList(): this is List { return this.type === Type.List; } - isStruct(): this is Struct { return this.type === Type.Struct_; } - isUnion(): this is Union { return this.type === Type.Union; } - isFixedSizeBinary(): this is FixedSizeBinary { return this.type === Type.FixedSizeBinary; } - isFixedSizeList(): this is FixedSizeList { return this.type === Type.FixedSizeList; } - isMap(): this is Map_ { return this.type === Type.Map; } -} - -export class Null extends FieldType { - toString() { return `Null`; } - constructor() { - super(Type.Null); - } - accept(visitor: Visitor) { - return visitor.visitNullFieldType(this); - } -} - -export class Int extends FieldType { - toString() { return `Int isSigned[${this.isSigned}], bitWidth[${this.bitWidth}]`; } - constructor(public isSigned: boolean, public bitWidth: IntBitWidth) { - super(Type.Int); - } - accept(visitor: Visitor) { - return visitor.visitIntFieldType(this); - } -} - -export class FloatingPoint extends FieldType { - toString() { return `FloatingPoint precision`; } - constructor(public precision: Precision) { - super(Type.FloatingPoint); - } - accept(visitor: Visitor) { - return visitor.visitFloatingPointFieldType(this); - } -} - -export class Binary extends FieldType { - toString() { return `Binary`; } - constructor() { - super(Type.Binary); - } - accept(visitor: Visitor) { - return visitor.visitBinaryFieldType(this); - } -} - -export class Utf8 extends FieldType { - toString() { return `Utf8`; } - constructor() { - super(Type.Utf8); - } - accept(visitor: Visitor) { - return visitor.visitUtf8FieldType(this); - } -} - -export class Bool extends FieldType { - toString() { return `Bool`; } - constructor() { - super(Type.Bool); - } - accept(visitor: Visitor) { - return visitor.visitBoolFieldType(this); - } -} - -export class Decimal extends FieldType { - toString() { return `Decimal scale[${this.scale}], 
precision[${this.precision}]`; } - constructor(public scale: number, public precision: number) { - super(Type.Decimal); - } - accept(visitor: Visitor) { - return visitor.visitDecimalFieldType(this); - } -} - -export class Date extends FieldType { - toString() { return `Date unit[${this.unit}]`; } - constructor(public unit: DateUnit) { - super(Type.Date); - } - accept(visitor: Visitor) { - return visitor.visitDateFieldType(this); - } -} - -export class Time extends FieldType { - toString() { return `Time unit[${this.unit}], bitWidth[${this.bitWidth}]`; } - constructor(public unit: TimeUnit, public bitWidth: TimeBitWidth) { - super(Type.Time); - } - accept(visitor: Visitor) { - return visitor.visitTimeFieldType(this); - } -} - -export class Timestamp extends FieldType { - toString() { return `Timestamp unit[${this.unit}], timezone[${this.timezone}]`; } - constructor(public unit: TimeUnit, public timezone?: string | null) { - super(Type.Timestamp); - } - accept(visitor: Visitor) { - return visitor.visitTimestampFieldType(this); - } -} - -export class Interval extends FieldType { - toString() { return `Interval unit[${this.unit}]`; } - constructor(public unit: IntervalUnit) { - super(Type.Interval); - } - accept(visitor: Visitor) { - return visitor.visitIntervalFieldType(this); - } -} - -export class List extends FieldType { - toString() { return `List`; } - constructor() { - super(Type.List); - } - accept(visitor: Visitor) { - return visitor.visitListFieldType(this); - } -} - -export class Struct extends FieldType { - toString() { return `Struct`; } - constructor() { - super(Type.Struct_); - } - accept(visitor: Visitor) { - return visitor.visitStructFieldType(this); - } -} - -export class Union extends FieldType { - toString() { return `Union mode[${this.mode}], typeIds[${this.typeIds}]`; } - constructor(public mode: UnionMode, public typeIds: Type[]) { - super(Type.Union); - } - accept(visitor: Visitor) { - return visitor.visitUnionFieldType(this); - } -} - -export 
class FixedSizeBinary extends FieldType { - toString() { return `FixedSizeBinary byteWidth[${this.byteWidth}]`; } - constructor(public byteWidth: number) { - super(Type.FixedSizeBinary); - } - accept(visitor: Visitor) { - return visitor.visitFixedSizeBinaryFieldType(this); - } -} - -export class FixedSizeList extends FieldType { - toString() { return `FixedSizeList listSize[${this.listSize}]`; } - constructor(public listSize: number) { - super(Type.FixedSizeList); - } - accept(visitor: Visitor) { - return visitor.visitFixedSizeListFieldType(this); - } -} - -export class Map_ extends FieldType { - toString() { return `Map keysSorted[${this.keysSorted}]`; } - constructor(public keysSorted: boolean) { - super(Type.Map); - } - accept(visitor: Visitor) { - return visitor.visitMapFieldType(this); - } -} diff --git a/js/src/ipc/metadata.ts b/js/src/ipc/metadata.ts new file mode 100644 index 0000000000000..88b7e52983b8e --- /dev/null +++ b/js/src/ipc/metadata.ts @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+/* tslint:disable:class-name */
+
+import { align } from '../util/bit';
+import { Schema, Long, MessageHeader, MetadataVersion } from '../type';
+
+/** Narrows a `Long | number` to a number: numbers pass through, a Long contributes its `low` field. */
+function toNumber(value: Long | number): number {
+    if (typeof value === 'number') {
+        return value;
+    }
+    return value.low;
+}
+
+/** Folds the buffers into a single body length, re-aligning the running end to 8 after each buffer. */
+function alignedBodyLength(buffers: BufferMetadata[]): number {
+    let end = 0;
+    for (const buffer of buffers) {
+        end = align(end + buffer.length + (buffer.offset - end), 8);
+    }
+    return end;
+}
+
+/** File footer: the blocks locating dictionary and record batches, plus the schema. */
+export class Footer {
+    constructor(
+        public dictionaryBatches: FileBlock[],
+        public recordBatches: FileBlock[],
+        public schema: Schema
+    ) {}
+}
+
+/** Location record for one batch inside a file: metadata length, body length and offset. */
+export class FileBlock {
+    constructor(
+        public metaDataLength: number,
+        public bodyLength: Long,
+        public offset: Long
+    ) {}
+}
+
+/** Base class for message metadata; `headerType` discriminates the concrete kind. */
+export class Message {
+    public bodyLength: number;
+    public version: MetadataVersion;
+    public headerType: MessageHeader;
+    constructor(version: MetadataVersion, bodyLength: Long | number, headerType: MessageHeader) {
+        this.version = version;
+        this.headerType = headerType;
+        this.bodyLength = toNumber(bodyLength);
+    }
+    /** Type guard: true when the message header is `MessageHeader.Schema`. */
+    static isSchema(m: Message): m is Schema {
+        return m.headerType === MessageHeader.Schema;
+    }
+    /** Type guard: true when the message header is `MessageHeader.RecordBatch`. */
+    static isRecordBatch(m: Message): m is RecordBatchMetadata {
+        return m.headerType === MessageHeader.RecordBatch;
+    }
+    /** Type guard: true when the message header is `MessageHeader.DictionaryBatch`. */
+    static isDictionaryBatch(m: Message): m is DictionaryBatch {
+        return m.headerType === MessageHeader.DictionaryBatch;
+    }
+}
+
+/** Record batch metadata: row count plus the field nodes and buffer descriptors. */
+export class RecordBatchMetadata extends Message {
+    public length: number;
+    public nodes: FieldMetadata[];
+    public buffers: BufferMetadata[];
+    constructor(version: MetadataVersion, length: Long | number, nodes: FieldMetadata[], buffers: BufferMetadata[]) {
+        // The body length is derived from the buffer descriptors rather than passed in.
+        super(version, alignedBodyLength(buffers), MessageHeader.RecordBatch);
+        this.length = toNumber(length);
+        this.nodes = nodes;
+        this.buffers = buffers;
+    }
+}
+
+/** Dictionary batch metadata: wraps a record batch and tags it with a dictionary id. */
+export class DictionaryBatch extends Message {
+    public id: number;
+    public isDelta: boolean;
+    public data: RecordBatchMetadata;
+    constructor(version: MetadataVersion, data: RecordBatchMetadata, id: Long | number, isDelta: boolean = false) {
+        super(version, data.bodyLength, MessageHeader.DictionaryBatch);
+        this.id = toNumber(id);
+        this.data = data;
+        this.isDelta = isDelta;
+    }
+    private static atomicDictionaryId = 0;
+    /** Returns the current counter value, then increments the counter. */
+    public static getId() {
+        const next = DictionaryBatch.atomicDictionaryId;
+        DictionaryBatch.atomicDictionaryId = next + 1;
+        return next;
+    }
+    /** Field nodes of the wrapped record batch. */
+    public get nodes(): FieldMetadata[] {
+        return this.data.nodes;
+    }
+    /** Buffer descriptors of the wrapped record batch. */
+    public get buffers(): BufferMetadata[] {
+        return this.data.buffers;
+    }
+}
+
+/** Offset and length of one buffer, both stored as plain numbers. */
+export class BufferMetadata {
+    public offset: number;
+    public length: number;
+    constructor(offset: Long | number, length: Long | number) {
+        this.offset = toNumber(offset);
+        this.length = toNumber(length);
+    }
+}
+
+/** Length and null count of one field node, both stored as plain numbers. */
+export class FieldMetadata {
+    public length: number;
+    public nullCount: number;
+    constructor(length: Long | number, nullCount: Long | number) {
+        this.length = toNumber(length);
+        this.nullCount = toNumber(nullCount);
+    }
+}
diff --git a/js/src/reader/arrow.ts b/js/src/ipc/reader/arrow.ts
similarity index 62%
rename from js/src/reader/arrow.ts
rename to js/src/ipc/reader/arrow.ts
index cf8a3d6a281a2..af535900cbf46 100644
--- a/js/src/reader/arrow.ts
+++ b/js/src/ipc/reader/arrow.ts
@@ -16,33 +16,33 @@
 // under the License.
 import { readJSON } from './json';
-import { readBuffers, readBuffersAsync } from './buffer';
-import { readVectors, readVectorsAsync } from './vector';
-import { Vector } from '../vector/vector';
+import { RecordBatch } from '../../recordbatch';
+import { readBuffers, readBuffersAsync } from './binary';
+import { readRecordBatches, readRecordBatchesAsync, TypeDataLoader } from './vector';
+import { Schema } from '../../type';
+import { Message } from '../metadata';
 
-export { readJSON };
+export { readJSON, RecordBatch };
 export { readBuffers, readBuffersAsync };
-export { readVectors, readVectorsAsync };
+export { readRecordBatches, readRecordBatchesAsync };
 
+/**
+ * Reads record batches from `sources`.
+ *
+ * A string is first tried as JSON text; if parsing throws, the original string
+ * is kept and read as a binary buffer. Inputs exposing `Symbol.iterator` go
+ * through `readBuffers`, any other object goes through `readJSON`, and a value
+ * that is neither an object nor a string produces no messages.
+ */
 export function* read(sources: Iterable | object | string) {
     let input: any = sources;
-    let batches: Iterable;
+    let messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>;
     if (typeof input === 'string') {
         try { input = JSON.parse(input); }
         catch (e) { input = sources; }
     }
     if (!input || typeof input !== 'object') {
-        batches = (typeof input === 'string') ? readVectors(readBuffers([input])) : [];
+        messages = (typeof input === 'string') ? readBuffers([input]) : [];
     } else {
-        batches = (typeof input[Symbol.iterator] === 'function')
-            ? readVectors(readBuffers(input))
-            : readVectors(readJSON(input));
+        messages = (typeof input[Symbol.iterator] === 'function') ? readBuffers(input) : readJSON(input);
     }
-    yield* batches;
+    yield* readRecordBatches(messages);
 }
 
+/**
+ * Async counterpart of `read` for binary sources only: pipes `sources` through
+ * `readBuffersAsync` and `readRecordBatchesAsync`, yielding each record batch.
+ */
 export async function* readAsync(sources: AsyncIterable) {
-    for await (let vectors of readVectorsAsync(readBuffersAsync(sources))) {
-        yield vectors;
+    for await (let recordBatch of readRecordBatchesAsync(readBuffersAsync(sources))) {
+        yield recordBatch;
     }
 }
diff --git a/js/src/ipc/reader/binary.ts b/js/src/ipc/reader/binary.ts
new file mode 100644
index 0000000000000..6e3c7fc5cf080
--- /dev/null
+++ b/js/src/ipc/reader/binary.ts
@@ -0,0 +1,449 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import { Vector } from '../../vector';
+import { flatbuffers } from 'flatbuffers';
+import { TypeDataLoader } from './vector';
+import { Message, Footer, FileBlock, RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata, } from '../metadata';
+import {
+    Schema, Field,
+    DataType, Dictionary,
+    Null, TimeBitWidth,
+    Binary, Bool, Utf8, Decimal,
+    Date_, Time, Timestamp, Interval,
+    List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_,
+} from '../../type';
+
+import {
+    Int8, Uint8,
+    Int16, Uint16,
+    Int32, Uint32,
+    Int64, Uint64,
+    Float16, Float64, Float32,
+} from '../../type';
+
+import ByteBuffer = flatbuffers.ByteBuffer;
+
+type MessageReader = (bb: ByteBuffer) => IterableIterator;
+
+/**
+ * Reads Arrow messages from one or more binary sources.
+ *
+ * A lone typed array or string is wrapped in an array. The schema and the
+ * matching message reader are resolved once, from the first source, via
+ * `readSchema` (which throws 'Invalid Arrow buffer' when it finds neither a
+ * file footer nor a stream schema). Every source, including those after the
+ * first, is then read with that message reader. Each message is yielded with
+ * the schema and a `BinaryDataLoader` over that source's ByteBuffer; all
+ * loaders share one `dictionaries` map.
+ */
+export function* readBuffers(sources: Iterable | Uint8Array | Buffer | string) {
+    let schema: Schema | null = null;
+    let dictionaries = new Map();
+    let readMessages: MessageReader | null = null;
+    if (ArrayBuffer.isView(sources) || typeof sources === 'string') {
+        sources = [sources as T];
+    }
+    for (const source of sources) {
+        const bb = toByteBuffer(source);
+        // Resolve the schema only once. The schema test used to be part of the
+        // loop condition (`!schema && ...`), which turned false as soon as the
+        // schema was known and silently skipped every source after the first.
+        if (!schema) {
+            ({ schema, readMessages } = readSchema(bb));
+        }
+        if (schema && readMessages) {
+            for (const message of readMessages(bb)) {
+                yield {
+                    schema, message,
+                    loader: new BinaryDataLoader(
+                        bb,
+                        arrayIterator(message.nodes),
+                        arrayIterator(message.buffers),
+                        dictionaries
+                    )
+                };
+            }
+        }
+    }
+}
+
+/**
+ * Async counterpart of `readBuffers`: consumes an async iterable of binary
+ * sources and yields the same `{ schema, message, loader }` records. The
+ * schema is resolved from the first source and reused for all later ones.
+ */
+export async function* readBuffersAsync(sources: AsyncIterable) {
+    let schema: Schema | null = null;
+    let dictionaries = new Map();
+    let readMessages: MessageReader | null = null;
+    for await (const source of sources) {
+        const bb = toByteBuffer(source);
+        // See readBuffers: the schema lookup must not gate later sources.
+        if (!schema) {
+            ({ schema, readMessages } = readSchema(bb));
+        }
+        if (schema && readMessages) {
+            for (const message of readMessages(bb)) {
+                yield {
+                    schema, message,
+                    loader: new BinaryDataLoader(
+                        bb,
+                        arrayIterator(message.nodes),
+                        arrayIterator(message.buffers),
+                        dictionaries
+                    )
+                };
+            }
+        }
+    }
+}
+
+/**
+ * TypeDataLoader that serves buffer contents straight out of a ByteBuffer.
+ * The ByteBuffer's bytes and its position at construction time are captured;
+ * every buffer offset is resolved relative to that captured position.
+ */
+export class BinaryDataLoader extends TypeDataLoader {
+    private bytes: Uint8Array;
+    private messageOffset: number;
+    constructor(bb: ByteBuffer, nodes: Iterator, buffers: Iterator, dictionaries: Map) {
+        super(nodes, buffers, dictionaries);
+        this.bytes = bb.bytes();
+        this.messageOffset = bb.position();
+    }
+    protected readOffsets(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); }
+    protected readTypeIds(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); }
+    /**
+     * Returns a Uint8Array view (same underlying ArrayBuffer) of `length`
+     * bytes at `offset` past the captured message position. When no buffer
+     * metadata is passed it is taken from `this.getBufferMetadata()`.
+     */
+    protected readData(_type: T, { length, offset }: BufferMetadata = this.getBufferMetadata()) {
+        return new Uint8Array(this.bytes.buffer, this.bytes.byteOffset + this.messageOffset + offset, length);
+    }
+}
+
+/** Adapts an array to a generator so it can be consumed as an Iterator. */
+function* arrayIterator(arr: Array) { yield* arr; }
+
+/**
+ * Wraps `bytes` in a ByteBuffer. A string is converted one UTF-16 code unit
+ * per byte via `charCodeAt`; a missing value becomes an empty buffer.
+ */
+function toByteBuffer(bytes?: Uint8Array | Buffer | string) {
+    let arr: Uint8Array = bytes as any || new Uint8Array(0);
+    if (typeof bytes === 'string') {
+        arr = new Uint8Array(bytes.length);
+        for (let i = -1, n = bytes.length; ++i < n;) {
+            arr[i] = bytes.charCodeAt(i);
+        }
+    }
+    return new ByteBuffer(arr);
+}
+
+/**
+ * Resolves the schema and the message reader for a buffer: the file format is
+ * tried first (footer), then the stream format.
+ * Throws Error('Invalid Arrow buffer') when neither yields a schema.
+ */
+function readSchema(bb: ByteBuffer) {
+    let schema: Schema, readMessages, footer: Footer | null;
+    if (footer = readFileSchema(bb)) {
+        schema = footer.schema;
+        readMessages = readFileMessages(footer);
+    } else if (schema = readStreamSchema(bb)!) {
+        readMessages = readStreamMessages;
+    } else {
+        throw new Error('Invalid Arrow buffer');
+    }
+    return { schema, readMessages };
+}
+
+const PADDING = 4;
+const MAGIC_STR = 'ARROW1';
+const MAGIC = new Uint8Array(MAGIC_STR.length);
+for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) {
+    MAGIC[i] = MAGIC_STR.charCodeAt(i);
+}
+
+/** True when the bytes of `buffer` starting at `index` equal the MAGIC bytes. */
+function checkForMagicArrowString(buffer: Uint8Array, index = 0) {
+    for (let i = -1, n = MAGIC.length; ++i < n;) {
+        if (MAGIC[i] !== buffer[index + i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+const magicLength = MAGIC.length;
+const magicAndPadding = magicLength + PADDING;
+const magicX2AndPadding = magicLength * 2 + PADDING;
+
+/**
+ * Stream format: when the buffer does not start with the magic string, returns
+ * the first Schema message found; otherwise (or if none is found) null.
+ */
+function readStreamSchema(bb: ByteBuffer) {
+    if (!checkForMagicArrowString(bb.bytes(), 0)) {
+        for (const message of readMessages(bb)) {
+            if (Message.isSchema(message)) {
+                return message as Schema;
+            }
+        }
+    }
+    return null;
+}
+
+/**
+ * Stream format: yields record batch and dictionary batch messages, skipping
+ * any other kind, and moves the buffer past each yielded message's body.
+ */
+function* readStreamMessages(bb: ByteBuffer) {
+    for (const message of readMessages(bb)) {
+        if (Message.isRecordBatch(message)) {
+            yield message;
+        } else if (Message.isDictionaryBatch(message)) {
+            yield message;
+        } else {
+            continue;
+        }
+        // position the buffer after the body to read the next message
+        bb.setPosition(bb.position() + message.bodyLength);
+    }
+}
+
+/**
+ * File format: validates the leading and trailing magic strings and the footer
+ * length, then parses and returns the Footer. Returns null when the buffer is
+ * not a well-formed Arrow file.
+ */
+function readFileSchema(bb: ByteBuffer) {
+    let fileLength = bb.capacity(), footerLength: number, footerOffset: number;
+    if ((fileLength < magicX2AndPadding /*                     Arrow buffer too small */) ||
+        (!checkForMagicArrowString(bb.bytes(), 0) /*                        Missing magic start    */) ||
+        (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end      */) ||
+        (/*                                                    Invalid footer length  */
+        // Either test alone marks the footer invalid. They were joined with
+        // `&&`, which could never hold once the size check above had passed.
+        (footerLength = bb.readInt32(footerOffset = fileLength - magicAndPadding)) < 1 ||
+        (footerLength + magicX2AndPadding > fileLength))) {
+        return null;
+    }
+    bb.setPosition(footerOffset - footerLength);
+    return footerFromByteBuffer(bb);
+}
+
+/**
+ * File format: builds a message reader that seeks to each block listed in the
+ * footer, yielding all dictionary batches first and then all record batches.
+ */
+function readFileMessages(footer: Footer) {
+    return function* (bb: ByteBuffer) {
+        for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) {
+            bb.setPosition(batches[i].offset.low);
+            yield readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch;
+        }
+        for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) {
+            bb.setPosition(batches[i].offset.low);
+            yield readMessage(bb, bb.readInt32(bb.position())) as RecordBatchMetadata;
+        }
+    };
+}
+
+/**
+ * Yields consecutive messages while the buffer has bytes left and the int32
+ * length prefix at the current position is positive. Messages that
+ * `readMessage` cannot decode (null) are skipped.
+ */
+function* readMessages(bb: ByteBuffer) {
+    let length: number, message: Schema | RecordBatchMetadata | DictionaryBatch;
+    while (bb.position() < bb.capacity() &&
+          (length = bb.readInt32(bb.position())) > 0) {
+        if (message = readMessage(bb, length)!) {
+            yield message;
+        }
+    }
+}
+
+/**
+ * Decodes one message: skips the PADDING-byte prefix, parses the message at
+ * that position, then advances the buffer by `length`.
+ */
+function readMessage(bb: ByteBuffer, length: number) {
+    bb.setPosition(bb.position() + PADDING);
+    const message = messageFromByteBuffer(bb);
+    bb.setPosition(bb.position() + length);
+    return message;
+}
+
+import * as File_ from '../../fb/File';
+import * as Schema_ from '../../fb/Schema';
+import * as Message_ from '../../fb/Message';
+
+import Type = Schema_.org.apache.arrow.flatbuf.Type;
+import Precision = Schema_.org.apache.arrow.flatbuf.Precision;
+import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader;
+import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion;
+import _Footer = File_.org.apache.arrow.flatbuf.Footer;
+import _Block = File_.org.apache.arrow.flatbuf.Block;
+import _Message = Message_.org.apache.arrow.flatbuf.Message;
+import _Schema = Schema_.org.apache.arrow.flatbuf.Schema;
+import _Field = Schema_.org.apache.arrow.flatbuf.Field;
+import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch;
+import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch;
+import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode;
+import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer;
+import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding;
+import _Null =
Schema_.org.apache.arrow.flatbuf.Null;
+import _Int = Schema_.org.apache.arrow.flatbuf.Int;
+import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint;
+import _Binary = Schema_.org.apache.arrow.flatbuf.Binary;
+import _Bool = Schema_.org.apache.arrow.flatbuf.Bool;
+import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8;
+import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal;
+import _Date = Schema_.org.apache.arrow.flatbuf.Date;
+import _Time = Schema_.org.apache.arrow.flatbuf.Time;
+import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp;
+import _Interval = Schema_.org.apache.arrow.flatbuf.Interval;
+import _List = Schema_.org.apache.arrow.flatbuf.List;
+import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_;
+import _Union = Schema_.org.apache.arrow.flatbuf.Union;
+import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary;
+import _FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList;
+import _Map = Schema_.org.apache.arrow.flatbuf.Map;
+
+/**
+ * Decodes the flatbuffers Footer at the buffer's current position into a
+ * Footer holding the dictionary blocks, record batch blocks and the Schema.
+ */
+function footerFromByteBuffer(bb: ByteBuffer) {
+    const dictionaryFields = new Map>();
+    const f = _Footer.getRootAsFooter(bb), s = f.schema()!;
+    return new Footer(
+        dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f),
+        new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), f.version(), dictionaryFields)
+    );
+}
+
+/**
+ * Decodes the flatbuffers Message at the buffer's current position into a
+ * Schema, RecordBatchMetadata or DictionaryBatch according to its header
+ * type. Returns null for any other header type.
+ */
+function messageFromByteBuffer(bb: ByteBuffer) {
+    const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version();
+    switch (type) {
+        case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!, new Map());
+        case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m.header(new _RecordBatch())!);
+        case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m.header(new _DictionaryBatch())!);
+    }
+    return null;
+    // throw new Error(`Unrecognized Message type '${type}'`);
+}
+
+/** Builds a Schema from a flatbuffers schema, recording dictionary fields in `dictionaryFields`. */
+function schemaFromMessage(version: MetadataVersion, s: _Schema, dictionaryFields: Map>) {
+    return new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), version, dictionaryFields);
+}
+
+/** Builds RecordBatchMetadata (length, field nodes, buffers) from a flatbuffers record batch. */
+function recordBatchFromMessage(version: MetadataVersion, b: _RecordBatch) {
+    return new RecordBatchMetadata(version, b.length(), fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version));
+}
+
+/** Builds a DictionaryBatch from a flatbuffers dictionary batch and its nested record batch. */
+function dictionaryBatchFromMessage(version: MetadataVersion, d: _DictionaryBatch) {
+    return new DictionaryBatch(version, recordBatchFromMessage(version, d.data()!), d.id(), d.isDelta());
+}
+
+/** Collects a FileBlock for every non-null dictionary block listed in the footer. */
+function dictionaryBatchesFromFooter(f: _Footer) {
+    const blocks = [] as FileBlock[];
+    for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) {
+        if (b = f.dictionaries(i)!) {
+            blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset()));
+        }
+    }
+    return blocks;
+}
+
+/** Collects a FileBlock for every non-null record batch block listed in the footer. */
+function recordBatchesFromFooter(f: _Footer) {
+    const blocks = [] as FileBlock[];
+    for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) {
+        if (b = f.recordBatches(i)!) {
+            blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset()));
+        }
+    }
+    return blocks;
+}
+
+/** Converts the schema's top-level fields with `field`, dropping any that convert to a falsy value. */
+function fieldsFromSchema(s: _Schema, dictionaryFields: Map> | null) {
+    const fields = [] as Field[];
+    for (let i = -1, c: Field | null, n = s && s.fieldsLength(); ++i < n;) {
+        if (c = field(s.fields(i)!, dictionaryFields)) {
+            fields.push(c);
+        }
+    }
+    return fields;
+}
+
+/** Converts a field's children with `field`, dropping any that convert to a falsy value. */
+function fieldsFromField(f: _Field, dictionaryFields: Map> | null) {
+    const fields = [] as Field[];
+    for (let i = -1, c: Field | null, n = f && f.childrenLength(); ++i < n;) {
+        if (c = field(f.children(i)!, dictionaryFields)) {
+            fields.push(c);
+        }
+    }
+    return fields;
+}
+
+/** Converts every field node of the record batch via `fieldNodeFromRecordBatch`. */
+function fieldNodesFromRecordBatch(b: _RecordBatch) {
+    const fieldNodes = [] as FieldMetadata[];
+    for (let i = -1, n = b.nodesLength(); ++i < n;) {
+        fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!));
+    }
+    return fieldNodes;
+}
+
+/**
+ * Converts every buffer descriptor of the record batch via
+ * `bufferFromRecordBatch`, adjusting the read position first for metadata
+ * versions older than V4.
+ */
+function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) {
+    const buffers = [] as BufferMetadata[];
+    for (let i = -1, n = b.buffersLength(); ++i < n;) {
+        let buffer = b.buffers(i)!;
+        // If this Arrow buffer was written before version 4,
+        // advance the buffer's bb_pos 8 bytes to skip past
+        // the now-removed page id field.
+        if (version < MetadataVersion.V4) {
+            buffer.bb_pos += (8 * (i + 1));
+        }
+        buffers.push(bufferFromRecordBatch(buffer));
+    }
+    return buffers;
+}
+
+function field(f: _Field, dictionaryFields: Map> | null) {
+    let name = f.name()!;
+    let field: Field | void;
+    let nullable = f.nullable();
+    let metadata = customMetadata(f);
+    let dataType: DataType | null;
+    let keysMeta: _Int | null, id: number;
+    let dictMeta: _DictionaryEncoding | null;
+    if (!dictionaryFields || !(dictMeta = f.dictionary())) {
+        if (dataType = typeFromField(f, fieldsFromField(f, dictionaryFields))) {
+            field = new Field(name, dataType, nullable, metadata);
+        }
+    } else if (dataType = dictionaryFields.has(id = dictMeta.id().low)
+                          ?
dictionaryFields.get(id)!.type.dictionary + : typeFromField(f, fieldsFromField(f, null))) { + dataType = new Dictionary(dataType, + // a dictionary index defaults to signed 32 bit int if unspecified + (keysMeta = dictMeta.indexType()) ? intFromField(keysMeta)! : new Int32(), + id, dictMeta.isOrdered() + ); + field = new Field(name, dataType, nullable, metadata); + dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); + } + return field || null; +} + +function customMetadata(parent?: _Schema | _Field | null) { + const data = new Map(); + if (parent) { + for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { + if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { + data.set(key, entry.value()!); + } + } + } + return data; +} + +function fieldNodeFromRecordBatch(f: _FieldNode) { + return new FieldMetadata(f.length(), f.nullCount()); +} + +function bufferFromRecordBatch(b: _Buffer) { + return new BufferMetadata(b.offset(), b.length()); +} + +function typeFromField(f: _Field, children?: Field[]): DataType | null { + switch (f.typeType()) { + case Type.NONE: return null; + case Type.Null: return nullFromField(f.type(new _Null())!); + case Type.Int: return intFromField(f.type(new _Int())!); + case Type.FloatingPoint: return floatFromField(f.type(new _FloatingPoint())!); + case Type.Binary: return binaryFromField(f.type(new _Binary())!); + case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); + case Type.Bool: return boolFromField(f.type(new _Bool())!); + case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); + case Type.Date: return dateFromField(f.type(new _Date())!); + case Type.Time: return timeFromField(f.type(new _Time())!); + case Type.Timestamp: return timestampFromField(f.type(new _Timestamp())!); + case Type.Interval: return intervalFromField(f.type(new _Interval())!); + case Type.List: return listFromField(f.type(new _List())!, children || []); + case Type.Struct_: return 
structFromField(f.type(new _Struct())!, children || []); + case Type.Union: return unionFromField(f.type(new _Union())!, children || []); + case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); + case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!, children || []); + case Type.Map: return mapFromField(f.type(new _Map())!, children || []); + } + throw new Error(`Unrecognized type ${f.typeType()}`); +} + +function nullFromField (_type: _Null) { return new Null(); } +function intFromField (_type: _Int) { switch (_type.bitWidth()) { + case 8: return _type.isSigned() ? new Int8() : new Uint8(); + case 16: return _type.isSigned() ? new Int16() : new Uint16(); + case 32: return _type.isSigned() ? new Int32() : new Uint32(); + case 64: return _type.isSigned() ? new Int64() : new Uint64(); + } + return null; } +function floatFromField (_type: _FloatingPoint) { switch (_type.precision()) { + case Precision.HALF: return new Float16(); + case Precision.SINGLE: return new Float32(); + case Precision.DOUBLE: return new Float64(); + } + return null; } +function binaryFromField (_type: _Binary) { return new Binary(); } +function utf8FromField (_type: _Utf8) { return new Utf8(); } +function boolFromField (_type: _Bool) { return new Bool(); } +function decimalFromField (_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } +function dateFromField (_type: _Date) { return new Date_(_type.unit()); } +function timeFromField (_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } +function timestampFromField (_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } +function intervalFromField (_type: _Interval) { return new Interval(_type.unit()); } +function listFromField (_type: _List, children: Field[]) { return new List(children); } +function structFromField (_type: _Struct, children: Field[]) { return new Struct(children); } +function 
unionFromField (_type: _Union, children: Field[]) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[], children); } +function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } +function fixedSizeListFromField (_type: _FixedSizeList, children: Field[]) { return new FixedSizeList(_type.listSize(), children); } +function mapFromField (_type: _Map, children: Field[]) { return new Map_(_type.keysSorted(), children); } diff --git a/js/src/ipc/reader/json.ts b/js/src/ipc/reader/json.ts new file mode 100644 index 0000000000000..10819986f6d33 --- /dev/null +++ b/js/src/ipc/reader/json.ts @@ -0,0 +1,323 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from '../../vector'; +import { flatbuffers } from 'flatbuffers'; +import { TypeDataLoader } from './vector'; +import { packBools } from '../../util/bit'; +import * as IntUtil from '../../util/int'; +import { TextEncoder } from 'text-encoding-utf-8'; +import { RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata } from '../metadata'; +import { + Schema, Field, + DataType, Dictionary, + Null, TimeBitWidth, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, +} from '../../type'; + +import { + Int8, Uint8, + Int16, Uint16, + Int32, Uint32, + Int64, Uint64, + Float16, Float64, Float32, +} from '../../type'; + +import Long = flatbuffers.Long; + +export function* readJSON(json: any) { + const schema = schemaFromJSON(json['schema']); + const dictionaries = new Map(); + for (const batch of (json['dictionaries'] || [])) { + const message = dictionaryBatchFromJSON(batch); + yield { + schema, message, + loader: new JSONDataLoader( + flattenDataSources(batch['data']['columns']), + arrayIterator(message.nodes), + arrayIterator(message.buffers), + dictionaries + ) + }; + } + for (const batch of (json['batches'] || [])) { + const message = recordBatchFromJSON(batch); + yield { + schema, message, + loader: new JSONDataLoader( + flattenDataSources(batch['columns']), + arrayIterator(message.nodes), + arrayIterator(message.buffers), + dictionaries + ) + }; + } +} + +function* arrayIterator(arr: Array) { yield* arr; } +function flattenDataSources(xs: any[]): any[][] { + return (xs || []).reduce((buffers, column: any) => [ + ...buffers, + ...(column['VALIDITY'] && [column['VALIDITY']] || []), + ...(column['OFFSET'] && [column['OFFSET']] || []), + ...(column['DATA'] && [column['DATA']] || []), + ...flattenDataSources(column['children']) + ], [] as any[][]); +} + +const utf8Encoder = new TextEncoder('utf-8'); + +export class JSONDataLoader extends TypeDataLoader { + 
constructor(private sources: any[][], nodes: Iterator, buffers: Iterator, dictionaries: Map) { + super(nodes, buffers, dictionaries); + } + protected readNullBitmap(_type: T, nullCount: number, { offset } = this.getBufferMetadata()) { + return nullCount <= 0 ? new Uint8Array(0) : packBools(this.sources[offset]); + } + protected readOffsets(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { + return new Int32Array(this.sources[offset]); + } + protected readTypeIds(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { + return new Int8Array(this.sources[offset]); + } + protected readData(type: T, { offset }: BufferMetadata = this.getBufferMetadata()) { + const { sources } = this; + if (DataType.isTimestamp(type) === true) { + return new Uint8Array(int64DataFromJSON(sources[offset] as string[])); + } else if ((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) { + return new Uint8Array(int64DataFromJSON(sources[offset] as string[])); + } else if (DataType.isDate(type) && type.unit === DateUnit.MILLISECOND) { + return new Uint8Array(int64DataFromJSON(sources[offset] as string[])); + } else if (DataType.isDecimal(type) === true) { + return new Uint8Array(decimalDataFromJSON(sources[offset] as string[])); + } else if (DataType.isBinary(type) === true) { + return new Uint8Array(binaryDataFromJSON(sources[offset] as string[])); + } else if (DataType.isBool(type) === true) { + return new Uint8Array(packBools(sources[offset] as number[]).buffer); + } else if (DataType.isUtf8(type) === true) { + return utf8Encoder.encode((sources[offset] as string[]).join('')); + } else { + return toTypedArray(type.ArrayType, sources[offset].map((x) => +x)) as any; + } + } +} + +function int64DataFromJSON(values: string[]) { + const data = new Uint32Array(values.length * 2); + for (let i = -1, n = values.length; ++i < n;) { + // Force all values (even numbers) to be parsed as strings since + // pulling out high and low bits seems to lose 
precision sometimes + // For example: + // > -4613034156400212000 >>> 0 + // 721782784 + // The correct lower 32-bits are 721782752 + IntUtil.Int64.fromString(values[i].toString(), new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2)); + } + return data.buffer; +} + +function decimalDataFromJSON(values: string[]) { + const data = new Uint32Array(values.length * 4); + for (let i = -1, n = values.length; ++i < n;) { + IntUtil.Int128.fromString(values[i], new Uint32Array(data.buffer, data.byteOffset + 4 * 4 * i, 4)); + } + return data.buffer; +} + +function binaryDataFromJSON(values: string[]) { + // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] + // There are definitely more efficient ways to do this... but it gets the + // job done. + const joined = values.join(''); + const data = new Uint8Array(joined.length / 2); + for (let i = 0; i < joined.length; i += 2) { + data[i >> 1] = parseInt(joined.substr(i, 2), 16); + } + return data.buffer; +} + +import * as Schema_ from '../../fb/Schema'; +import Type = Schema_.org.apache.arrow.flatbuf.Type; +import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; +import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; +import { toTypedArray } from '../../data'; + +function schemaFromJSON(s: any): Schema { + const dictionaryFields = new Map>(); + return new Schema( + fieldsFromJSON(s['fields'], dictionaryFields), + customMetadata(s['customMetadata']), + MetadataVersion.V4, dictionaryFields + ); +} + +function recordBatchFromJSON(b: any): RecordBatchMetadata { + return new RecordBatchMetadata( + MetadataVersion.V4, + b['count'], + fieldNodesFromJSON(b['columns']), + buffersFromJSON(b['columns']) + ); +} + +function dictionaryBatchFromJSON(b: any): DictionaryBatch { + return new DictionaryBatch( + 
MetadataVersion.V4, + recordBatchFromJSON(b['data']), + b['id'], b['isDelta'] + ); +} + +function fieldsFromJSON(fs: any[], dictionaryFields: Map> | null): Field[] { + return (fs || []) + .map((f) => fieldFromJSON(f, dictionaryFields)) + .filter((f) => f != null) as Field[]; +} + +function fieldNodesFromJSON(xs: any[]): FieldMetadata[] { + return (xs || []).reduce((fieldNodes, column: any) => [ + ...fieldNodes, + new FieldMetadata( + new Long(column['count'], 0), + new Long(nullCountFromJSON(column['VALIDITY']), 0) + ), + ...fieldNodesFromJSON(column['children']) + ], [] as FieldMetadata[]); +} + +function buffersFromJSON(xs: any[], buffers: BufferMetadata[] = []): BufferMetadata[] { + for (let i = -1, n = (xs || []).length; ++i < n;) { + const column = xs[i]; + column['VALIDITY'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0))); + column['OFFSET'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0))); + column['DATA'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['DATA'].length, 0))); + buffers = buffersFromJSON(column['children'], buffers); + } + return buffers; +} + +function nullCountFromJSON(validity: number[]) { + return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); +} + +function fieldFromJSON(f: any, dictionaryFields: Map> | null) { + let name = f['name']; + let field: Field | void; + let nullable = f['nullable']; + let dataType: DataType | null; + let id: number, keysMeta: any, dictMeta: any; + let metadata = customMetadata(f['customMetadata']); + if (!dictionaryFields || !(dictMeta = f['dictionary'])) { + if (dataType = typeFromJSON(f['type'], fieldsFromJSON(f['children'], dictionaryFields))) { + field = new Field(name, dataType, nullable, metadata); + } + } else if (dataType = dictionaryFields.has(id = dictMeta['id']) + ? 
dictionaryFields.get(id)!.type.dictionary + : typeFromJSON(f['type'], fieldsFromJSON(f['children'], null))) { + dataType = new Dictionary(dataType, + // a dictionary index defaults to signed 32 bit int if unspecified + (keysMeta = dictMeta['indexType']) ? intFromJSON(keysMeta)! : new Int32(), + id, dictMeta['isOrdered'] + ); + field = new Field(name, dataType, nullable, metadata); + dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); + } + return field || null; +} + +function customMetadata(metadata?: any) { + return new Map(Object.entries(metadata || {})); +} + +const namesToTypeMap: { [n: string]: Type } = { + 'NONE': Type.NONE, + 'null': Type.Null, + 'int': Type.Int, + 'floatingpoint': Type.FloatingPoint, + 'binary': Type.Binary, + 'bool': Type.Bool, + 'utf8': Type.Utf8, + 'decimal': Type.Decimal, + 'date': Type.Date, + 'time': Type.Time, + 'timestamp': Type.Timestamp, + 'interval': Type.Interval, + 'list': Type.List, + 'struct': Type.Struct_, + 'union': Type.Union, + 'fixedsizebinary': Type.FixedSizeBinary, + 'fixedsizelist': Type.FixedSizeList, + 'map': Type.Map, +}; + +function typeFromJSON(t: any, children?: Field[]) { + switch (namesToTypeMap[t['name']]) { + case Type.NONE: return null; + case Type.Null: return nullFromJSON(t); + case Type.Int: return intFromJSON(t); + case Type.FloatingPoint: return floatingPointFromJSON(t); + case Type.Binary: return binaryFromJSON(t); + case Type.Utf8: return utf8FromJSON(t); + case Type.Bool: return boolFromJSON(t); + case Type.Decimal: return decimalFromJSON(t); + case Type.Date: return dateFromJSON(t); + case Type.Time: return timeFromJSON(t); + case Type.Timestamp: return timestampFromJSON(t); + case Type.Interval: return intervalFromJSON(t); + case Type.List: return listFromJSON(t, children || []); + case Type.Struct_: return structFromJSON(t, children || []); + case Type.Union: return unionFromJSON(t, children || []); + case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); + case 
Type.FixedSizeList: return fixedSizeListFromJSON(t, children || []); + case Type.Map: return mapFromJSON(t, children || []); + } + throw new Error(`Unrecognized type ${t['name']}`); +} + +function nullFromJSON (_type: any) { return new Null(); } +function intFromJSON (_type: any) { switch (_type['bitWidth']) { + case 8: return _type['isSigned'] ? new Int8() : new Uint8(); + case 16: return _type['isSigned'] ? new Int16() : new Uint16(); + case 32: return _type['isSigned'] ? new Int32() : new Uint32(); + case 64: return _type['isSigned'] ? new Int64() : new Uint64(); + } + return null; } +function floatingPointFromJSON (_type: any) { switch (Precision[_type['precision']] as any) { + case Precision.HALF: return new Float16(); + case Precision.SINGLE: return new Float32(); + case Precision.DOUBLE: return new Float64(); + } + return null; } +function binaryFromJSON (_type: any) { return new Binary(); } +function utf8FromJSON (_type: any) { return new Utf8(); } +function boolFromJSON (_type: any) { return new Bool(); } +function decimalFromJSON (_type: any) { return new Decimal(_type['scale'], _type['precision']); } +function dateFromJSON (_type: any) { return new Date_(DateUnit[_type['unit']] as any); } +function timeFromJSON (_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } +function timestampFromJSON (_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } +function intervalFromJSON (_type: any) { return new Interval(IntervalUnit[_type['unit']] as any); } +function listFromJSON (_type: any, children: Field[]) { return new List(children); } +function structFromJSON (_type: any, children: Field[]) { return new Struct(children); } +function unionFromJSON (_type: any, children: Field[]) { return new Union(_type['mode'], (_type['typeIdsArray'] || []) as Type[], children); } +function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } +function 
fixedSizeListFromJSON (_type: any, children: Field[]) { return new FixedSizeList(_type['listSize'], children); } +function mapFromJSON (_type: any, children: Field[]) { return new Map_(_type['keysSorted'], children); } diff --git a/js/src/ipc/reader/vector.ts b/js/src/ipc/reader/vector.ts new file mode 100644 index 0000000000000..809069c6d9864 --- /dev/null +++ b/js/src/ipc/reader/vector.ts @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { Vector } from '../../vector'; +import { RecordBatch } from '../../recordbatch'; +import { TypeVisitor } from '../../visitor'; +import { FlatType, NestedType, ListType } from '../../type'; +import { Message, FieldMetadata, BufferMetadata } from '../metadata'; +import { FlatData, ListData, NestedData, SingleNestedData, DenseUnionData, SparseUnionData, BoolData, FlatListData, DictionaryData } from '../../data'; +import { + Schema, Field, + Dictionary, + Null, Int, Float, + Binary, Bool, Utf8, Decimal, + Date_, Time, Timestamp, Interval, + List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, + UnionMode, SparseUnion, DenseUnion, FlatListType, DataType, +} from '../../type'; + +export function* readRecordBatches(messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { + for (const { schema, message, loader } of messages) { + yield* readRecordBatch(schema, message, loader); + } +} + +export async function* readRecordBatchesAsync(messages: AsyncIterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) { + for await (const { schema, message, loader } of messages) { + yield* readRecordBatch(schema, message, loader); + } +} + +export function* readRecordBatch(schema: Schema, message: Message, loader: TypeDataLoader) { + if (Message.isRecordBatch(message)) { + yield new RecordBatch(schema, message.length, loader.visitFields(schema.fields)); + } else if (Message.isDictionaryBatch(message)) { + const dictionaryId = message.id; + const dictionaries = loader.dictionaries; + const dictionaryField = schema.dictionaries.get(dictionaryId)!; + const dictionaryDataType = (dictionaryField.type as Dictionary).dictionary; + let dictionaryVector = Vector.create(loader.visit(dictionaryDataType)); + if (message.isDelta && dictionaries.has(dictionaryId)) { + dictionaryVector = dictionaries.get(dictionaryId)!.concat(dictionaryVector); + } + dictionaries.set(dictionaryId, dictionaryVector); + } +} + +export abstract class TypeDataLoader 
extends TypeVisitor { + + public dictionaries: Map; + protected nodes: Iterator; + protected buffers: Iterator; + + constructor(nodes: Iterator, buffers: Iterator, dictionaries: Map) { + super(); + this.nodes = nodes; + this.buffers = buffers; + this.dictionaries = dictionaries; + } + + public visitFields(fields: Field[]) { return fields.map((field) => this.visit(field.type)); } + + public visitNull (type: Null) { return this.visitNullType(type); } + public visitInt (type: Int) { return this.visitFlatType(type); } + public visitFloat (type: Float) { return this.visitFlatType(type); } + public visitBinary (type: Binary) { return this.visitFlatList(type); } + public visitUtf8 (type: Utf8) { return this.visitFlatList(type); } + public visitBool (type: Bool) { return this.visitBoolType(type); } + public visitDecimal (type: Decimal) { return this.visitFlatType(type); } + public visitDate (type: Date_) { return this.visitFlatType(type); } + public visitTime (type: Time) { return this.visitFlatType(type); } + public visitTimestamp (type: Timestamp) { return this.visitFlatType(type); } + public visitInterval (type: Interval) { return this.visitFlatType(type); } + public visitList (type: List) { return this.visitListType(type); } + public visitStruct (type: Struct) { return this.visitNestedType(type); } + public visitUnion (type: Union) { return this.visitUnionType(type); } + public visitFixedSizeBinary(type: FixedSizeBinary) { return this.visitFlatType(type); } + public visitFixedSizeList (type: FixedSizeList) { return this.visitFixedSizeListType(type); } + public visitMap (type: Map_) { return this.visitNestedType(type); } + public visitDictionary (type: Dictionary) { + return new DictionaryData(type, this.dictionaries.get(type.id)!, this.visit(type.indicies)); + } + protected getFieldMetadata() { return this.nodes.next().value; } + protected getBufferMetadata() { return this.buffers.next().value; } + protected readNullBitmap(type: T, nullCount: number, buffer = 
this.getBufferMetadata()) { + return nullCount > 0 && this.readData(type, buffer) || new Uint8Array(0); + } + protected abstract readData(type: T, buffer?: BufferMetadata): any; + protected abstract readOffsets(type: T, buffer?: BufferMetadata): any; + protected abstract readTypeIds(type: T, buffer?: BufferMetadata): any; + protected visitNullType(type: Null, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new FlatData(type, length, this.readNullBitmap(type, nullCount), new Uint8Array(0), 0, nullCount); + } + protected visitFlatType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new FlatData(type, length, this.readNullBitmap(type, nullCount), this.readData(type), 0, nullCount); + } + protected visitBoolType(type: Bool, { length, nullCount }: FieldMetadata = this.getFieldMetadata(), data?: Uint8Array) { + return new BoolData(type, length, this.readNullBitmap(type, nullCount), data || this.readData(type), 0, nullCount); + } + protected visitFlatList(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new FlatListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readData(type), 0, nullCount); + } + protected visitListType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new ListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.visit(type.children![0].type), 0, nullCount); + } + protected visitFixedSizeListType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new SingleNestedData(type, length, this.readNullBitmap(type, nullCount), this.visit(type.children![0].type), 0, nullCount); + } + protected visitNestedType(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return new NestedData(type, length, this.readNullBitmap(type, nullCount), this.visitFields(type.children), 0, nullCount); + } + protected 
visitUnionType(type: DenseUnion | SparseUnion, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) { + return type.mode === UnionMode.Sparse ? + new SparseUnionData(type as SparseUnion, length, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount) : + new DenseUnionData(type as DenseUnion, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount); + } +} diff --git a/js/src/predicate.ts b/js/src/predicate.ts new file mode 100644 index 0000000000000..9d55274bd880b --- /dev/null +++ b/js/src/predicate.ts @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { RecordBatch } from './recordbatch'; +import { Vector, DictionaryVector } from './vector'; + +export type ValueFunc = (idx: number, cols: RecordBatch) => T | null; +export type PredicateFunc = (idx: number, cols: RecordBatch) => boolean; + +export abstract class Value { + eq(other: Value | T): Predicate { + if (!(other instanceof Value)) { other = new Literal(other); } + return new Equals(this, other); + } + lteq(other: Value | T): Predicate { + if (!(other instanceof Value)) { other = new Literal(other); } + return new LTeq(this, other); + } + gteq(other: Value | T): Predicate { + if (!(other instanceof Value)) { other = new Literal(other); } + return new GTeq(this, other); + } +} + +export class Literal extends Value { + constructor(public v: T) { super(); } +} + +export class Col extends Value { + // @ts-ignore + public vector: Vector; + // @ts-ignore + public colidx: number; + + constructor(public name: string) { super(); } + bind(batch: RecordBatch) { + if (!this.colidx) { + // Assume column index doesn't change between calls to bind + //this.colidx = cols.findIndex(v => v.name.indexOf(this.name) != -1); + this.colidx = -1; + const fields = batch.schema.fields; + for (let idx = -1; ++idx < fields.length;) { + if (fields[idx].name === this.name) { + this.colidx = idx; + break; + } + } + if (this.colidx < 0) { throw new Error(`Failed to bind Col "${this.name}"`); } + } + this.vector = batch.getChildAt(this.colidx)!; + return this.vector.get.bind(this.vector); + } +} + +export abstract class Predicate { + abstract bind(batch: RecordBatch): PredicateFunc; + and(expr: Predicate): Predicate { return new And(this, expr); } + or(expr: Predicate): Predicate { return new Or(this, expr); } + ands(): Predicate[] { return [this]; } +} + +export abstract class ComparisonPredicate extends Predicate { + constructor(public readonly left: Value, public readonly right: Value) { + super(); + } + + bind(batch: RecordBatch) { + if (this.left instanceof Literal) { + if 
(this.right instanceof Literal) { + return this._bindLitLit(batch, this.left, this.right); + } else { // right is a Col + + return this._bindLitCol(batch, this.left, this.right as Col); + } + } else { // left is a Col + if (this.right instanceof Literal) { + return this._bindColLit(batch, this.left as Col, this.right); + } else { // right is a Col + return this._bindColCol(batch, this.left as Col, this.right as Col); + } + } + } + + protected abstract _bindLitLit(batch: RecordBatch, left: Literal, right: Literal): PredicateFunc; + protected abstract _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc; + protected abstract _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc; + protected abstract _bindLitCol(batch: RecordBatch, lit: Literal, col: Col): PredicateFunc; +} + +export abstract class CombinationPredicate extends Predicate { + constructor(public readonly left: Predicate, public readonly right: Predicate) { + super(); + } +} + +export class And extends CombinationPredicate { + bind(batch: RecordBatch) { + const left = this.left.bind(batch); + const right = this.right.bind(batch); + return (idx: number, batch: RecordBatch) => left(idx, batch) && right(idx, batch); + } + ands(): Predicate[] { return this.left.ands().concat(this.right.ands()); } +} + +export class Or extends CombinationPredicate { + bind(batch: RecordBatch) { + const left = this.left.bind(batch); + const right = this.right.bind(batch); + return (idx: number, batch: RecordBatch) => left(idx, batch) || right(idx, batch); + } +} + +export class Equals extends ComparisonPredicate { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v == right.v; + return () => rtrn; + } + + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, batch: RecordBatch) => left_func(idx, 
batch) == right_func(idx, batch); + } + + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + if (col.vector instanceof DictionaryVector) { + // Assume that there is only one key with the value `lit.v` + // TODO: add lazily-computed reverse dictionary lookups, associated + // with col.vector.data so that we only have to do this once per + // dictionary + let key = -1; + let dict = col.vector; + let data = dict.dictionary!; + for (let len = data.length; ++key < len;) { + if (data.get(key) === lit.v) { + break; + } + } + + if (key == data.length) { + // the value doesn't exist in the dictionary - always return + // false + // TODO: special-case of PredicateFunc that encapsulates this + // "always false" behavior. That way filtering operations don't + // have to bother checking + return () => false; + } else { + return (idx: number) => { + return dict.getKey(idx) === key; + }; + } + } else { + return (idx: number, cols: RecordBatch) => col_func(idx, cols) == lit.v; + } + } + + protected _bindLitCol(batch: RecordBatch, lit: Literal, col: Col) { + // Equals is comutative + return this._bindColLit(batch, col, lit); + } +} + +export class LTeq extends ComparisonPredicate { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v <= right.v; + return () => rtrn; + } + + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, cols: RecordBatch) => left_func(idx, cols) <= right_func(idx, cols); + } + + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + return (idx: number, cols: RecordBatch) => col_func(idx, cols) <= lit.v; + } + + protected _bindLitCol(batch: RecordBatch, lit: Literal, col: Col) { + const col_func = col.bind(batch); + return (idx: 
number, cols: RecordBatch) => lit.v <= col_func(idx, cols); + } +} + +export class GTeq extends ComparisonPredicate { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { + const rtrn: boolean = left.v >= right.v; + return () => rtrn; + } + + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, cols: RecordBatch) => left_func(idx, cols) >= right_func(idx, cols); + } + + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + return (idx: number, cols: RecordBatch) => col_func(idx, cols) >= lit.v; + } + + protected _bindLitCol(batch: RecordBatch, lit: Literal, col: Col) { + const col_func = col.bind(batch); + return (idx: number, cols: RecordBatch) => lit.v >= col_func(idx, cols); + } +} + +export function lit(v: any): Value { return new Literal(v); } +export function col(n: string): Col { return new Col(n); } diff --git a/js/src/reader/buffer.ts b/js/src/reader/buffer.ts deleted file mode 100644 index c7b90507e396f..0000000000000 --- a/js/src/reader/buffer.ts +++ /dev/null @@ -1,229 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -import { flatbuffers } from 'flatbuffers'; -import { VectorLayoutReader } from './vector'; -import { TypedArray, TypedArrayConstructor } from '../vector/types'; -import { footerFromByteBuffer, messageFromByteBuffer } from '../format/fb'; -import { Footer, Schema, RecordBatch, DictionaryBatch, Field, Buffer, FieldNode } from '../format/arrow'; -import ByteBuffer = flatbuffers.ByteBuffer; - -export function* readBuffers(sources: Iterable) { - let schema: Schema | null = null; - let readMessages: ((bb: ByteBuffer) => IterableIterator) | null = null; - for (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, reader: new BufferVectorLayoutReader( - bb, - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } - } - } -} - -export async function* readBuffersAsync(sources: AsyncIterable) { - let schema: Schema | null = null; - let readMessages: ((bb: ByteBuffer) => IterableIterator) | null = null; - for await (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, reader: new BufferVectorLayoutReader( - bb, - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } - } - } -} - -function toByteBuffer(bytes?: Uint8Array | NodeBuffer | string) { - let arr: Uint8Array = bytes as any || new Uint8Array(0); - if (typeof bytes === 'string') { - arr = new Uint8Array(bytes.length); - for (let i = -1, n = bytes.length; 
++i < n;) { - arr[i] = bytes.charCodeAt(i); - } - return new ByteBuffer(arr); - } - return new ByteBuffer(arr); -} - -function readSchema(bb: ByteBuffer) { - let schema: Schema, readMessages, footer: Footer | null; - if (footer = readFileSchema(bb)) { - schema = footer.schema!; - readMessages = readFileMessages(footer); - } else if (schema = readStreamSchema(bb)!) { - readMessages = readStreamMessages; - } else { - throw new Error('Invalid Arrow buffer'); - } - return { schema, readMessages }; -} - -const PADDING = 4; -const MAGIC_STR = 'ARROW1'; -const MAGIC = new Uint8Array(MAGIC_STR.length); -for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { - MAGIC[i] = MAGIC_STR.charCodeAt(i); -} - -function checkForMagicArrowString(buffer: Uint8Array, index = 0) { - for (let i = -1, n = MAGIC.length; ++i < n;) { - if (MAGIC[i] !== buffer[index + i]) { - return false; - } - } - return true; -} - -const magicLength = MAGIC.length; -const magicAndPadding = magicLength + PADDING; -const magicX2AndPadding = magicLength * 2 + PADDING; - -function readStreamSchema(bb: ByteBuffer) { - if (!checkForMagicArrowString(bb.bytes(), 0)) { - for (const message of readMessages(bb)) { - if (message.isSchema()) { - return message as Schema; - } - } - } - return null; -} - -function* readStreamMessages(bb: ByteBuffer) { - for (const message of readMessages(bb)) { - if (message.isRecordBatch()) { - yield message; - } else if (message.isDictionaryBatch()) { - yield message; - } else { - continue; - } - // position the buffer after the body to read the next message - bb.setPosition(bb.position() + message.bodyLength.low); - } -} - -function readFileSchema(bb: ByteBuffer) { - let fileLength = bb.capacity(), footerLength: number, footerOffset: number; - if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || - (!checkForMagicArrowString(bb.bytes(), 0) /* Missing magic start */) || - (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || - (/* 
Invalid footer length */ - (footerLength = bb.readInt32(footerOffset = fileLength - magicAndPadding)) < 1 && - (footerLength + magicX2AndPadding > fileLength))) { - return null; - } - bb.setPosition(footerOffset - footerLength); - return footerFromByteBuffer(bb); -} - -function readFileMessages(footer: Footer) { - return function* (bb: ByteBuffer) { - for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset.low); - yield readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch; - } - for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset.low); - yield readMessage(bb, bb.readInt32(bb.position())) as RecordBatch; - } - }; -} - -function* readMessages(bb: ByteBuffer) { - let length: number, message: Schema | RecordBatch | DictionaryBatch; - while (bb.position() < bb.capacity() && - (length = bb.readInt32(bb.position())) > 0) { - if (message = readMessage(bb, length)!) 
{ - yield message; - } - } -} - -function readMessage(bb: ByteBuffer, length: number) { - bb.setPosition(bb.position() + PADDING); - const message = messageFromByteBuffer(bb); - bb.setPosition(bb.position() + length); - return message; -} - -class BufferVectorLayoutReader implements VectorLayoutReader { - private offset: number; - private bytes: Uint8Array; - constructor(bb: ByteBuffer, private fieldNodes: Iterator, private buffers: Iterator) { - this.bytes = bb.bytes(); - this.offset = bb.position(); - } - readContainerLayout(field: Field) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value) - }; - } - readFixedWidthLayout(field: Field, dataType: TypedArrayConstructor) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), - data: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, dataType) - }; - } - readBinaryLayout(field: Field) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), - offsets: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Int32Array), - data: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Uint8Array) - }; - } - readVariableWidthLayout(field: Field) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), - offsets: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Int32Array) - }; - } -} - -function createValidityArray(bytes: Uint8Array, field: Field, 
fieldNode: FieldNode, offset: number, buffer: Buffer) { - return field.nullable && fieldNode.nullCount.low > 0 && - createTypedArray(bytes, field, fieldNode, offset, buffer, Uint8Array) || null; -} - -function createTypedArray(bytes: Uint8Array, _field: Field, _fieldNode: FieldNode, offset: number, buffer: Buffer, ArrayConstructor: TypedArrayConstructor): T { - return new ArrayConstructor( - bytes.buffer, - bytes.byteOffset + offset + buffer.offset.low, - buffer.length.low / ArrayConstructor.BYTES_PER_ELEMENT - ); -} diff --git a/js/src/reader/json.ts b/js/src/reader/json.ts deleted file mode 100644 index 49431496354e8..0000000000000 --- a/js/src/reader/json.ts +++ /dev/null @@ -1,176 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import * as Schema_ from '../format/fb/Schema'; -import { Int64, Int128 } from '../util/int'; -import { VectorLayoutReader } from './vector'; -import { TextEncoder } from 'text-encoding-utf-8'; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import { TypedArray, TypedArrayConstructor } from '../vector/types'; -import { schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from '../format/json'; -import { Schema, RecordBatch, DictionaryBatch, Field, Buffer, FieldNode } from '../format/arrow'; -export { Schema, RecordBatch, DictionaryBatch }; - -export function* readJSON(json: any) { - const schema = schemaFromJSON(json['schema']); - for (const batch of (json['dictionaries'] || [])) { - const message = dictionaryBatchFromJSON(batch); - yield { - schema, message, reader: new JSONVectorLayoutReader( - flattenDataSources(batch['data']['columns']), - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } - for (const batch of (json['batches'] || [])) { - const message = recordBatchFromJSON(batch); - yield { - schema, message, reader: new JSONVectorLayoutReader( - flattenDataSources(batch['columns']), - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } -} - -function flattenDataSources(xs: any[]): any[][] { - return (xs || []).reduce((buffers, column: any) => [ - ...buffers, - ...(column['VALIDITY'] && [column['VALIDITY']] || []), - ...(column['OFFSET'] && [column['OFFSET']] || []), - ...(column['DATA'] && [column['DATA']] || []), - ...flattenDataSources(column['children']) - ], [] as any[][]); -} - -class JSONVectorLayoutReader implements VectorLayoutReader { - constructor(private sources: any[][], private fieldNodes: Iterator, private buffers: Iterator) {} - readContainerLayout(field: Field) { - const { sources, buffers } 
= this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value) - }; - } - readFixedWidthLayout(field: Field, dataType: TypedArrayConstructor) { - const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value), - data: createDataArray(sources, field, fieldNode, buffers.next().value, dataType) - }; - } - readBinaryLayout(field: Field) { - const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value), - offsets: new Int32Array(sources[buffers.next().value.offset.low]), - data: createDataArray(sources, field, fieldNode, buffers.next().value, Uint8Array) - }; - } - readVariableWidthLayout(field: Field) { - const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value), - offsets: new Int32Array(sources[buffers.next().value.offset.low]), - }; - } -} - -function createValidityArray(sources: any[][], field: Field, fieldNode: FieldNode, buffer: Buffer) { - return field.nullable && fieldNode.nullCount.low > 0 && - booleanFromJSON(sources[buffer.offset.low]) || null; -} - -const encoder = new TextEncoder('utf-8'); - -function createDataArray(sources: any[][], field: Field, _fieldNode: FieldNode, buffer: Buffer, ArrayConstructor: TypedArrayConstructor): T { - let type = field.type, data: ArrayLike | ArrayBufferLike; - if (type.isTimestamp() === true) { - data = int64sFromJSON(sources[buffer.offset.low] as string[]); - } else if ((type.isInt() || type.isTime()) && type.bitWidth === 64) { - data = int64sFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isDate() && type.unit === 
DateUnit.MILLISECOND) { - data = int64sFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isDecimal() === true) { - data = decimalFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isBinary() === true) { - data = binaryFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isBool() === true) { - data = booleanFromJSON(sources[buffer.offset.low] as number[]).buffer; - } else if (type.isUtf8() === true) { - data = encoder.encode((sources[buffer.offset.low] as string[]).join('')); - } else { - data = (sources[buffer.offset.low]).map((x) => +x); - } - return new ArrayConstructor(data); -} - -function int64sFromJSON(values: string[]) { - const data = new Uint32Array(values.length * 2); - for (let i = -1, n = values.length; ++i < n;) { - // Force all values (even numbers) to be parsed as strings since - // pulling out high and low bits seems to lose precision sometimes - // For example: - // > -4613034156400212000 >>> 0 - // 721782784 - // The correct lower 32-bits are 721782752 - Int64.fromString(values[i].toString(), new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2)); - } - return data.buffer; -} - -function decimalFromJSON(values: string[]) { - const data = new Uint32Array(values.length * 4); - for (let i = -1, n = values.length; ++i < n;) { - Int128.fromString(values[i], new Uint32Array(data.buffer, data.byteOffset + 4 * 4 * i, 4)); - } - return data.buffer; -} - -function binaryFromJSON(values: string[]) { - // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] - // There are definitely more efficient ways to do this... but it gets the - // job done. 
- const joined = values.join(''); - const data = new Uint8Array(joined.length / 2); - for (let i = 0; i < joined.length; i += 2) { - data[i >> 1] = parseInt(joined.substr(i, 2), 16); - } - return data.buffer; -} - -function booleanFromJSON(arr: number[]) { - let xs = [], n, i = 0; - let bit = 0, byte = 0; - for (const value of arr) { - value && (byte |= 1 << bit); - if (++bit === 8) { - xs[i++] = byte; - byte = bit = 0; - } - } - if (i === 0 || bit > 0) { xs[i++] = byte; } - if (i % 8 && (n = i + 8 - i % 8)) { - do { xs[i] = 0; } while (++i < n); - } - return new Uint8Array(xs); -} diff --git a/js/src/reader/vector.ts b/js/src/reader/vector.ts deleted file mode 100644 index 3bd6d2bb67650..0000000000000 --- a/js/src/reader/vector.ts +++ /dev/null @@ -1,255 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -import * as Schema_ from '../format/fb/Schema'; -import { TypedArray, TypedArrayConstructor } from '../vector/types'; -import { Schema, RecordBatch, DictionaryBatch, Field, FieldNode } from '../format/arrow'; -import { Int, Date, Time, Timestamp, Decimal, FixedSizeList, FixedSizeBinary, FloatingPoint } from '../format/arrow'; -import { - Vector, BoolVector, BinaryVector, DictionaryVector, - Int8Vector, Int16Vector, Int32Vector, Int64Vector, - Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, - Utf8Vector, ListVector, FixedSizeListVector, StructVector, - Float16Vector, Float32Vector, Float64Vector, DecimalVector, - Date32Vector, Date64Vector, Time32Vector, Time64Vector, TimestampVector, -} from '../vector/arrow'; - -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -// import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; - -export interface ContainerLayout { - fieldNode: FieldNode; - validity: Uint8Array | null | void; -} - -export interface VariableWidthLayout { - fieldNode: FieldNode; - offsets: Int32Array; - validity: Uint8Array | null | void; -} - -export interface BinaryLayout extends FixedWidthLayout { - offsets: Int32Array; -} - -export interface FixedWidthLayout { - fieldNode: FieldNode; - data: T; - validity: Uint8Array | null | void; -} - -export function* readVectors(messages: Iterable<{ schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorLayoutReader }>) { - const dictionaries = new Map(); - for (const { schema, message, reader } of messages) { - yield* readMessageVectors(schema, message, new VectorReader(dictionaries, reader)); - } -} - -export async function* readVectorsAsync(messages: AsyncIterable<{ schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorLayoutReader }>) { - const dictionaries = new Map(); - 
for await (const { schema, message, reader } of messages) { - yield* readMessageVectors(schema, message, new VectorReader(dictionaries, reader)); - } -} - -function* readMessageVectors(schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorReader) { - if (message.isRecordBatch() === true) { - yield schema.fields.map((field) => reader.readVector(field)); - } else if (message.isDictionaryBatch()) { - let id = message.dictionaryId.toFloat64().toString(); - let vector = reader.readValueVector(schema.dictionaries.get(id)!); - if (message.isDelta) { - vector = reader.dictionaries.get(id)!.concat(vector); - } - reader.dictionaries.set(id, vector); - } -} - -export interface VectorLayoutReader { - readBinaryLayout(field: Field): BinaryLayout; - readContainerLayout(field: Field): ContainerLayout; - readVariableWidthLayout(field: Field): VariableWidthLayout; - readFixedWidthLayout(field: Field, TypedArrayConstructor: TypedArrayConstructor): FixedWidthLayout; -} - -export class VectorReader implements VectorLayoutReader { - constructor(public dictionaries: Map, protected layout: VectorLayoutReader) {} - readVector(field: Field): Vector { - return this.readDictionaryVector(field) || this.readValueVector(field); - } - readDictionaryVector(field: Field) { - const encoding = field.dictionary; - if (encoding) { - const keys = this.readIntVector(field.indexField()); - const data = this.dictionaries.get(encoding.dictionaryId.toFloat64().toString())!; - return new DictionaryVector({ - field, data, keys, - validity: (keys as any).validity, - fieldNode: (keys as any).fieldNode, - }); - } - return null; - } - readValueVector(field: Field) { - switch (field.typeType) { - case Type.NONE: return this.readNullVector(); - case Type.Null: return this.readNullVector(); - // case Type.Map: return this.readMapVector(field); - case Type.Int: return this.readIntVector(field); - case Type.Bool: return this.readBoolVector(field); - case Type.Date: return this.readDateVector(field); - 
case Type.List: return this.readListVector(field); - case Type.Utf8: return this.readUtf8Vector(field); - case Type.Time: return this.readTimeVector(field); - // case Type.Union: return this.readUnionVector(field); - case Type.Binary: return this.readBinaryVector(field); - case Type.Decimal: return this.readDecimalVector(field); - case Type.Struct_: return this.readStructVector(field); - case Type.FloatingPoint: return this.readFloatVector(field); - case Type.Timestamp: return this.readTimestampVector(field); - case Type.FixedSizeList: return this.readFixedSizeListVector(field); - case Type.FixedSizeBinary: return this.readFixedSizeBinaryVector(field); - } - throw new Error(`Unrecognized ${field.toString()}`); - } - readNullVector() { - return new Vector(); - } - readBoolVector(field: Field) { - return new BoolVector(this.readFixedWidthLayout(field, Uint8Array)); - } - readDateVector(field: Field) { - const type = field.type as Date; - switch (type.unit) { - case DateUnit.DAY: return new Date32Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: DateUnit[type.unit] }); - case DateUnit.MILLISECOND: return new Date64Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: DateUnit[type.unit] }); - } - throw new Error(`Unrecognized ${type.toString()}`); - } - readTimeVector(field: Field) { - const type = field.type as Time; - switch (type.bitWidth) { - case 32: return new Time32Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: TimeUnit[type.unit] }); - case 64: return new Time64Vector({ ...this.readFixedWidthLayout(field, Uint32Array), unit: TimeUnit[type.unit] }); - } - throw new Error(`Unrecognized ${type.toString()}`); - } - readTimestampVector(field: Field) { - const type = field.type as Timestamp; - const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint32Array); - return new TimestampVector({ - field, fieldNode, validity, data, - timezone: type.timezone!, - unit: TimeUnit[type.unit], - }); - } - 
readListVector(field: Field) { - const { fieldNode, validity, offsets } = this.readVariableWidthLayout(field); - return new ListVector({ - field, fieldNode, validity, offsets, - values: this.readVector(field.children[0]) - }); - } - readStructVector(field: Field) { - const { fieldNode, validity } = this.readContainerLayout(field); - return new StructVector({ - field, fieldNode, validity, - columns: field.children.map((field) => this.readVector(field)) - }); - } - readBinaryVector(field: Field) { - return new BinaryVector(this.readBinaryLayout(field)); - } - readDecimalVector(field: Field) { - const type = field.type as Decimal; - const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint32Array); - return new DecimalVector({ - scale: type.scale, - precision: type.precision, - field, fieldNode, validity, data - }); - } - readUtf8Vector(field: Field) { - const { fieldNode, validity, offsets, data } = this.readBinaryLayout(field); - return new Utf8Vector({ - field, fieldNode, - values: new BinaryVector({ - validity, offsets, data - }) - }); - } - readFixedSizeListVector(field: Field) { - const type = field.type as FixedSizeList; - const { fieldNode, validity } = this.readContainerLayout(field); - return new FixedSizeListVector({ - field, fieldNode, validity, - size: type.listSize, - values: this.readVector(field.children[0]) - }); - } - readFixedSizeBinaryVector(field: Field) { - const type = field.type as FixedSizeBinary; - const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint8Array); - return new FixedSizeListVector({ - size: type.byteWidth, - field, fieldNode, validity, - values: new Uint8Vector({ data }) - }); - } - readFloatVector(field: Field) { - const type = field.type as FloatingPoint; - switch (type.precision) { - case Precision.HALF: return new Float16Vector(this.readFixedWidthLayout(field, Uint16Array)); - case Precision.SINGLE: return new Float32Vector(this.readFixedWidthLayout(field, Float32Array)); - case 
Precision.DOUBLE: return new Float64Vector(this.readFixedWidthLayout(field, Float64Array)); - } - throw new Error(`Unrecognized FloatingPoint { precision: ${type.precision} }`); - } - readIntVector(field: Field) { - const type = field.type as Int; - if (type.isSigned) { - switch (type.bitWidth) { - case 8: return new Int8Vector(this.readFixedWidthLayout(field, Int8Array)); - case 16: return new Int16Vector(this.readFixedWidthLayout(field, Int16Array)); - case 32: return new Int32Vector(this.readFixedWidthLayout(field, Int32Array)); - case 64: return new Int64Vector(this.readFixedWidthLayout(field, Int32Array)); - } - } - switch (type.bitWidth) { - case 8: return new Uint8Vector(this.readFixedWidthLayout(field, Uint8Array)); - case 16: return new Uint16Vector(this.readFixedWidthLayout(field, Uint16Array)); - case 32: return new Uint32Vector(this.readFixedWidthLayout(field, Uint32Array)); - case 64: return new Uint64Vector(this.readFixedWidthLayout(field, Uint32Array)); - } - throw new Error(`Unrecognized Int { isSigned: ${type.isSigned}, bitWidth: ${type.bitWidth} }`); - } - readContainerLayout(field: Field) { - return this.layout.readContainerLayout(field); - } - readBinaryLayout(field: Field) { - return this.layout.readBinaryLayout(field); - } - readVariableWidthLayout(field: Field) { - return this.layout.readVariableWidthLayout(field); - } - readFixedWidthLayout(field: Field, TypedArrayConstructor: TypedArrayConstructor) { - return this.layout.readFixedWidthLayout(field, TypedArrayConstructor); - } -} diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts new file mode 100644 index 0000000000000..07d94a9d49629 --- /dev/null +++ b/js/src/recordbatch.ts @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Schema, Struct, DataType } from './type'; +import { flatbuffers } from 'flatbuffers'; +import { View, Vector, StructVector } from './vector'; +import { Data, NestedData } from './data'; + +import Long = flatbuffers.Long; + +export class RecordBatch extends StructVector { + public static from(vectors: Vector[]) { + return new RecordBatch(Schema.from(vectors), + Math.max(...vectors.map((v) => v.length)), + vectors + ); + } + public readonly schema: Schema; + public readonly length: number; + public readonly numCols: number; + constructor(schema: Schema, data: Data, view: View); + constructor(schema: Schema, numRows: Long | number, cols: Data | Vector[]); + constructor(...args: any[]) { + if (typeof args[1] !== 'number') { + const data = args[1] as Data; + super(data, args[2]); + this.schema = args[0]; + this.length = data.length; + } else { + const [schema, numRows, cols] = args; + const childData: Data[] = new Array(cols.length); + for (let index = -1, length = cols.length; ++index < length;) { + const col: Data | Vector = cols[index]; + childData[index] = col instanceof Vector ? 
col.data : col; + } + super(new NestedData(new Struct(schema.fields), numRows, null, childData)); + this.schema = schema; + this.length = numRows; + } + this.numCols = this.schema.fields.length; + } + public clone(data: Data, view: View = this.view.clone(data)): this { + return new RecordBatch(this.schema, data as any, view) as any; + } + public getChildAt(index: number): Vector | null { + return index < 0 || index >= this.numCols ? null : super.getChildAt(index); + } + public select(...columnNames: string[]) { + const fields = this.schema.fields; + const namesToKeep = columnNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); + return new RecordBatch( + this.schema.select(...columnNames), this.length, + this.childData.filter((_, i) => namesToKeep[fields[i].name]) + ); + } +} diff --git a/js/src/table.ts b/js/src/table.ts new file mode 100644 index 0000000000000..3e50d16e3724d --- /dev/null +++ b/js/src/table.ts @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+import { RecordBatch } from './recordbatch';
+import { Col, Predicate } from './predicate';
+import { Schema, Field, Struct } from './type';
+import { read, readAsync } from './ipc/reader/arrow';
+import { isPromise, isAsyncIterable } from './util/compat';
+import { Vector, DictionaryVector, IntVector, StructVector } from './vector';
+import { ChunkedView } from './vector/chunked';
+
+/** Row callback for `scan`: receives a row index local to `batch`. */
+export type NextFunc = (idx: number, batch: RecordBatch) => void;
+/** Batch callback for `scan`: invoked once per batch before its rows are visited. */
+export type BindFunc = (batch: RecordBatch) => void;
+
+/** Query operations shared by Table and its filtered views. */
+export interface DataFrame {
+    filter(predicate: Predicate): DataFrame;
+    scan(next: NextFunc, bind?: BindFunc): void;
+    count(): number;
+    countBy(col: (Col|string)): CountByResult;
+}
+
+/**
+ * An ordered list of RecordBatches exposed as one logical table.
+ */
+export class Table implements DataFrame {
+    /** A Table with an empty Schema and no batches. */
+    static empty() { return new Table(new Schema([]), []); }
+    /** Read every RecordBatch from `sources`; the first batch's Schema becomes the Table's. */
+    static from(sources?: Iterable | object | string) {
+        if (sources) {
+            let schema: Schema | undefined;
+            let recordBatches: RecordBatch[] = [];
+            for (let recordBatch of read(sources)) {
+                schema = schema || recordBatch.schema;
+                recordBatches.push(recordBatch);
+            }
+            return new Table(schema || new Schema([]), recordBatches);
+        }
+        return Table.empty();
+    }
+    /**
+     * Async counterpart of `from`: consumes an AsyncIterable with `readAsync`,
+     * awaits a Promise before delegating to `from`, and passes any other
+     * truthy source straight to `from`.
+     */
+    static async fromAsync(sources?: AsyncIterable) {
+        if (isAsyncIterable(sources)) {
+            let schema: Schema | undefined;
+            let recordBatches: RecordBatch[] = [];
+            for await (let recordBatch of readAsync(sources)) {
+                schema = schema || recordBatch.schema;
+                recordBatches.push(recordBatch);
+            }
+            return new Table(schema || new Schema([]), recordBatches);
+        } else if (isPromise(sources)) {
+            return Table.from(await sources);
+        } else if (sources) {
+            return Table.from(sources);
+        }
+        return Table.empty();
+    }
+    /** Build a Table from a StructVector, one RecordBatch per chunk when its view is chunked. */
+    static fromStruct(struct: StructVector) {
+        const schema = new Schema(struct.type.children);
+        const chunks = struct.view instanceof ChunkedView ?
+                            (struct.view.chunkVectors as StructVector[]) :
+                            [struct];
+        return new Table(chunks.map((chunk) => new RecordBatch(schema, chunk.length, chunk.view.childData)));
+    }
+
+    public readonly schema: Schema;
+    public readonly length: number;
+    public readonly numCols: number;
+    // List of inner RecordBatches
+    public readonly batches: RecordBatch[];
+    // List of inner Vectors, possibly spanning batches
+    protected readonly _columns: Vector[] = [];
+    // Union of all inner RecordBatches into one RecordBatch, possibly chunked.
+    // If the Table has just one inner RecordBatch, this points to that.
+    // If the Table has multiple inner RecordBatches, then this is a Chunked view
+    // over the list of RecordBatches. This allows us to delegate the responsibility
+    // of indexing, iterating, slicing, and visiting to the Nested/Chunked Data/Views.
+    public readonly batchesUnion: RecordBatch;
+
+    constructor(batches: RecordBatch[]);
+    constructor(...batches: RecordBatch[]);
+    constructor(schema: Schema, batches: RecordBatch[]);
+    constructor(schema: Schema, ...batches: RecordBatch[]);
+    /**
+     * Accepts an optional leading Schema followed by the batches, given either
+     * as one array or spread as separate arguments. Without an explicit Schema
+     * the first batch's Schema is used, or an empty Schema when there are no
+     * batches.
+     */
+    constructor(...args: any[]) {
+        const hasSchema = args[0] instanceof Schema;
+        let batches: RecordBatch[] = hasSchema ? args.slice(1) : args;
+        // A single array argument (possibly nested once more) holds the
+        // actual batch list; unwrap it so both call styles end up the same.
+        while (batches.length === 1 && Array.isArray(batches[0])) {
+            batches = batches[0] as any;
+        }
+        const schema: Schema = hasSchema ? args[0] :
+            batches.length > 0 ? batches[0].schema : new Schema([]);
+        this.schema = schema;
+        this.batches = batches;
+        this.batchesUnion = batches.length == 0 ?
+            new RecordBatch(schema, 0, []) :
+            batches.reduce((union, batch) => union.concat(batch));
+        this.length = this.batchesUnion.length;
+        this.numCols = this.batchesUnion.numCols;
+    }
+    /** Return the row at `index`, addressed across all batches. */
+    public get(index: number): Struct['TValue'] {
+        return this.batchesUnion.get(index)!;
+    }
+    /** Return the column called `name`, or null when no field has that name. */
+    public getColumn(name: string) {
+        return this.getColumnAt(this.getColumnIndex(name));
+    }
+    /** Return the column at `index` (cached after first access), or null when out of range. */
+    public getColumnAt(index: number) {
+        return index < 0 || index >= this.numCols
+            ? null
+            : this._columns[index] || (
+              this._columns[index] = this.batchesUnion.getChildAt(index)!);
+    }
+    /** Return the position of the field called `name`, or -1 when absent. */
+    public getColumnIndex(name: string) {
+        return this.schema.fields.findIndex((f) => f.name === name);
+    }
+    public [Symbol.iterator](): IterableIterator {
+        return this.batchesUnion[Symbol.iterator]() as any;
+    }
+    /** Return a lazy view of this Table's batches restricted by `predicate`. */
+    public filter(predicate: Predicate): DataFrame {
+        return new FilteredDataFrame(this.batches, predicate);
+    }
+    /**
+     * Visit every row of every batch in order. `bind`, when given, is called
+     * once per batch before `next` is called with each batch-local row index.
+     */
+    public scan(next: NextFunc, bind?: BindFunc) {
+        const batches = this.batches, numBatches = batches.length;
+        for (let batchIndex = -1; ++batchIndex < numBatches;) {
+            // load batches
+            const batch = batches[batchIndex];
+            if (bind) { bind(batch); }
+            // yield all indices
+            for (let index = -1, numRows = batch.length; ++index < numRows;) {
+                next(index, batch);
+            }
+        }
+    }
+    public count(): number { return this.length; }
+    /**
+     * Count rows per dictionary value of a dictionary-encoded column.
+     * Throws when the column is not dictionary-encoded. The last batch is
+     * bound first, so the Table is expected to hold at least one batch.
+     */
+    public countBy(name: Col | string): CountByResult {
+        const batches = this.batches, numBatches = batches.length;
+        const count_by = typeof name === 'string' ? new Col(name) : name;
+        // Assume that all dictionary batches are deltas, which means that the
+        // last record batch has the most complete dictionary
+        count_by.bind(batches[numBatches - 1]);
+        const vector = count_by.vector as DictionaryVector;
+        if (!(vector instanceof DictionaryVector)) {
+            throw new Error('countBy currently only supports dictionary-encoded columns');
+        }
+        // TODO: Adjust array byte width based on overall length
+        // (e.g. if this.length <= 255 use Uint8Array, etc...)
+        const counts: Uint32Array = new Uint32Array(vector.dictionary.length);
+        for (let batchIndex = -1; ++batchIndex < numBatches;) {
+            // load batches
+            const batch = batches[batchIndex];
+            // rebind the countBy Col
+            count_by.bind(batch);
+            const keys = (count_by.vector as DictionaryVector).indicies;
+            // yield all indices
+            for (let index = -1, numRows = batch.length; ++index < numRows;) {
+                let key = keys.get(index);
+                if (key !== null) { counts[key]++; }
+            }
+        }
+        return new CountByResult(vector.dictionary, IntVector.from(counts));
+    }
+    /** Return a new Table holding only the columns named in `columnNames`. */
+    public select(...columnNames: string[]) {
+        // The selected Schema is passed explicitly so a Table without any
+        // batches still yields a Table carrying the selected fields.
+        return new Table(
+            this.schema.select(...columnNames),
+            this.batches.map((batch) => batch.select(...columnNames))
+        );
+    }
+    /** Render the header and every row, one per line. */
+    public toString(separator?: string) {
+        let str = '';
+        for (const row of this.rowsToString(separator)) {
+            str += row + '\n';
+        }
+        return str;
+    }
+    /** Return an iterator over the formatted header and rows. */
+    public rowsToString(separator = ' | '): TableToStringIterator {
+        return new TableToStringIterator(tableRowsToString(this, separator));
+    }
+}
+
+/**
+ * A DataFrame over a list of batches that only exposes rows matching a Predicate.
+ */
+class FilteredDataFrame implements DataFrame {
+    private predicate: Predicate;
+    private batches: RecordBatch[];
+    constructor (batches: RecordBatch[], predicate: Predicate) {
+        this.batches = batches;
+        this.predicate = predicate;
+    }
+    /** Like `Table.scan`, but `next` is only called for rows the predicate accepts. */
+    public scan(next: NextFunc, bind?: BindFunc) {
+        // inlined version of this:
+        // this.parent.scan((idx, columns) => {
+        //     if (this.predicate(idx, columns)) next(idx, columns);
+        // });
+        const batches = this.batches;
+        const numBatches = batches.length;
+        for (let batchIndex = -1; ++batchIndex < numBatches;) {
+            // load batches
+            const batch = batches[batchIndex];
+            // TODO: bind batches lazily
+            // If predicate doesn't match anything in the batch we don't need
+            // to bind the callback
+            if (bind) { bind(batch); }
+            const predicate = this.predicate.bind(batch);
+            // yield all indices
+            for (let index = -1, numRows = batch.length; ++index < numRows;) {
+                if (predicate(index, batch)) { next(index, batch); }
+            }
+        }
+    }
+    /** Number of rows the predicate accepts across all batches. */
+    public count(): number {
+        // inlined version of this:
+        // let sum = 0;
+        // this.parent.scan((idx, columns) => {
+        //     if (this.predicate(idx, columns)) ++sum;
+        // });
+        // return sum;
+        let sum = 0;
+        const batches = this.batches;
+        const numBatches = batches.length;
+        for (let batchIndex = -1; ++batchIndex < numBatches;) {
+            // load batches
+            const batch = batches[batchIndex];
+            const predicate = this.predicate.bind(batch);
+            // yield all indices
+            for (let index = -1, numRows = batch.length; ++index < numRows;) {
+                if (predicate(index, batch)) { ++sum; }
+            }
+        }
+        return sum;
+    }
+    /** Narrow this view further: the result accepts rows matching both predicates. */
+    public filter(predicate: Predicate): DataFrame {
+        return new FilteredDataFrame(
+            this.batches,
+            this.predicate.and(predicate)
+        );
+    }
+    /**
+     * Count accepted rows per dictionary value of a dictionary-encoded column.
+     * Throws when the column is not dictionary-encoded. The last batch is
+     * bound first, so at least one batch is expected.
+     */
+    public countBy(name: Col | string): CountByResult {
+        const batches = this.batches, numBatches = batches.length;
+        const count_by = typeof name === 'string' ? new Col(name) : name;
+        // Assume that all dictionary batches are deltas, which means that the
+        // last record batch has the most complete dictionary
+        count_by.bind(batches[numBatches - 1]);
+        const vector = count_by.vector as DictionaryVector;
+        if (!(vector instanceof DictionaryVector)) {
+            throw new Error('countBy currently only supports dictionary-encoded columns');
+        }
+        // TODO: Adjust array byte width based on overall length
+        // (e.g. if this.length <= 255 use Uint8Array, etc...)
+        const counts: Uint32Array = new Uint32Array(vector.dictionary.length);
+        for (let batchIndex = -1; ++batchIndex < numBatches;) {
+            // load batches
+            const batch = batches[batchIndex];
+            const predicate = this.predicate.bind(batch);
+            // rebind the countBy Col
+            count_by.bind(batch);
+            const keys = (count_by.vector as DictionaryVector).indicies;
+            // yield all indices
+            for (let index = -1, numRows = batch.length; ++index < numRows;) {
+                let key = keys.get(index);
+                if (key !== null && predicate(index, batch)) { counts[key]++; }
+            }
+        }
+        return new CountByResult(vector.dictionary, IntVector.from(counts));
+    }
+}
+
+/**
+ * Two-column Table (`values`, `counts`) produced by `countBy`.
+ */
+export class CountByResult extends Table implements DataFrame {
+    constructor(values: Vector, counts: IntVector) {
+        super(
+            new RecordBatch(new Schema([
+                new Field('values', values.type),
+                new Field('counts', counts.type)
+            ]),
+            counts.length, [values, counts]
+        ));
+    }
+    /** Return a plain object mapping each value to its count. */
+    public toJSON(): Object {
+        const values = this.getColumnAt(0)!;
+        const counts = this.getColumnAt(1)!;
+        const result = {} as { [k: string]: number | null };
+        for (let i = -1; ++i < this.length;) {
+            result[values.get(i)] = counts.get(i);
+        }
+        return result;
+    }
+}
+
+/**
+ * Iterator wrapper around the formatted rows of a Table that can also
+ * write them to a Node writable stream.
+ */
+export class TableToStringIterator implements IterableIterator {
+    constructor(private iterator: IterableIterator) {}
+    [Symbol.iterator]() { return this.iterator; }
+    next(value?: any) { return this.iterator.next(value); }
+    throw(error?: any) { return this.iterator.throw && this.iterator.throw(error) || { done: true, value: '' }; }
+    return(value?: any) { return this.iterator.return && this.iterator.return(value) || { done: true, value: '' }; }
+    /**
+     * Write each row plus a newline to `stream`. Writing pauses when
+     * `write` returns false and resumes on the stream's 'drain' event; once
+     * the rows are exhausted the stream is ended unless it is a TTY.
+     */
+    pipe(stream: NodeJS.WritableStream) {
+        let res: IteratorResult;
+        let write = () => {
+            if (stream['writable']) {
+                do {
+                    if ((res = this.next()).done) { break; }
+                } while (stream['write'](res.value + '\n', 'utf8'));
+            }
+            // `res` is still unset when the stream was not writable yet.
+            if (!res || !res.done) {
+                stream['once']('drain', write);
+            } else if (!(stream as any)['isTTY']) {
+                stream['end']('\n');
+            }
+        };
+        write();
+    }
+}
+
+/**
+ * Yield the header line and then one line per row, each cell left-padded
+ * to the widest cell of its column and joined with `separator`.
+ */
+function* tableRowsToString(table: Table, separator = ' | ') {
+    const fields = table.schema.fields;
+    const header = ['row_id', ...fields.map((f) => `${f}`)].map(stringify);
+    const maxColumnWidths = header.map(x => x.length);
+    // Pass one to convert to strings and count max column widths.
+    // Every row is measured, including the last: leftPad() truncates any
+    // cell wider than the recorded column width.
+    for (let i = -1, n = table.length; ++i < n;) {
+        let val, row = [i, ...table.get(i)];
+        for (let j = -1, k = row.length; ++j < k; ) {
+            val = stringify(row[j]);
+            maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length);
+        }
+    }
+    yield header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(separator);
+    for (let i = -1; ++i < table.length;) {
+        yield [i, ...table.get(i)]
+            .map((x) => stringify(x))
+            .map((x, j) => leftPad(x, ' ', maxColumnWidths[j]))
+            .join(separator);
+    }
+}
+
+/** Prefix `str` with `fill` and keep the last `n` characters of the result. */
+function leftPad(str: string, fill: string, n: number) {
+    return (new Array(n + 1).join(fill) + str).slice(-1 * n);
+}
+
+/** Format a cell: strings are double-quoted, typed-array views bracketed, everything else JSON-encoded. */
+function stringify(x: any) {
+    return typeof x === 'string' ? `"${x}"` : ArrayBuffer.isView(x) ? `[${x}]` : JSON.stringify(x);
+}
diff --git a/js/src/type.ts b/js/src/type.ts
new file mode 100644
index 0000000000000..6f382bd5b2b05
--- /dev/null
+++ b/js/src/type.ts
@@ -0,0 +1,578 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +import * as Schema_ from './fb/Schema'; +import * as Message_ from './fb/Message'; +import { Vector, View } from './vector'; +import { flatbuffers } from 'flatbuffers'; +import { DictionaryBatch } from './ipc/metadata'; +import { TypeVisitor, VisitorNode } from './visitor'; + +export import Long = flatbuffers.Long; +export import ArrowType = Schema_.org.apache.arrow.flatbuf.Type; +export import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +export import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; +export import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +export import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; +export import VectorType = Schema_.org.apache.arrow.flatbuf.VectorType; +export import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +export import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +export import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; + +export class Schema { + public static from(vectors: Vector[]) { + return new Schema(vectors.map((v, i) => new Field('' + i, v.type))); + } + // @ts-ignore + protected _bodyLength: number; + // @ts-ignore + protected _headerType: MessageHeader; + public readonly fields: Field[]; + public readonly version: MetadataVersion; + public readonly metadata?: Map; + public readonly dictionaries: Map>; + constructor(fields: Field[], + metadata?: Map, + version: MetadataVersion = MetadataVersion.V4, + dictionaries: Map> = new Map()) { + this.fields = fields; + this.version = version; + this.metadata = metadata; + this.dictionaries = dictionaries; + } + public get bodyLength() { return this._bodyLength; } + public get headerType() { return this._headerType; } + public select(...fieldNames: string[]): Schema { + const namesToKeep = fieldNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); + const 
newDictFields = new Map(), newFields = this.fields.filter((f) => namesToKeep[f.name]); + this.dictionaries.forEach((f, dictId) => (namesToKeep[f.name]) && newDictFields.set(dictId, f)); + return new Schema(newFields, this.metadata, this.version, newDictFields); + } + public static [Symbol.toStringTag] = ((prototype: Schema) => { + prototype._bodyLength = 0; + prototype._headerType = MessageHeader.Schema; + return 'Schema'; + })(Schema.prototype); +} + +export class Field { + public readonly type: T; + public readonly name: string; + public readonly nullable: boolean; + public readonly metadata?: Map | null; + constructor(name: string, type: T, nullable = false, metadata?: Map | null) { + this.name = name; + this.type = type; + this.nullable = nullable; + this.metadata = metadata; + } + public toString() { return `${this.name}: ${this.type}`; } + public get typeId(): T['TType'] { return this.type.TType; } + public get [Symbol.toStringTag](): string { return 'Field'; } + public get indicies(): T | Int { + return DataType.isDictionary(this.type) ? 
this.type.indicies : this.type; + } +} + +export type TimeBitWidth = 32 | 64; +export type IntBitWidth = 8 | 16 | 32 | 64; + +export type NumericType = Int | Float | Date_ | Time | Interval | Timestamp; +export type FixedSizeType = Int64 | Uint64 | Decimal | FixedSizeBinary; +export type PrimitiveType = NumericType | FixedSizeType; + +export type FlatListType = Utf8 | Binary; // <-- these types have `offset`, `data`, and `validity` buffers +export type FlatType = Bool | PrimitiveType | FlatListType; // <-- these types have `data` and `validity` buffers +export type ListType = List; // <-- these types have `offset` and `validity` buffers +export type NestedType = Map_ | Struct | List | FixedSizeList | Union; // <-- these types have `validity` buffer and nested childData +export type SingleNestedType = List | FixedSizeList; // <-- these are nested types that can only have a single child + +/** + * * + * Main data type enumeration: + * * + * Data types in this library are all *logical*. They can be expressed as + * either a primitive physical type (bytes or bits of some fixed size), a + * nested type consisting of other data types, or another data type (e.g. a + * timestamp encoded as an int64) + */ + export enum Type { + NONE = 0, // The default placeholder type + Null = 1, // A NULL type having no physical storage + Int = 2, // Signed or unsigned 8, 16, 32, or 64-bit little-endian integer + Float = 3, // 2, 4, or 8-byte floating point value + Binary = 4, // Variable-length bytes (no guarantee of UTF8-ness) + Utf8 = 5, // UTF8 variable-length string as List + Bool = 6, // Boolean as 1 bit, LSB bit-packed ordering + Decimal = 7, // Precision-and-scale-based decimal type. Storage type depends on the parameters. 
+ Date = 8, // int32_t days or int64_t milliseconds since the UNIX epoch + Time = 9, // Time as signed 32 or 64-bit integer, representing either seconds, milliseconds, microseconds, or nanoseconds since midnight since midnight + Timestamp = 10, // Exact timestamp encoded with int64 since UNIX epoch (Default unit millisecond) + Interval = 11, // YEAR_MONTH or DAY_TIME interval in SQL style + List = 12, // A list of some logical data type + Struct = 13, // Struct of logical types + Union = 14, // Union of logical types + FixedSizeBinary = 15, // Fixed-size binary. Each value occupies the same number of bytes + FixedSizeList = 16, // Fixed-size list. Each value occupies the same number of bytes + Map = 17, // Map of named logical types + Dictionary = 'Dictionary', // Dictionary aka Category type + DenseUnion = 'DenseUnion', // Dense Union of logical types + SparseUnion = 'SparseUnion', // Sparse Union of logical types +} + +export interface DataType { + readonly TType: TType; + readonly TArray: any; + readonly TValue: any; + readonly ArrayType: any; +} + +export abstract class DataType implements Partial { + + // @ts-ignore + public [Symbol.toStringTag]: string; + + static isNull (x: DataType): x is Null { return x.TType === Type.Null; } + static isInt (x: DataType): x is Int { return x.TType === Type.Int; } + static isFloat (x: DataType): x is Float { return x.TType === Type.Float; } + static isBinary (x: DataType): x is Binary { return x.TType === Type.Binary; } + static isUtf8 (x: DataType): x is Utf8 { return x.TType === Type.Utf8; } + static isBool (x: DataType): x is Bool { return x.TType === Type.Bool; } + static isDecimal (x: DataType): x is Decimal { return x.TType === Type.Decimal; } + static isDate (x: DataType): x is Date_ { return x.TType === Type.Date; } + static isTime (x: DataType): x is Time { return x.TType === Type.Time; } + static isTimestamp (x: DataType): x is Timestamp { return x.TType === Type.Timestamp; } + static isInterval (x: DataType): x 
is Interval { return x.TType === Type.Interval; } + static isList (x: DataType): x is List { return x.TType === Type.List; } + static isStruct (x: DataType): x is Struct { return x.TType === Type.Struct; } + static isUnion (x: DataType): x is Union { return x.TType === Type.Union; } + static isDenseUnion (x: DataType): x is DenseUnion { return x.TType === Type.DenseUnion; } + static isSparseUnion (x: DataType): x is SparseUnion { return x.TType === Type.SparseUnion; } + static isFixedSizeBinary (x: DataType): x is FixedSizeBinary { return x.TType === Type.FixedSizeBinary; } + static isFixedSizeList (x: DataType): x is FixedSizeList { return x.TType === Type.FixedSizeList; } + static isMap (x: DataType): x is Map_ { return x.TType === Type.Map; } + static isDictionary (x: DataType): x is Dictionary { return x.TType === Type.Dictionary; } + + constructor(public readonly TType: TType, + public readonly children?: Field[]) {} + + acceptTypeVisitor(visitor: TypeVisitor): any { + switch (this.TType) { + case Type.Null: return DataType.isNull(this) && visitor.visitNull(this) || null; + case Type.Int: return DataType.isInt(this) && visitor.visitInt(this) || null; + case Type.Float: return DataType.isFloat(this) && visitor.visitFloat(this) || null; + case Type.Binary: return DataType.isBinary(this) && visitor.visitBinary(this) || null; + case Type.Utf8: return DataType.isUtf8(this) && visitor.visitUtf8(this) || null; + case Type.Bool: return DataType.isBool(this) && visitor.visitBool(this) || null; + case Type.Decimal: return DataType.isDecimal(this) && visitor.visitDecimal(this) || null; + case Type.Date: return DataType.isDate(this) && visitor.visitDate(this) || null; + case Type.Time: return DataType.isTime(this) && visitor.visitTime(this) || null; + case Type.Timestamp: return DataType.isTimestamp(this) && visitor.visitTimestamp(this) || null; + case Type.Interval: return DataType.isInterval(this) && visitor.visitInterval(this) || null; + case Type.List: return 
DataType.isList(this) && visitor.visitList(this) || null; + case Type.Struct: return DataType.isStruct(this) && visitor.visitStruct(this) || null; + case Type.Union: return DataType.isUnion(this) && visitor.visitUnion(this) || null; + case Type.FixedSizeBinary: return DataType.isFixedSizeBinary(this) && visitor.visitFixedSizeBinary(this) || null; + case Type.FixedSizeList: return DataType.isFixedSizeList(this) && visitor.visitFixedSizeList(this) || null; + case Type.Map: return DataType.isMap(this) && visitor.visitMap(this) || null; + case Type.Dictionary: return DataType.isDictionary(this) && visitor.visitDictionary(this) || null; + default: return null; + } + } + protected static [Symbol.toStringTag] = ((proto: DataType) => { + ( proto).ArrayType = Array; + return proto[Symbol.toStringTag] = 'DataType'; + })(DataType.prototype); +} + +export interface Null extends DataType { TArray: void; TValue: null; } +export class Null extends DataType { + constructor() { + super(Type.Null); + } + public toString() { return `Null`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitNull(this); + } + protected static [Symbol.toStringTag] = ((proto: Null) => { + return proto[Symbol.toStringTag] = 'Null'; + })(Null.prototype); +} + +export interface Int extends DataType { TArray: TArrayType; TValue: TValueType; } +export class Int extends DataType { + constructor(public readonly isSigned: boolean, + public readonly bitWidth: IntBitWidth) { + super(Type.Int); + } + public get ArrayType(): TypedArrayConstructor { + switch (this.bitWidth) { + case 8: return (this.isSigned ? Int8Array : Uint8Array) as any; + case 16: return (this.isSigned ? Int16Array : Uint16Array) as any; + case 32: return (this.isSigned ? Int32Array : Uint32Array) as any; + case 64: return (this.isSigned ? Int32Array : Uint32Array) as any; + } + throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`); + } + public toString() { return `${this.isSigned ? 
`I` : `Ui`}nt${this.bitWidth}`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitInt(this); } + protected static [Symbol.toStringTag] = ((proto: Int) => { + return proto[Symbol.toStringTag] = 'Int'; + })(Int.prototype); +} + +export class Int8 extends Int { constructor() { super(true, 8); } } +export class Int16 extends Int { constructor() { super(true, 16); } } +export class Int32 extends Int { constructor() { super(true, 32); } } +export class Int64 extends Int { constructor() { super(true, 64); } } +export class Uint8 extends Int { constructor() { super(false, 8); } } +export class Uint16 extends Int { constructor() { super(false, 16); } } +export class Uint32 extends Int { constructor() { super(false, 32); } } +export class Uint64 extends Int { constructor() { super(false, 64); } } + +export interface Float extends DataType { TArray: TArrayType; TValue: number; } +export class Float extends DataType { + constructor(public readonly precision: Precision) { + super(Type.Float); + } + // @ts-ignore + public get ArrayType(): TypedArrayConstructor { + switch (this.precision) { + case Precision.HALF: return Uint16Array as any; + case Precision.SINGLE: return Float32Array as any; + case Precision.DOUBLE: return Float64Array as any; + } + throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`); + } + public toString() { return `Float${(this.precision << 5) || 16}`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFloat(this); } + protected static [Symbol.toStringTag] = ((proto: Float) => { + return proto[Symbol.toStringTag] = 'Float'; + })(Float.prototype); +} + +export class Float16 extends Float { constructor() { super(Precision.HALF); } } +export class Float32 extends Float { constructor() { super(Precision.SINGLE); } } +export class Float64 extends Float { constructor() { super(Precision.DOUBLE); } } + +export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; } +export class 
Binary extends DataType { + constructor() { + super(Type.Binary); + } + public toString() { return `Binary`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitBinary(this); + } + protected static [Symbol.toStringTag] = ((proto: Binary) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Binary'; + })(Binary.prototype); +} + +export interface Utf8 extends DataType { TArray: Uint8Array; TValue: string; } +export class Utf8 extends DataType { + constructor() { + super(Type.Utf8); + } + public toString() { return `Utf8`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitUtf8(this); + } + protected static [Symbol.toStringTag] = ((proto: Utf8) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8'; + })(Utf8.prototype); +} + +export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; } +export class Bool extends DataType { + constructor() { + super(Type.Bool); + } + public toString() { return `Bool`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitBool(this); + } + protected static [Symbol.toStringTag] = ((proto: Bool) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Bool'; + })(Bool.prototype); +} + +export interface Decimal extends DataType { TArray: Uint32Array; TValue: Uint32Array; } +export class Decimal extends DataType { + constructor(public readonly scale: number, + public readonly precision: number) { + super(Type.Decimal); + } + public toString() { return `Decimal[${this.precision}e${this.scale > 0 ? 
`+` : ``}${this.scale}]`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitDecimal(this); + } + protected static [Symbol.toStringTag] = ((proto: Decimal) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Decimal'; + })(Decimal.prototype); +} + +/* tslint:disable:class-name */ +export interface Date_ extends DataType { TArray: Int32Array; TValue: Date; } +export class Date_ extends DataType { + constructor(public readonly unit: DateUnit) { + super(Type.Date); + } + public toString() { return `Date${(this.unit + 1) * 32}<${DateUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitDate(this); + } + protected static [Symbol.toStringTag] = ((proto: Date_) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Date'; + })(Date_.prototype); +} + +export interface Time extends DataType { TArray: Uint32Array; TValue: number; } +export class Time extends DataType { + constructor(public readonly unit: TimeUnit, + public readonly bitWidth: TimeBitWidth) { + super(Type.Time); + } + public toString() { return `Time${this.bitWidth}<${TimeUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitTime(this); + } + protected static [Symbol.toStringTag] = ((proto: Time) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Time'; + })(Time.prototype); +} + +export interface Timestamp extends DataType { TArray: Int32Array; TValue: number; } +export class Timestamp extends DataType { + constructor(public unit: TimeUnit, public timezone?: string | null) { + super(Type.Timestamp); + } + public toString() { return `Timestamp<${TimeUnit[this.unit]}${this.timezone ? 
`, ${this.timezone}` : ``}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitTimestamp(this); + } + protected static [Symbol.toStringTag] = ((proto: Timestamp) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Timestamp'; + })(Timestamp.prototype); +} + +export interface Interval extends DataType { TArray: Int32Array; TValue: Int32Array; } +export class Interval extends DataType { + constructor(public unit: IntervalUnit) { + super(Type.Interval); + } + public toString() { return `Interval<${IntervalUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitInterval(this); + } + protected static [Symbol.toStringTag] = ((proto: Interval) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Interval'; + })(Interval.prototype); +} + +export interface List extends DataType { TArray: any; TValue: Vector; } +export class List extends DataType { + constructor(public children: Field[]) { + super(Type.List, children); + } + public toString() { return `List<${this.valueType}>`; } + public get ArrayType() { return this.valueType.ArrayType; } + public get valueType() { return this.children[0].type as T; } + public get valueField() { return this.children[0] as Field; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitList(this); + } + protected static [Symbol.toStringTag] = ((proto: List) => { + return proto[Symbol.toStringTag] = 'List'; + })(List.prototype); +} + +export interface Struct extends DataType { TArray: any; TValue: View; } +export class Struct extends DataType { + constructor(public children: Field[]) { + super(Type.Struct, children); + } + public toString() { return `Struct<${this.children.map((f) => f.type).join(`, `)}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitStruct(this); + } + protected static [Symbol.toStringTag] = ((proto: Struct) => { + return 
proto[Symbol.toStringTag] = 'Struct'; + })(Struct.prototype); +} + +export interface Union extends DataType { TArray: Int8Array; TValue: any; } +export class Union extends DataType { + constructor(public readonly mode: UnionMode, + public readonly typeIds: ArrowType[], + public readonly children: Field[]) { + super( (mode === UnionMode.Sparse ? Type.SparseUnion : Type.DenseUnion), children); + } + public toString() { return `${this[Symbol.toStringTag]}<${this.typeIds.map((x) => Type[x]).join(` | `)}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitUnion(this); } + protected static [Symbol.toStringTag] = ((proto: Union) => { + ( proto).ArrayType = Int8Array; + return proto[Symbol.toStringTag] = 'Union'; + })(Union.prototype); +} + +export class DenseUnion extends Union { + constructor(typeIds: ArrowType[], children: Field[]) { + super(UnionMode.Dense, typeIds, children); + } + protected static [Symbol.toStringTag] = ((proto: DenseUnion) => { + return proto[Symbol.toStringTag] = 'DenseUnion'; + })(DenseUnion.prototype); +} + +export class SparseUnion extends Union { + constructor(typeIds: ArrowType[], children: Field[]) { + super(UnionMode.Sparse, typeIds, children); + } + protected static [Symbol.toStringTag] = ((proto: SparseUnion) => { + return proto[Symbol.toStringTag] = 'SparseUnion'; + })(SparseUnion.prototype); +} + +export interface FixedSizeBinary extends DataType { TArray: Uint8Array; TValue: Uint8Array; } +export class FixedSizeBinary extends DataType { + constructor(public readonly byteWidth: number) { + super(Type.FixedSizeBinary); + } + public toString() { return `FixedSizeBinary[${this.byteWidth}]`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFixedSizeBinary(this); } + protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'FixedSizeBinary'; + })(FixedSizeBinary.prototype); +} + +export interface 
FixedSizeList extends DataType { TArray: any; TValue: Vector; } +export class FixedSizeList extends DataType { + constructor(public readonly listSize: number, + public readonly children: Field[]) { + super(Type.FixedSizeList, children); + } + public get ArrayType() { return this.valueType.ArrayType; } + public get valueType() { return this.children[0].type as T; } + public get valueField() { return this.children[0] as Field; } + public toString() { return `FixedSizeList[${this.listSize}]<${this.valueType}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFixedSizeList(this); } + protected static [Symbol.toStringTag] = ((proto: FixedSizeList) => { + return proto[Symbol.toStringTag] = 'FixedSizeList'; + })(FixedSizeList.prototype); +} + +/* tslint:disable:class-name */ +export interface Map_ extends DataType { TArray: Uint8Array; TValue: View; } +export class Map_ extends DataType { + constructor(public readonly keysSorted: boolean, + public readonly children: Field[]) { + super(Type.Map, children); + } + public toString() { return `Map<${this.children.join(`, `)}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitMap(this); } + protected static [Symbol.toStringTag] = ((proto: Map_) => { + return proto[Symbol.toStringTag] = 'Map_'; + })(Map_.prototype); +} + +export interface Dictionary extends DataType { TArray: T['TArray']; TValue: T['TValue']; } +export class Dictionary extends DataType { + public readonly id: number; + public readonly dictionary: T; + public readonly indicies: Int; + public readonly isOrdered: boolean; + constructor(dictionary: T, indicies: Int, id?: Long | number | null, isOrdered?: boolean | null) { + super(Type.Dictionary); + this.indicies = indicies; + this.dictionary = dictionary; + this.isOrdered = isOrdered || false; + this.id = id == null ? DictionaryBatch.getId() : typeof id === 'number' ? 
id : id.low; + } + public get ArrayType() { return this.dictionary.ArrayType; } + public toString() { return `Dictionary<${this.dictionary}, ${this.indicies}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitDictionary(this); + } + protected static [Symbol.toStringTag] = ((proto: Dictionary) => { + return proto[Symbol.toStringTag] = 'Dictionary'; + })(Dictionary.prototype); +} +export interface IterableArrayLike extends ArrayLike, Iterable {} + +export interface TypedArrayConstructor { + readonly prototype: T; + readonly BYTES_PER_ELEMENT: number; + new (length: number): T; + new (elements: Iterable): T; + new (arrayOrArrayBuffer: ArrayLike | ArrayBufferLike): T; + new (buffer: ArrayBufferLike, byteOffset: number, length?: number): T; + of(...items: number[]): T; + from(arrayLike: ArrayLike | Iterable, mapfn?: (v: number, k: number) => number, thisArg?: any): T; +} + +export type FloatArray = Uint16Array | Float32Array | Float64Array; +export type IntArray = Int8Array | Int16Array | Int32Array | Uint8Array | Uint16Array | Uint32Array; + +export interface TypedArray extends Iterable { + [index: number]: number; + readonly length: number; + readonly byteLength: number; + readonly byteOffset: number; + readonly buffer: ArrayBufferLike; + readonly BYTES_PER_ELEMENT: number; + [Symbol.toStringTag]: any; + [Symbol.iterator](): IterableIterator; + entries(): IterableIterator<[number, number]>; + keys(): IterableIterator; + values(): IterableIterator; + copyWithin(target: number, start: number, end?: number): this; + every(callbackfn: (value: number, index: number, array: TypedArray) => boolean, thisArg?: any): boolean; + fill(value: number, start?: number, end?: number): this; + filter(callbackfn: (value: number, index: number, array: TypedArray) => any, thisArg?: any): TypedArray; + find(predicate: (value: number, index: number, obj: TypedArray) => boolean, thisArg?: any): number | undefined; + findIndex(predicate: (value: number, index: 
number, obj: TypedArray) => boolean, thisArg?: any): number; + forEach(callbackfn: (value: number, index: number, array: TypedArray) => void, thisArg?: any): void; + includes(searchElement: number, fromIndex?: number): boolean; + indexOf(searchElement: number, fromIndex?: number): number; + join(separator?: string): string; + lastIndexOf(searchElement: number, fromIndex?: number): number; + map(callbackfn: (value: number, index: number, array: TypedArray) => number, thisArg?: any): TypedArray; + reduce(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: TypedArray) => number): number; + reduce(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: TypedArray) => number, initialValue: number): number; + reduce(callbackfn: (previousValue: U, currentValue: number, currentIndex: number, array: TypedArray) => U, initialValue: U): U; + reduceRight(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: TypedArray) => number): number; + reduceRight(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: TypedArray) => number, initialValue: number): number; + reduceRight(callbackfn: (previousValue: U, currentValue: number, currentIndex: number, array: TypedArray) => U, initialValue: U): U; + reverse(): TypedArray; + set(array: ArrayLike, offset?: number): void; + slice(start?: number, end?: number): TypedArray; + some(callbackfn: (value: number, index: number, array: TypedArray) => boolean, thisArg?: any): boolean; + sort(compareFn?: (a: number, b: number) => number): this; + subarray(begin: number, end?: number): TypedArray; + toLocaleString(): string; + toString(): string; +} diff --git a/js/src/util/bit.ts b/js/src/util/bit.ts new file mode 100644 index 0000000000000..2308bf6a2e03c --- /dev/null +++ b/js/src/util/bit.ts @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { TypedArray } from '../type'; + +/** Rounds `value` up to the next multiple of `alignment` by adding `padding(value, alignment)`. */ +export function align(value: number, alignment: number) { + return value + padding(value, alignment); +} + +/** Returns the amount that must be added to `value` to reach the next multiple of `alignment` (0 when it already is one). */ +export function padding(value: number, alignment: number) { + return (value % alignment === 0 ? 0 : alignment - value % alignment); +} + +/** Returns true when bit number `bit` of `byte` is set. `_data` and `_index` are unused. */ +export function getBool(_data: any, _index: number, byte: number, bit: number) { + return (byte & 1 << bit) !== 0; +} + +/** Returns bit number `bit` of `byte` as 0 or 1. `_data` and `_index` are unused. */ +export function getBit(_data: any, _index: number, byte: number, bit: number): 0 | 1 { + return (byte & 1 << bit) >> bit as (0 | 1); +} + +/** Sets bit `index` of `bytes` when `value` is truthy and clears it otherwise; returns true after setting, false after clearing. */ +export function setBool(bytes: Uint8Array, index: number, value: any) { + return value ?
+ !!(bytes[index >> 3] |= (1 << (index % 8))) || true : + !(bytes[index >> 3] &= ~(1 << (index % 8))) && false ; +} + +/** Packs an iterable of truthy/falsy values into a Uint8Array bitmap, least-significant bit first; the byte count is zero-padded up to a multiple of 8 (empty input yields 8 zero bytes). */ +export function packBools(values: Iterable) { + let n = 0, i = 0; + let xs: number[] = []; + let bit = 0, byte = 0; + for (const value of values) { + value && (byte |= 1 << bit); + if (++bit === 8) { + xs[i++] = byte; + byte = bit = 0; + } + } + if (i === 0 || bit > 0) { xs[i++] = byte; } + if (i % 8 && (n = i + 8 - i % 8)) { + do { xs[i] = 0; } while (++i < n); + } + return new Uint8Array(xs); +} + +/** Generator yielding `get(context, index, byte, bit)` for `length` consecutive bits of `bytes`, starting at bit offset `begin`; `index` counts up from 0. */ +export function* iterateBits(bytes: Uint8Array, begin: number, length: number, context: any, + get: (context: any, index: number, byte: number, bit: number) => T) { + let bit = begin % 8; + let byteIndex = begin >> 3; + let index = 0, remaining = length; + for (; remaining > 0; bit = 0) { + let byte = bytes[byteIndex++]; + do { + yield get(context, index++, byte, bit); + } while (--remaining > 0 && ++bit < 8); + } +} + +/** + * Compute the population count (the number of bits set to 1) for a range of bits in a Uint8Array. + * @param data The Uint8Array of bits for which to compute the population count. + * @param lhs The range's left-hand side (or start) bit + * @param rhs The range's right-hand side (or end) bit (exclusive: `rhs - lhs` bits are counted, and 0 is returned when that is not positive) + */ +export function popcnt_bit_range(data: Uint8Array, lhs: number, rhs: number): number { + if (rhs - lhs <= 0) { return 0; } + // If the bit range is less than one byte, sum the 1 bits in the bit range + if (rhs - lhs < 8) { + let sum = 0; + for (const bit of iterateBits(data, lhs, rhs - lhs, data, getBit)) { + sum += bit; + } + return sum; + } + // Get the next lowest multiple of 8 from the right hand side + const rhsInside = rhs >> 3 << 3; + // Get the next highest multiple of 8 from the left hand side + const lhsInside = lhs + (lhs % 8 === 0 ?
0 : 8 - lhs % 8); + return ( + // Get the popcnt of bits between the left hand side, and the next highest multiple of 8 + popcnt_bit_range(data, lhs, lhsInside) + + // Get the popcnt of bits between the right hand side, and the next lowest multiple of 8 + popcnt_bit_range(data, rhsInside, rhs) + + // Get the popcnt of all bits between the left and right hand sides' multiples of 8 + popcnt_array(data, lhsInside >> 3, (rhsInside - lhsInside) >> 3) + ); +} + +/** Counts the set bits in the bytes of `arr`, starting at byte `byteOffset` (default 0) of the array's view; when `byteLength` is given only that many bytes are counted, otherwise counting runs to the end of the array. */ +export function popcnt_array(arr: TypedArray, byteOffset?: number, byteLength?: number) { + let cnt = 0, pos = byteOffset! | 0; + const view = new DataView(arr.buffer, arr.byteOffset, arr.byteLength); + const len = byteLength === void 0 ? arr.byteLength : pos + byteLength; + while (len - pos >= 4) { + cnt += popcnt_uint32(view.getUint32(pos)); + pos += 4; + } + while (len - pos >= 2) { + cnt += popcnt_uint32(view.getUint16(pos)); + pos += 2; + } + while (len - pos >= 1) { + cnt += popcnt_uint32(view.getUint8(pos)); + pos += 1; + } + return cnt; +} + +/** Returns the number of bits set to 1 in `uint32` after coercing it to a 32-bit integer. */ +export function popcnt_uint32(uint32: number): number { + let i = uint32 | 0; + i = i - ((i >>> 1) & 0x55555555); + i = (i & 0x33333333) + ((i >>> 2) & 0x33333333); + return (((i + (i >>> 4)) & 0x0F0F0F0F) * 0x01010101) >>> 24; +} diff --git a/js/src/util/compat.ts b/js/src/util/compat.ts new file mode 100644 index 0000000000000..7a4232ee8c32e --- /dev/null +++ b/js/src/util/compat.ts @@ -0,0 +1,49 @@ +export interface Subscription { + unsubscribe: () => void; +} + +export interface Observer { + closed?: boolean; + next: (value: T) => void; + error: (err: any) => void; + complete: () => void; +} + +export interface Observable { + subscribe: (observer: Observer) => Subscription; +} + +/** Type guard: true when `x` is a non-null, non-primitive value whose `then` property is a function. + * @ignore + */ +export function isPromise(x: any): x is PromiseLike { + return x != null && Object(x) === x && typeof x['then'] === 'function'; +} + +/** Type guard: true when `x` is a non-null, non-primitive value whose `subscribe` property is a function. + * @ignore + */ +export function isObservable(x: any): x is Observable { + return x != null && Object(x) === x && typeof
x['subscribe'] === 'function'; +} + +/** Type guard: true when `x` is a non-null, non-primitive value whose `length` property is a number. + * @ignore + */ +export function isArrayLike(x: any): x is ArrayLike { + return x != null && Object(x) === x && typeof x['length'] === 'number'; +} + +/** Type guard: true when `x` is a non-null, non-primitive value with a defined `Symbol.iterator` property. + * @ignore + */ +export function isIterable(x: any): x is Iterable { + return x != null && Object(x) === x && typeof x[Symbol.iterator] !== 'undefined'; +} + +/** Type guard: true when `x` is a non-null, non-primitive value with a defined `Symbol.asyncIterator` property. + * @ignore + */ +export function isAsyncIterable(x: any): x is AsyncIterable { + return x != null && Object(x) === x && typeof x[Symbol.asyncIterator] !== 'undefined'; +} diff --git a/js/src/util/layout.ts b/js/src/util/layout.ts index c064ee9d7d0b0..29698fb3d2b93 100644 --- a/js/src/util/layout.ts +++ b/js/src/util/layout.ts @@ -15,16 +15,9 @@ // specific language governing permissions and limitations // under the License. +import { align } from './bit'; import { TextEncoder } from 'text-encoding-utf-8'; -import { TypedArrayConstructor, TypedArray } from '../vector/types'; - -export function align(value: number, alignment: number) { - return value + padding(value, alignment); -} - -export function padding(value: number, alignment: number) { - return (value % alignment === 0 ? 0 : alignment - value % alignment); -} +import { TypedArrayConstructor, TypedArray } from '../type'; export type NullableLayout = { nullCount: number, validity: Uint8Array }; export type BufferLayout> = { data: TArray }; diff --git a/js/src/vector.ts b/js/src/vector.ts new file mode 100644 index 0000000000000..d9ca97b5fd120 --- /dev/null +++ b/js/src/vector.ts @@ -0,0 +1,441 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data, ChunkedData, FlatData, BoolData, FlatListData, NestedData, DictionaryData } from './data'; +import { VisitorNode, TypeVisitor, VectorVisitor } from './visitor'; +import { DataType, ListType, FlatType, NestedType, FlatListType, TimeUnit } from './type'; +import { IterableArrayLike, Precision, DateUnit, IntervalUnit, UnionMode } from './type'; + +export interface VectorLike { length: number; nullCount: number; } + +export interface View { + clone(data: Data): this; + isValid(index: number): boolean; + get(index: number): T['TValue'] | null; + set(index: number, value: T['TValue']): void; + toArray(): IterableArrayLike; + [Symbol.iterator](): IterableIterator; +} + +export class Vector implements VectorLike, View, VisitorNode { + public static create(data: Data): Vector { + return createVector(data); + } + public static concat(source?: Vector | null, ...others: Vector[]): Vector { + return others.reduce((a, b) => a ? a.concat(b) : b, source!); + } + public type: T; + public length: number; + public readonly data: Data; + public readonly view: View; + constructor(data: Data, view: View) { + this.data = data; + this.type = data.type; + this.length = data.length; + let nulls: Uint8Array; + if (( data instanceof ChunkedData) && !(view instanceof ChunkedView)) { + this.view = new ChunkedView(data); + } else if (!(view instanceof ValidityView) && (nulls = data.nullBitmap!) 
&& nulls.length > 0 && data.nullCount > 0) { + this.view = new ValidityView(data, view); + } else { + this.view = view; + } + } + + public get nullCount() { return this.data.nullCount; } + public get nullBitmap() { return this.data.nullBitmap; } + public get [Symbol.toStringTag]() { + return `Vector<${this.type[Symbol.toStringTag]}>`; + } + public toJSON(): any { return this.toArray(); } + public clone(data: Data, view: View = this.view.clone(data) as any): this { + return new (this.constructor as any)(data, view); + } + public isValid(index: number): boolean { + return this.view.isValid(index); + } + public get(index: number): T['TValue'] | null { + return this.view.get(index); + } + public set(index: number, value: T['TValue']): void { + return this.view.set(index, value); + } + public toArray(): IterableArrayLike { + return this.view.toArray(); + } + public [Symbol.iterator](): IterableIterator { + return this.view[Symbol.iterator](); + } + public concat(...others: Vector[]): this { + if ((others = others.filter(Boolean)).length === 0) { + return this; + } + const { view } = this; + const vecs = !(view instanceof ChunkedView) + ? [this, ...others] + : [...view.chunkVectors, ...others]; + const offsets = ChunkedData.computeOffsets(vecs); + const chunksLength = offsets[offsets.length - 1]; + const chunkedData = new ChunkedData(this.type, chunksLength, vecs, 0, -1, offsets); + return this.clone(chunkedData, new ChunkedView(chunkedData)) as this; + } + public slice(begin?: number, end?: number): this { + let { length } = this; + let size = (this.view as any).size || 1; + let total = length, from = (begin || 0) * size; + let to = (typeof end === 'number' ? end : total) * size; + if (to < 0) { to = total - (to * -1) % total; } + if (from < 0) { from = total - (from * -1) % total; } + if (to < from) { [from, to] = [to, from]; } + total = !isFinite(total = (to - from)) || total < 0 ? 
0 : total; + const slicedData = this.data.slice(from, Math.min(total, length)); + return this.clone(slicedData, this.view.clone(slicedData)) as this; + } + + public acceptTypeVisitor(visitor: TypeVisitor): any { + return TypeVisitor.visitTypeInline(visitor, this.type); + } + public acceptVectorVisitor(visitor: VectorVisitor): any { + return VectorVisitor.visitTypeInline(visitor, this.type, this); + } +} + +export abstract class FlatVector extends Vector { + public get values() { return this.data.values; } + public lows(): IntVector { return this.asInt32(0, 2); } + public highs(): IntVector { return this.asInt32(1, 2); } + public asInt32(offset: number = 0, stride: number = 2): IntVector { + let data = (this.data as FlatData).clone(new Int32()); + if (offset > 0) { + data = data.slice(offset, this.length - offset); + } + const int32s = new IntVector(data, new PrimitiveView(data, stride)); + int32s.length = this.length / stride | 0; + return int32s; + } +} + +export abstract class ListVectorBase extends Vector { + public get values() { return this.data.values; } + public get valueOffsets() { return this.data.valueOffsets; } + public getValueOffset(index: number) { + return this.valueOffsets[index]; + } + public getValueLength(index: number) { + return this.valueOffsets[index + 1] - this.valueOffsets[index]; + } +} + +export abstract class NestedVector extends Vector { + // @ts-ignore + public readonly view: NestedView; + // @ts-ignore + protected _childData: Data[]; + public getChildAt(index: number): Vector | null { + return this.view.getChildAt(index); + } + public get childData(): Data[] { + let data: Data | Data[]; + if ((data = this._childData)) { + // Return the cached childData reference first + return data as Data[]; + } else if (!( (data = this.data) instanceof ChunkedData)) { + // If data isn't chunked, cache and return NestedData's childData + return this._childData = (data as NestedData).childData; + } + // Otherwise if the data is chunked, concatenate 
the childVectors from each chunk + // to construct a single chunked Vector for each column. Then return the ChunkedData + // instance from each unified chunked column as the childData of a chunked NestedVector + const chunks = ((data as ChunkedData).chunkVectors as NestedVector[]); + return this._childData = chunks + .reduce<(Vector | null)[][]>((cols, chunk) => chunk.childData + .reduce<(Vector | null)[][]>((cols, _, i) => ( + (cols[i] || (cols[i] = [])).push(chunk.getChildAt(i)) + ) && cols || cols, cols), [] as Vector[][]) + .map((vecs) => Vector.concat(...vecs).data); + } +} + +import { List, Binary, Utf8, Bool, } from './type'; +import { Null, Int, Float, Decimal, Date_, Time, Timestamp, Interval } from './type'; +import { Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Float16, Float32, Float64 } from './type'; +import { Struct, Union, SparseUnion, DenseUnion, FixedSizeBinary, FixedSizeList, Map_, Dictionary } from './type'; + +import { ChunkedView } from './vector/chunked'; +import { DictionaryView } from './vector/dictionary'; +import { ListView, FixedSizeListView, BinaryView, Utf8View } from './vector/list'; +import { UnionView, DenseUnionView, NestedView, StructView, MapView } from './vector/nested'; +import { FlatView, NullView, BoolView, ValidityView, PrimitiveView, FixedSizeView, Float16View } from './vector/flat'; +import { DateDayView, DateMillisecondView, IntervalYearMonthView } from './vector/flat'; +import { TimestampDayView, TimestampSecondView, TimestampMillisecondView, TimestampMicrosecondView, TimestampNanosecondView } from './vector/flat'; +import { packBools } from './util/bit'; + +export class NullVector extends Vector { + constructor(data: Data, view: View = new NullView(data)) { + super(data, view); + } +} + +export class BoolVector extends Vector { + public static from(data: IterableArrayLike) { + return new BoolVector(new BoolData(new Bool(), data.length, null, packBools(data))); + } + public get values() { return 
this.data.values; } + constructor(data: Data, view: View = new BoolView(data)) { + super(data, view); + } +} + +export class IntVector> extends FlatVector { + public static from(data: Int8Array): IntVector; + public static from(data: Int16Array): IntVector; + public static from(data: Int32Array): IntVector; + public static from(data: Uint8Array): IntVector; + public static from(data: Uint16Array): IntVector; + public static from(data: Uint32Array): IntVector; + public static from(data: Int32Array, is64: true): IntVector; + public static from(data: Uint32Array, is64: true): IntVector; + public static from(data: any, is64?: boolean) { + if (is64 === true) { + return data instanceof Int32Array + ? new IntVector(new FlatData(new Int64(), data.length, null, data)) + : new IntVector(new FlatData(new Uint64(), data.length, null, data)); + } + switch (data.constructor) { + case Int8Array: return new IntVector(new FlatData(new Int8(), data.length, null, data)); + case Int16Array: return new IntVector(new FlatData(new Int16(), data.length, null, data)); + case Int32Array: return new IntVector(new FlatData(new Int32(), data.length, null, data)); + case Uint8Array: return new IntVector(new FlatData(new Uint8(), data.length, null, data)); + case Uint16Array: return new IntVector(new FlatData(new Uint16(), data.length, null, data)); + case Uint32Array: return new IntVector(new FlatData(new Uint32(), data.length, null, data)); + } + throw new TypeError('Unrecognized Int data'); + } + static defaultView(data: Data) { + return data.type.bitWidth <= 32 ? 
new FlatView(data) : new FixedSizeView(data, (data.type.bitWidth / 32) | 0); + } + constructor(data: Data, view: View = IntVector.defaultView(data)) { + super(data, view); + } +} + +export class FloatVector> extends FlatVector { + public static from(data: Uint16Array): FloatVector; + public static from(data: Float32Array): FloatVector; + public static from(data: Float64Array): FloatVector; + public static from(data: any) { + switch (data.constructor) { + case Uint16Array: return new FloatVector(new FlatData(new Float16(), data.length, null, data)); + case Float32Array: return new FloatVector(new FlatData(new Float32(), data.length, null, data)); + case Float64Array: return new FloatVector(new FlatData(new Float64(), data.length, null, data)); + } + throw new TypeError('Unrecognized Float data'); + } + static defaultView(data: Data): FlatView { + return data.type.precision !== Precision.HALF ? new FlatView(data) : new Float16View(data as Data); + } + constructor(data: Data, view: View = FloatVector.defaultView(data)) { + super(data, view); + } +} + +export class DateVector extends FlatVector { + static defaultView(data: Data) { + return data.type.unit === DateUnit.DAY ? new DateDayView(data) : new DateMillisecondView(data, 2); + } + constructor(data: Data, view: View = DateVector.defaultView(data)) { + super(data, view); + } + public lows(): IntVector { + return this.type.unit === DateUnit.DAY ? this.asInt32(0, 1) : this.asInt32(0, 2); + } + public highs(): IntVector { + return this.type.unit === DateUnit.DAY ? 
this.asInt32(0, 1) : this.asInt32(1, 2); + } + public asEpochMilliseconds(): IntVector { + let data = (this.data as FlatData).clone(new Int32()); + switch (this.type.unit) { + case DateUnit.DAY: return new IntVector(data, new TimestampDayView(data as any, 1) as any); + case DateUnit.MILLISECOND: return new IntVector(data, new TimestampMillisecondView(data as any, 2) as any); + } + throw new TypeError(`Unrecognized date unit "${DateUnit[this.type.unit]}"`); + } +} + +export class DecimalVector extends FlatVector { + constructor(data: Data, view: View = new FixedSizeView(data, 4)) { + super(data, view); + } +} + +export class TimeVector extends FlatVector