Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-34936: [JavaScript] Added Proxy for Table, RecordBatch, and Vector #34939

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions js/perf/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,21 @@ b.suite(
b.cycle(cycle)
);

b.suite(
`[index] Vector`,

...Object.entries(vectors).map(([name, vector]) =>
b.add(`from: ${name}`, () => {
for (let i = -1, n = vector.length; ++i < n;) {
vector[i];
}
})),

b.cycle(cycle)
);



for (const { name, ipc, table } of config) {
b.suite(
`Parse`,
Expand Down
16 changes: 16 additions & 0 deletions js/src/recordbatch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import { Vector } from './vector.js';
import { Schema, Field } from './schema.js';
import { DataType, Struct, Null, TypeMap } from './type.js';

import { IndexAccessProxyHandler } from './util/proxyhandler.js'

import { instance as getVisitor } from './visitor/get.js';
import { instance as setVisitor } from './visitor/set.js';
import { instance as indexOfVisitor } from './visitor/indexof.js';
Expand Down Expand Up @@ -95,6 +97,13 @@ export class RecordBatch<T extends TypeMap = any> {
public readonly schema: Schema<T>;
public readonly data: Data<Struct<T>>;

/**
* Index access of the record batch elements. While equivalent to
* {@link * RecordBatch.get}, * it is 1-2 orders of magnitude slower than
* {@link * RecordBatch.get}.
*/
[index: number]: T['TValue'] | null;

public get dictionaries() {
return this._dictionaries || (this._dictionaries = collectDictionaries(this.schema.fields, this.data.children));
}
Expand Down Expand Up @@ -280,6 +289,13 @@ export class RecordBatch<T extends TypeMap = any> {
protected static [Symbol.toStringTag] = ((proto: RecordBatch) => {
(proto as any)._nullCount = -1;
(proto as any)[Symbol.isConcatSpreadable] = true;

// The Proxy object will slow down all method access if it is returned
// from the constructor. By putting it at the root of the prototype
// chain, we do not affect the speed of normal access. That said, index
// access will be much slower than `.get()`.
Object.setPrototypeOf(proto, new Proxy({}, new IndexAccessProxyHandler()))

return 'RecordBatch';
})(RecordBatch.prototype);
}
Expand Down
15 changes: 15 additions & 0 deletions js/src/table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
wrapChunkedIndexOf,
sliceChunks,
} from './util/chunk.js';
import { IndexAccessProxyHandler } from './util/proxyhandler.js'

import { instance as getVisitor } from './visitor/get.js';
import { instance as setVisitor } from './visitor/set.js';
Expand Down Expand Up @@ -152,6 +153,13 @@ export class Table<T extends TypeMap = any> {
*/
declare public readonly batches: RecordBatch<T>[];

/**
* Index access of the table elements. While equivalent to
* {@link * Table.get}, * it is 1-2 orders of magnitude slower than
* {@link * Table.get}.
*/
[index: number]: T['TValue'] | null;

/**
* The contiguous {@link RecordBatch `RecordBatch`} chunks of the Table rows.
*/
Expand Down Expand Up @@ -389,6 +397,13 @@ export class Table<T extends TypeMap = any> {
(proto as any)['set'] = wrapChunkedCall2(setVisitor.getVisitFn(Type.Struct));
(proto as any)['indexOf'] = wrapChunkedIndexOf(indexOfVisitor.getVisitFn(Type.Struct));
(proto as any)['getByteLength'] = wrapChunkedCall1(byteLengthVisitor.getVisitFn(Type.Struct));

// The Proxy object will slow down all method access if it is returned
// from the constructor. By putting it at the root of the prototype
// chain, we do not affect the speed of normal access. That said, index
// access will be much slower than `.get()`.
Object.setPrototypeOf(proto, new Proxy({}, new IndexAccessProxyHandler()))

return 'Table';
})(Table.prototype);
}
Expand Down
24 changes: 24 additions & 0 deletions js/src/util/proxyhandler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
export class IndexAccessProxyHandler<T extends object = any> implements ProxyHandler<T> {
get(target: any, key: string, receiver: any) {
if (typeof key === "string") { // Need to check because key can be a symbol, such as [Symbol.iterator].
const idx = +key; // Convert the key to a number
if (idx === idx) { // Basically an inverse NaN check
return (receiver || target).get(idx);
}
}

return Reflect.get(target, key, receiver);
}

set(target: any, key: string, value: any, receiver: any) {
if (typeof key === "string") { // Need to check because key can be a symbol, such as [Symbol.iterator].
const idx = +key; // Convert the key to a number
if (idx === idx) { // Basically an inverse NaN check
(receiver || target).set(idx, value);
return true; // Is this correct?
}
}

return Reflect.set(target, key, value, receiver);
}
}
35 changes: 35 additions & 0 deletions js/src/vector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import {
wrapChunkedIndexOf,
} from './util/chunk.js';
import { BigInt64Array, BigUint64Array } from './util/compat.js';
import { IndexAccessProxyHandler } from './util/proxyhandler.js'

import { instance as getVisitor } from './visitor/get.js';
import { instance as setVisitor } from './visitor/set.js';
Expand Down Expand Up @@ -103,6 +104,12 @@ export class Vector<T extends DataType = any> {
declare protected _nullCount: number;
declare protected _byteLength: number;

/**
* Index access of the vector elements. While equivalent to {@link * Vector.get},
* it is 1-2 orders of magnitude slower than {@link * Vector.get}.
*/
[index: number]: T['TValue'] | null;

/**
* The {@link DataType `DataType`} of this Vector.
*/
Expand Down Expand Up @@ -358,6 +365,30 @@ export class Vector<T extends DataType = any> {
(proto as any)._offsets = new Uint32Array([0]);
(proto as any)[Symbol.isConcatSpreadable] = true;

// The prototype chain of the Vector object is complex to get the best
// possible performance:
//
// - The Proxy object is quite slow, so we put it at the bottom of the
// prototype chain. This means that known access such as functions
// like `vector.get` will be immediately resolved and is fast. Unknown
// access such as index notation (`vector[0]`) will bubble up to the
// proxy object and be resolved. Experimentally, this is about 1-2
// orders of magnitude slower than using `vector.get(index)`.
// - When the Vector object has multiple chunks in it, we need to find
// the appropriate chunk to iterate through when using methods like
// `.get()`. To do this, in the Vector constructor, it sets the
// prototype of `this` to `vectorPrototypesByTypeId[typeId]`, which
// defines the appropriate methods to find the appropriate chunk.
// The prototypes provided by `vectorPrototypesByTypeId` is also
// chained from the Proxy object, which means Vector objects with
// multiple chunks also retain the index access API.
// - As a note, using `vector.get(i)` is slow as it needs to perform a
// binary search while looking for the right chunk. So operations
// that loop through the array with an index (i.e. `(for i=0; i<n;
// i++) vector.get(i)`) becomes an O(nlogn) operation as opposed to
// O(n).
Object.setPrototypeOf(proto, new Proxy({}, new IndexAccessProxyHandler()));

const typeIds: Type[] = Object.keys(Type)
.map((T: any) => Type[T] as any)
.filter((T: any) => typeof T === 'number' && T !== Type.NONE);
Expand All @@ -370,6 +401,10 @@ export class Vector<T extends DataType = any> {

visitorsByTypeId[typeId] = { get, set, indexOf, byteLength };
vectorPrototypesByTypeId[typeId] = Object.create(proto, {
// These object keys are equivalent to string keys. However, by
// putting them in [] (which makes them computed property
// names), the Closure compiler we use to compile this library
// will not optimize out the function names.
['isValid']: { value: wrapChunkedCall1(isChunkedValid) },
['get']: { value: wrapChunkedCall1(getVisitor.getVisitFnByTypeId(typeId)) },
['set']: { value: wrapChunkedCall2(setVisitor.getVisitFnByTypeId(typeId)) },
Expand Down
13 changes: 13 additions & 0 deletions js/test/unit/recordbatch/record-batch-tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,17 @@ describe(`RecordBatch`, () => {
expect(f32sBatch.numRows).toBe(45);
});
});

describe(`get()`, () => {
test(`can get row with get and []`, () => {
const batch = numsRecordBatch(32, 45);
const row = batch.get(2)
expect(row).not.toBeNull();
expect(row!.f32).toEqual(2);
expect(row!.i32).toEqual(2);

const row2 = batch[2];
expect(row2).toEqual(row);
});
});
});
4 changes: 4 additions & 0 deletions js/test/unit/table-tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,10 @@ describe(`Table`, () => {
expect(row.f32).toEqual(expected[F32]);
expect(row.i32).toEqual(expected[I32]);
expect(row.dictionary).toEqual(expected[DICT]);

// Test index access as well
const row2 = table[i];
expect(row2).toEqual(row)
}
});
test(`iterates expected values`, () => {
Expand Down
7 changes: 6 additions & 1 deletion js/test/unit/vector/vector-tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ describe(`StructVector`, () => {

test(`get value`, () => {
for (const [i, value] of values.entries()) {
expect(vector.get(i)!.toJSON()).toEqual(value);
const row = vector.get(i);
expect(row).not.toBeNull();
expect(row!.toJSON()).toEqual(value);

const row2 = vector[i];
expect(row2).toEqual(row);
}
});
});
Expand Down