Assorted helpers used for texture checking #1068

Merged · 5 commits · Mar 17, 2022
111 changes: 108 additions & 3 deletions src/unittests/conversion.spec.ts
@@ -1,7 +1,15 @@
export const description = `Unit tests for conversion`;

import { makeTestGroup } from '../common/internal/test_group.js';
import { float16BitsToFloat32, float32ToFloat16Bits } from '../webgpu/util/conversion.js';
import {
float16BitsToFloat32,
float32ToFloat16Bits,
float32ToFloatBits,
floatBitsToNormalULPFromZero,
floatBitsToNumber,
kFloat16Format,
kFloat32Format,
} from '../webgpu/util/conversion.js';

import { UnitTest } from './unit_test.js';

@@ -21,17 +29,114 @@ const cases = [
[0b1_10101_1001000000, -100],
];

g.test('conversion,float16BitsToFloat32').fn(t => {
g.test('float16BitsToFloat32').fn(t => {
cases.forEach(value => {
// some loose check
t.expect(Math.abs(float16BitsToFloat32(value[0]) - value[1]) <= 0.00001, value[0].toString(2));
});
});

g.test('conversion,float32ToFloat16Bits').fn(t => {
g.test('float32ToFloat16Bits').fn(t => {
cases.forEach(value => {
// some loose check
// Does not handle clamping, underflow, overflow, or denormalized numbers.
t.expect(Math.abs(float32ToFloat16Bits(value[1]) - value[0]) <= 1, value[1].toString());
});
});

g.test('float32ToFloatBits_floatBitsToNumber')
.paramsSubcasesOnly(u =>
u
.combine('signed', [0, 1] as const)
.combine('exponentBits', [5, 8])
.combine('mantissaBits', [10, 23])
)
.fn(t => {
const { signed, exponentBits, mantissaBits } = t.params;
const bias = (1 << (exponentBits - 1)) - 1;

for (const [, value] of cases) {
if (value < 0 && signed === 0) continue;
const bits = float32ToFloatBits(value, signed, exponentBits, mantissaBits, bias);
const reconstituted = floatBitsToNumber(bits, { signed, exponentBits, mantissaBits, bias });
t.expect(Math.abs(reconstituted - value) <= 0.0000001, `${reconstituted} vs ${value}`);
}
});

g.test('floatBitsToULPFromZero,16').fn(t => {
const test = (bits: number, ulpFromZero: number) =>
t.expect(floatBitsToNormalULPFromZero(bits, kFloat16Format) === ulpFromZero, bits.toString(2));
// Zero
test(0b0_00000_0000000000, 0);
// Subnormal
test(0b0_00000_0000000001, 0);
test(0b1_00000_0000000001, 0);
test(0b0_00000_1111111111, 0);
test(0b1_00000_1111111111, 0);
// Normal
test(0b0_00001_0000000000, 1); // 0 + 1ULP
test(0b1_00001_0000000000, -1); // 0 - 1ULP
test(0b0_00001_0000000001, 2); // 0 + 2ULP
test(0b1_00001_0000000001, -2); // 0 - 2ULP
test(0b0_01110_0000000000, 0b01101_0000000001); // 0.5
test(0b1_01110_0000000000, -0b01101_0000000001); // -0.5
test(0b0_01110_1111111110, 0b01101_1111111111); // 1.0 - 2ULP
test(0b1_01110_1111111110, -0b01101_1111111111); // -(1.0 - 2ULP)
test(0b0_01110_1111111111, 0b01110_0000000000); // 1.0 - 1ULP
test(0b1_01110_1111111111, -0b01110_0000000000); // -(1.0 - 1ULP)
test(0b0_01111_0000000000, 0b01110_0000000001); // 1.0
test(0b1_01111_0000000000, -0b01110_0000000001); // -1.0
test(0b0_01111_0000000001, 0b01110_0000000010); // 1.0 + 1ULP
test(0b1_01111_0000000001, -0b01110_0000000010); // -(1.0 + 1ULP)
test(0b0_10000_0000000000, 0b01111_0000000001); // 2.0
test(0b1_10000_0000000000, -0b01111_0000000001); // -2.0

const testThrows = (b: number) =>
t.shouldThrow('Error', () => floatBitsToNormalULPFromZero(b, kFloat16Format));
// Infinity
testThrows(0b0_11111_0000000000);
testThrows(0b1_11111_0000000000);
// NaN
testThrows(0b0_11111_1111111111);
testThrows(0b1_11111_1111111111);
});

g.test('floatBitsToULPFromZero,32').fn(t => {
const test = (bits: number, ulpFromZero: number) =>
t.expect(floatBitsToNormalULPFromZero(bits, kFloat32Format) === ulpFromZero, bits.toString(2));
// Zero
test(0b0_00000000_00000000000000000000000, 0);
// Subnormal
test(0b0_00000000_00000000000000000000001, 0);
test(0b1_00000000_00000000000000000000001, 0);
test(0b0_00000000_11111111111111111111111, 0);
test(0b1_00000000_11111111111111111111111, 0);
// Normal
test(0b0_00000001_00000000000000000000000, 1); // 0 + 1ULP
test(0b1_00000001_00000000000000000000000, -1); // 0 - 1ULP
test(0b0_00000001_00000000000000000000001, 2); // 0 + 2ULP
test(0b1_00000001_00000000000000000000001, -2); // 0 - 2ULP
test(0b0_01111110_00000000000000000000000, 0b01111101_00000000000000000000001); // 0.5
test(0b1_01111110_00000000000000000000000, -0b01111101_00000000000000000000001); // -0.5
test(0b0_01111110_11111111111111111111110, 0b01111101_11111111111111111111111); // 1.0 - 2ULP
test(0b1_01111110_11111111111111111111110, -0b01111101_11111111111111111111111); // -(1.0 - 2ULP)
test(0b0_01111110_11111111111111111111111, 0b01111110_00000000000000000000000); // 1.0 - 1ULP
test(0b1_01111110_11111111111111111111111, -0b01111110_00000000000000000000000); // -(1.0 - 1ULP)
test(0b0_01111111_00000000000000000000000, 0b01111110_00000000000000000000001); // 1.0
test(0b1_01111111_00000000000000000000000, -0b01111110_00000000000000000000001); // -1.0
test(0b0_01111111_00000000000000000000001, 0b01111110_00000000000000000000010); // 1.0 + 1ULP
test(0b1_01111111_00000000000000000000001, -0b01111110_00000000000000000000010); // -(1.0 + 1ULP)
  test(0b0_10000000_00000000000000000000000, 0b01111111_00000000000000000000001); // 2.0
  test(0b1_10000000_00000000000000000000000, -0b01111111_00000000000000000000001); // -2.0

const testThrows = (b: number) =>
t.shouldThrow('Error', () => floatBitsToNormalULPFromZero(b, kFloat32Format));
// Infinity
testThrows(0b0_11111111_00000000000000000000000);
testThrows(0b1_11111111_00000000000000000000000);
// NaN
testThrows(0b0_11111111_11111111111111111111111);
testThrows(0b0_11111111_00000000000000000000001);
testThrows(0b1_11111111_11111111111111111111111);
testThrows(0b1_11111111_00000000000000000000001);
});
51 changes: 1 addition & 50 deletions src/webgpu/util/check_contents.ts
@@ -8,6 +8,7 @@ import {
} from '../../common/util/util.js';

import { float16BitsToFloat32 } from './conversion.js';
import { generatePrettyTable } from './pretty_diff_tables.js';

/** Generate an expected value at `index`, to test for equality with the actual value. */
export type CheckElementsGenerator = (index: number) => number;
@@ -236,53 +237,3 @@ function intToPaddedHex(number: number, { byteLength }: { byteLength: number })
if (number < 0) s = '-' + s;
return s;
}

/**
* Pretty-prints a "table" of cell values (each being `number | string`), right-aligned.
* Each row may be any iterator, including lazily-generated (potentially infinite) rows.
*
* The first argument is the printing options:
* - fillToWidth: Keep printing columns (as long as there is data) until this width is passed.
* If there is more data, "..." is appended.
* - numberToString: if a cell value is a number, this is used to stringify it.
*
* Each remaining argument provides one row for the table.
*/
function generatePrettyTable(
{ fillToWidth, numberToString }: { fillToWidth: number; numberToString: (n: number) => string },
rows: ReadonlyArray<Iterable<string | number>>
): string {
const rowStrings = range(rows.length, () => '');
let totalTableWidth = 0;
const iters = rows.map(row => row[Symbol.iterator]());

// Loop over columns
for (;;) {
const cellsForColumn = iters.map(iter => {
const r = iter.next(); // Advance the iterator for each row, in lock-step.
return r.done ? undefined : typeof r.value === 'number' ? numberToString(r.value) : r.value;
});
if (cellsForColumn.every(cell => cell === undefined)) break;

// Maximum width of any cell in this column, plus one for space between columns
// (also inserts a space at the left of the first column).
const colWidth = Math.max(...cellsForColumn.map(c => (c === undefined ? 0 : c.length))) + 1;
for (let row = 0; row < rowStrings.length; ++row) {
const cell = cellsForColumn[row];
if (cell !== undefined) {
rowStrings[row] += cell.padStart(colWidth);
}
}

totalTableWidth += colWidth;
if (totalTableWidth >= fillToWidth) {
for (let row = 0; row < rowStrings.length; ++row) {
if (cellsForColumn[row] !== undefined) {
rowStrings[row] += ' ...';
}
}
break;
}
}
return rowStrings.join('\n');
}
75 changes: 70 additions & 5 deletions src/webgpu/util/conversion.ts
@@ -101,11 +101,76 @@ export function float32ToFloat16Bits(n: number) {
* Decodes an IEEE754 16 bit floating point number into a JS `number` and returns.
*/
export function float16BitsToFloat32(float16Bits: number): number {
const buf = new DataView(new ArrayBuffer(Float32Array.BYTES_PER_ELEMENT));
// shift exponent and mantissa bits and fill with 0 on right, shift sign bit
buf.setUint32(0, ((float16Bits & 0x7fff) << 13) | ((float16Bits & 0x8000) << 16), true);
// shifting for bias different: f16 uses a bias of 15, f32 uses a bias of 127
return buf.getFloat32(0, true) * 2 ** (127 - 15);
return floatBitsToNumber(float16Bits, kFloat16Format);
}

type FloatFormat = { signed: 0 | 1; exponentBits: number; mantissaBits: number; bias: number };

/** FloatFormat defining IEEE754 32-bit float. */
export const kFloat32Format = { signed: 1, exponentBits: 8, mantissaBits: 23, bias: 127 } as const;
/** FloatFormat defining IEEE754 16-bit float. */
export const kFloat16Format = { signed: 1, exponentBits: 5, mantissaBits: 10, bias: 15 } as const;
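
The same FloatFormat shape can describe other small encodings. As an illustration only (these constants are not defined by this PR), the unsigned packed floats used by formats such as rg11b10ufloat would look like:

    // Illustration only, not exported by this PR: unsigned packed floats
    // (5 exponent bits, bias 15, no sign bit).
    const kFloat11UnsignedFormat = { signed: 0, exponentBits: 5, mantissaBits: 6, bias: 15 } as const;
    const kFloat10UnsignedFormat = { signed: 0, exponentBits: 5, mantissaBits: 5, bias: 15 } as const;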

const workingData = new ArrayBuffer(4);
const workingDataU32 = new Uint32Array(workingData);
const workingDataF32 = new Float32Array(workingData);
/** Bitcast u32 (represented as integer Number) to f32 (represented as floating-point Number). */
export function float32BitsToNumber(bits: number): number {
workingDataU32[0] = bits;
Review thread on this line:

Contributor: I think the takeaway here is: don't create a temporary TypedArray/ArrayBuffer just for reinterpretation.

kainino0x (Collaborator, Author), Mar 17, 2022: I measured the performance of this briefly while testing a bunch of other things. It didn't have a very large effect, but it was enough that it seemed worth using.
However, I tried measuring against the alternative just now (by just moving workingData* inside these functions) and I wasn't able to measure a difference... ~2020ms either way. Maybe it got optimized better somehow when written this way?

kainino0x (Collaborator, Author): Oh, the test case I was using is no longer bottlenecked on this function. I tested a different test case (rgba32float) which is, and the results are good.
webgpu:web_platform,copyToTexture,ImageBitmap:from_ImageData:alpha="premultiply";orientation="flipY";srcDoFlipYDuringCopy=false;dstColorFormat="rgba32float";dstPremultiplied=true
preallocated (this PR): 1640ms
late-allocated (same, but workingData moved inside the function): 2130ms
array-initialized (new Float32Array(new Uint32Array([bits]).buffer)[0]): 2260ms
Incidentally, I realized one of these functions is implemented wrong, so fixing that.

shaoboyan (Contributor), Mar 17, 2022: OK, so it seems that the takeaway is still correct! Thanks for resolving this performance issue!

return workingDataF32[0];
}
/** Bitcast f32 (represented as floating-point Number) to u32 (represented as integer Number). */
export function numberToFloat32Bits(number: number): number {
workingDataF32[0] = number;
return workingDataU32[0];
}
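
A rough micro-benchmark of the trade-off discussed in the review thread above might look like the following. This is an illustrative sketch, not code from this PR (the function names are hypothetical), and the timings quoted in the thread came from running real CTS cases rather than a loop like this.

    // Sketch, not part of this PR: compare preallocated reinterpretation
    // (reusing the module-level workingData views above) against allocating
    // a fresh typed array on every call.
    function bitcastPreallocated(bits: number): number {
      workingDataU32[0] = bits;
      return workingDataF32[0];
    }
    function bitcastPerCall(bits: number): number {
      return new Float32Array(new Uint32Array([bits]).buffer)[0];
    }
    function time(label: string, f: (bits: number) => number): void {
      const start = Date.now();
      let sum = 0;
      for (let i = 0; i < 1_000_000; ++i) sum += f(i);
      console.log(`${label}: ${Date.now() - start}ms (checksum ${sum})`);
    }
    time('preallocated', bitcastPreallocated);
    time('per-call allocation', bitcastPerCall);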

/**
* Decodes an IEEE754 float with the supplied format specification into a JS number.
*
* The format MUST be no larger than a 32-bit float.
*/
export function floatBitsToNumber(bits: number, fmt: FloatFormat): number {
// Pad the provided bits out to f32, then convert to a `number` with the wrong bias.
// E.g. for f16 to f32:
// - f16: S EEEEE MMMMMMMMMM
// ^ 000^^^^^ ^^^^^^^^^^0000000000000
// - f32: S eeeEEEEE MMMMMMMMMMmmmmmmmmmmmmm

const kNonSignBits = fmt.exponentBits + fmt.mantissaBits;
const kNonSignBitsMask = (1 << kNonSignBits) - 1;
const expAndMantBits = bits & kNonSignBitsMask;
let f32BitsWithWrongBias = expAndMantBits << (kFloat32Format.mantissaBits - fmt.mantissaBits);
f32BitsWithWrongBias |= (bits << (31 - kNonSignBits)) & 0x8000_0000;
const numberWithWrongBias = float32BitsToNumber(f32BitsWithWrongBias);
return numberWithWrongBias * 2 ** (kFloat32Format.bias - fmt.bias);
}
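
As a quick worked example of the bias adjustment (illustrative, not part of the diff): the f16 pattern 0b0_01111_0000000000 has exponent 15 and a zero mantissa, so after re-biasing it decodes to exactly 1.0.

    // Illustrative calls, not in this PR:
    floatBitsToNumber(0b0_01111_0000000000, kFloat16Format); // 1.0   (exponent 15 - bias 15 = 0)
    floatBitsToNumber(0b0_01110_0000000000, kFloat16Format); // 0.5   (exponent 14 - bias 15 = -1)
    floatBitsToNumber(0b1_10101_1001000000, kFloat16Format); // -100  (-1.5625 * 2^6; matches a case in conversion.spec.ts)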

/**
* Given a floating point number (as an integer representing its bits), computes how many ULPs it is
* from zero.
*
* Subnormal numbers are skipped, so that 0 is one ULP from the minimum normal number.
* Subnormal values are flushed to 0.
* Positive and negative 0 are both considered to be 0 ULPs from 0.
*/
export function floatBitsToNormalULPFromZero(bits: number, fmt: FloatFormat): number {
Review thread on this function:

Contributor: For the copy_to_texture 16-bit float (and 32-bit float) result comparison: with this helper function, it seems that we could check the result by

    // Assume expected is larger than actual
    floatBitsToNormalULPFromZero(Uint16(expected) - Uint16(actual), kFloat16Format) < constant?

Am I right?

Contributor: If this is the direction, I'm a bit worried about the running time. As you may know, the current compare logic is simple and hacky, but it already takes a long time because it requires a buffer view reinterpretation. If we add an extra operation, floatBitsToNormalULPFromZero, I think it will take even longer.
So, maybe a bit hacky, but do you think it is possible to take everything as Uint8 input and do the bit ops directly? That would save the buffer view reinterpretation.

Contributor: And another option is to save time somewhere other than the float compare.

Collaborator (Author): Performance is definitely a potential issue and I haven't investigated it enough yet; thanks for highlighting it. It's even worse than your example code, because it's more like floatBitsToNormalULPFromZero(expected) - floatBitsToNormalULPFromZero(actual).
We already have a diffULP helper for directly determining the ULPs between two values without computing them relative to zero. I'll investigate the performance and see what can be done.

Collaborator (Author): As a point of comparison,
webgpu:web_platform,copyToTexture,ImageBitmap:from_ImageData:alpha="none";orientation="none";srcDoFlipYDuringCopy=true;dstColorFormat="rgba16float";*
before: 2500ms each
after: 4300ms each
Not quite as bad as I expected, but it could probably be better.

Collaborator (Author): Looking at 7481681, I'm guessing it was float16BitsToFloat32/float16BitsToFloat32. Which is probably a little more expensive than floatBitsToNormalULPFromZero, though I wouldn't expect it to be that much worse.

Contributor: Yes, removing those two helper functions accelerated the tests a lot, but it is still much slower than the Uint8 comparison (on my machine), and about the same as the float32 comparison. So I suspect this is due to the reinterpretation (but I don't think that should take so long).
Thanks for testing! I understand that 4300ms is the time with diffULP applied, right?

Collaborator (Author): I dug into the performance of the ImageBitmap:from_ImageData test and found that it was a simple matter of implementing this optimization I had left for myself (in #1055):

    // MAINTENANCE_TODO: Could be faster to actually implement numberToBits directly.
    numberToBits: (components: PerTexelComponent<number>) =>
      ret.unpackBits(new Uint8Array(ret.pack(encode(components)))),

before: 2500ms
draft: 4300ms
after: 2030ms!

Contributor: Cool!

const mask_sign = fmt.signed << (fmt.exponentBits + fmt.mantissaBits);
const mask_expt = ((1 << fmt.exponentBits) - 1) << fmt.mantissaBits;
const mask_mant = (1 << fmt.mantissaBits) - 1;
const mask_rest = mask_expt | mask_mant;

assert(fmt.exponentBits + fmt.mantissaBits <= 31);

const sign = bits & mask_sign ? -1 : 1;
const rest = bits & mask_rest;
const subnormal_or_zero = (bits & mask_expt) === 0;
const infinity_or_nan = (bits & mask_expt) === mask_expt;
assert(!infinity_or_nan, 'no ulp representation for infinity/nan');

// The first normal number is mask_mant+1, so subtract mask_mant to make min_normal - zero = 1ULP.
const abs_ulp_from_zero = subnormal_or_zero ? 0 : rest - mask_mant;
return sign * abs_ulp_from_zero;
}
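
A minimal sketch of the texel comparison discussed in the review thread above, assuming raw f16 bits for both values (illustrative only; the function name and shape are hypothetical, and the real texture checks live elsewhere in the CTS):

    // Sketch, not part of this PR: compare two f16 values (as raw bits)
    // against a ULP tolerance. Subnormals are flushed to zero by the helper.
    function f16BitsWithinULP(expectedBits: number, actualBits: number, maxULP: number): boolean {
      const diff =
        floatBitsToNormalULPFromZero(expectedBits, kFloat16Format) -
        floatBitsToNormalULPFromZero(actualBits, kFloat16Format);
      return Math.abs(diff) <= maxULP;
    }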

13 changes: 11 additions & 2 deletions src/webgpu/util/math.ts
@@ -34,8 +34,11 @@ export function clamp(n: number, { min, max }: { min: number; max: number }): number {
}

/**
* @returns the Units of Last Place difference between the numbers a and b.
* If either `a` or `b` are not finite numbers, then diffULP() returns Infinity.
* @returns the (absolute) Units of Last Place difference between the float32 numbers a and b, taken
* as JS doubles. If either `a` or `b` are not finite numbers, then diffULP() returns Infinity.
*
* Subnormal numbers are skipped, so 0 is one ULP from the minimum normal number.
* Subnormal values are rounded to 0.
*/
export function diffULP(a: number, b: number): number {
if (!Number.isFinite(a) || !Number.isFinite(b)) {
@@ -317,3 +320,9 @@ export function multiplyMatrices(

return product;
}

/** Sign-extend the `bits`-bit number `n` to a 32-bit signed integer. */
export function signExtend(n: number, bits: number): number {
const shift = 32 - bits;
return (n << shift) >> shift;
}
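
For example (illustrative, not part of the diff):

    signExtend(0b1111, 4); // -1: bit 3 is the sign bit, so the upper bits are filled with 1s
    signExtend(0b0111, 4); //  7: sign bit clear, value unchanged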
51 changes: 51 additions & 0 deletions src/webgpu/util/pretty_diff_tables.ts
@@ -0,0 +1,51 @@
import { range } from '../../common/util/util.js';

/**
* Pretty-prints a "table" of cell values (each being `number | string`), right-aligned.
* Each row may be any iterator, including lazily-generated (potentially infinite) rows.
*
* The first argument is the printing options:
* - fillToWidth: Keep printing columns (as long as there is data) until this width is passed.
* If there is more data, "..." is appended.
* - numberToString: if a cell value is a number, this is used to stringify it.
*
* Each remaining argument provides one row for the table.
*/
export function generatePrettyTable(
{ fillToWidth, numberToString }: { fillToWidth: number; numberToString: (n: number) => string },
rows: ReadonlyArray<Iterable<string | number>>
): string {
const rowStrings = range(rows.length, () => '');
let totalTableWidth = 0;
const iters = rows.map(row => row[Symbol.iterator]());

// Loop over columns
for (;;) {
const cellsForColumn = iters.map(iter => {
const r = iter.next(); // Advance the iterator for each row, in lock-step.
return r.done ? undefined : typeof r.value === 'number' ? numberToString(r.value) : r.value;
});
if (cellsForColumn.every(cell => cell === undefined)) break;

// Maximum width of any cell in this column, plus one for space between columns
// (also inserts a space at the left of the first column).
const colWidth = Math.max(...cellsForColumn.map(c => (c === undefined ? 0 : c.length))) + 1;
for (let row = 0; row < rowStrings.length; ++row) {
const cell = cellsForColumn[row];
if (cell !== undefined) {
rowStrings[row] += cell.padStart(colWidth);
}
}

totalTableWidth += colWidth;
if (totalTableWidth >= fillToWidth) {
for (let row = 0; row < rowStrings.length; ++row) {
if (cellsForColumn[row] !== undefined) {
rowStrings[row] += ' ...';
}
}
break;
}
}
return rowStrings.join('\n');
}
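
A small usage sketch (not part of this PR), assuming numbers should be printed with two decimal places:

    // Illustrative call:
    const table = generatePrettyTable(
      { fillToWidth: 60, numberToString: n => n.toFixed(2) },
      [
        ['expected', 0, 0.5, 1],
        ['actual', 0, 0.25, 1],
      ]
    );
    // `table` is roughly:
    //  expected 0.00 0.50 1.00
    //    actual 0.00 0.25 1.00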