diff --git a/server/src/main/java/org/elasticsearch/common/util/LongLongHash.java b/server/src/main/java/org/elasticsearch/common/util/LongLongHash.java
new file mode 100644
index 0000000000000..3ffb98b19aefb
--- /dev/null
+++ b/server/src/main/java/org/elasticsearch/common/util/LongLongHash.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.common.util;
+
+import com.carrotsearch.hppc.BitMixer;
+
+import org.elasticsearch.common.lease.Releasables;
+
+/**
+ * Specialized hash table implementation similar to BytesRefHash that maps
+ * two long values to ids. Collisions are resolved with open addressing and
+ * linear probing, growth is smooth thanks to {@link BigArrays} and capacity
+ * is always a power of 2 for faster identification of buckets.
+ * This class is not thread-safe.
+ */
+// IDs are internally stored as id + 1 so that 0 encodes an empty slot
+public final class LongLongHash extends AbstractHash {
+    /**
+     * The keys of the hash, stored one after another. So the keys for an id
+     * are stored at {@code 2 * id} and {@code 2 * id + 1}. This arrangement
+     * makes {@link #add(long, long)} about 17% faster which seems worth it
+     * because it is in the critical path for aggregations.
+     */
+    private LongArray keys;
+
+    // Constructor with configurable capacity and default maximum load factor.
+    public LongLongHash(long capacity, BigArrays bigArrays) {
+        this(capacity, DEFAULT_MAX_LOAD_FACTOR, bigArrays);
+    }
+
+    // Constructor with configurable capacity and load factor.
+    public LongLongHash(long capacity, float maxLoadFactor, BigArrays bigArrays) {
+        super(capacity, maxLoadFactor, bigArrays);
+        keys = bigArrays.newLongArray(2 * capacity, false);
+    }
+
+    /**
+     * Return the first key of the entry with the given {@code id}, where
+     * {@code 0 <= id < size()}. The result is undefined otherwise.
+     */
+    public long getKey1(long id) {
+        return keys.get(2 * id);
+    }
+
+    /**
+     * Return the second key of the entry with the given {@code id}, where
+     * {@code 0 <= id < size()}. The result is undefined otherwise.
+     */
+    public long getKey2(long id) {
+        return keys.get(2 * id + 1);
+    }
+
+    /**
+     * Get the id associated with the key pair {@code <key1, key2>} or -1 if the pair is not contained in the hash.
+     */
+    public long find(long key1, long key2) {
+        final long slot = slot(hash(key1, key2), mask);
+        for (long index = slot; ; index = nextSlot(index, mask)) {
+            final long id = id(index);
+            long keyOffset = 2 * id;
+            if (id == -1 || (keys.get(keyOffset) == key1 && keys.get(keyOffset + 1) == key2)) {
+                return id;
+            }
+        }
+    }
+
+    private long set(long key1, long key2, long id) {
+        assert size < maxSize;
+        final long slot = slot(hash(key1, key2), mask);
+        for (long index = slot; ; index = nextSlot(index, mask)) {
+            final long curId = id(index);
+            if (curId == -1) { // means unset
+                id(index, id);
+                append(id, key1, key2);
+                ++size;
+                return id;
+            } else {
+                long keyOffset = 2 * curId;
+                if (keys.get(keyOffset) == key1 && keys.get(keyOffset + 1) == key2) {
+                    return -1 - curId;
+                }
+            }
+        }
+    }
+
+    private void append(long id, long key1, long key2) {
+        long keyOffset = 2 * id;
+        keys = bigArrays.grow(keys, keyOffset + 2);
+        keys.set(keyOffset, key1);
+        keys.set(keyOffset + 1, key2);
+    }
+
+    private void reset(long key1, long key2, long id) {
+        final long slot = slot(hash(key1, key2), mask);
+        for (long index = slot; ; index = nextSlot(index, mask)) {
+            final long curId = id(index);
+            if (curId == -1) { // means unset
+                id(index, id);
+                append(id, key1, key2);
+                break;
+            }
+        }
+    }
+
+    /**
+     * Try to add the key pair {@code <key1, key2>}. Return its newly allocated id
+     * if it wasn't in the hash table yet, or {@code -1-id} if it was already
+     * present in the hash table.
+     */
+    public long add(long key1, long key2) {
+        if (size >= maxSize) {
+            assert size == maxSize;
+            grow();
+        }
+        assert size < maxSize;
+        return set(key1, key2, size);
+    }
+
+    @Override
+    protected void removeAndAdd(long index) {
+        final long id = id(index, -1);
+        assert id >= 0;
+        long keyOffset = id * 2;
+        final long key1 = keys.set(keyOffset, 0);
+        final long key2 = keys.set(keyOffset + 1, 0);
+        reset(key1, key2, id);
+    }
+
+    @Override
+    public void close() {
+        Releasables.close(keys, () -> super.close());
+    }
+
+    static long hash(long key1, long key2) {
+        return 31 * BitMixer.mix(key1) + BitMixer.mix(key2);
+    }
+}
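For reference, the sign conventions compose like this: `add` hands back a fresh, dense id for a new pair and `-1 - id` when the pair already exists, while `find` returns the id or `-1`. A minimal usage sketch, not part of the patch; it leans on `BigArrays.NON_RECYCLING_INSTANCE` only to stay self-contained:

```java
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.LongLongHash;

class LongLongHashSketch {
    static void demo() {
        // The non-recycling BigArrays keeps the sketch dependency-free; real
        // code would use the BigArrays wired up by the node's services.
        try (LongLongHash hash = new LongLongHash(16, BigArrays.NON_RECYCLING_INSTANCE)) {
            long id = hash.add(42, 7);     // new pair: returns its id, here 0
            long dup = hash.add(42, 7);    // duplicate: returns -1 - id, here -1
            assert id == 0 && dup == -1 - id;
            assert hash.find(42, 7) == id; // present: returns the id
            assert hash.find(7, 42) == -1; // absent (key order matters): returns -1
        }
    }
}
```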
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongKeyedBucketOrds.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongKeyedBucketOrds.java
index b79dc74980a4d..370cb6419003b 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongKeyedBucketOrds.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongKeyedBucketOrds.java
@@ -20,11 +20,9 @@
 package org.elasticsearch.search.aggregations.bucket.terms;
 
 import org.elasticsearch.common.lease.Releasable;
-import org.elasticsearch.common.lease.Releasables;
 import org.elasticsearch.common.util.BigArrays;
-import org.elasticsearch.common.util.LongArray;
 import org.elasticsearch.common.util.LongHash;
-import org.elasticsearch.common.util.ObjectArray;
+import org.elasticsearch.common.util.LongLongHash;
 import org.elasticsearch.search.aggregations.CardinalityUpperBound;
 
 /**
@@ -124,6 +122,7 @@ public FromSingle(BigArrays bigArrays) {
 
         @Override
         public long add(long owningBucketOrd, long value) {
+            // This is in the critical path for collecting most aggs. Be careful of performance.
            assert owningBucketOrd == 0;
             return ords.add(value);
         }
@@ -189,121 +188,69 @@ public void close() {
      * Implementation that works properly when collecting from many buckets.
      */
     public static class FromMany extends LongKeyedBucketOrds {
-        // TODO we can almost certainly do better here by building something fit for purpose rather than trying to lego together stuff
-        private static class Buckets implements Releasable {
-            private final LongHash valueToThisBucketOrd;
-            private LongArray thisBucketOrdToGlobalOrd;
-
-            Buckets(BigArrays bigArrays) {
-                valueToThisBucketOrd = new LongHash(1, bigArrays);
-                thisBucketOrdToGlobalOrd = bigArrays.newLongArray(1, false);
-            }
-
-            @Override
-            public void close() {
-                Releasables.close(valueToThisBucketOrd, thisBucketOrdToGlobalOrd);
-            }
-        }
-        private final BigArrays bigArrays;
-        private ObjectArray<Buckets> owningOrdToBuckets;
-        private long lastGlobalOrd = -1;
+        private final LongLongHash ords;
 
         public FromMany(BigArrays bigArrays) {
-            this.bigArrays = bigArrays;
-            owningOrdToBuckets = bigArrays.newObjectArray(1);
+            ords = new LongLongHash(2, bigArrays);
         }
 
         @Override
         public long add(long owningBucketOrd, long value) {
-            Buckets buckets = bucketsForOrd(owningBucketOrd);
-            long thisBucketOrd = buckets.valueToThisBucketOrd.add(value);
-            if (thisBucketOrd < 0) {
-                // Already in the hash
-                thisBucketOrd = -1 - thisBucketOrd;
-                return -1 - buckets.thisBucketOrdToGlobalOrd.get(thisBucketOrd);
-            }
-            buckets.thisBucketOrdToGlobalOrd = bigArrays.grow(buckets.thisBucketOrdToGlobalOrd, thisBucketOrd + 1);
-            lastGlobalOrd++;
-            buckets.thisBucketOrdToGlobalOrd.set(thisBucketOrd, lastGlobalOrd);
-            return lastGlobalOrd;
-        }
-
-        private Buckets bucketsForOrd(long owningBucketOrd) {
-            if (owningOrdToBuckets.size() <= owningBucketOrd) {
-                owningOrdToBuckets = bigArrays.grow(owningOrdToBuckets, owningBucketOrd + 1);
-                Buckets buckets = new Buckets(bigArrays);
-                owningOrdToBuckets.set(owningBucketOrd, buckets);
-                return buckets;
-            }
-            Buckets buckets = owningOrdToBuckets.get(owningBucketOrd);
-            if (buckets == null) {
-                buckets = new Buckets(bigArrays);
-                owningOrdToBuckets.set(owningBucketOrd, buckets);
-            }
-            return buckets;
+            // This is in the critical path for collecting most aggs. Be careful of performance.
+            return ords.add(owningBucketOrd, value);
         }
 
         @Override
         public long find(long owningBucketOrd, long value) {
-            if (owningBucketOrd >= owningOrdToBuckets.size()) {
-                return -1;
-            }
-            Buckets buckets = owningOrdToBuckets.get(owningBucketOrd);
-            if (buckets == null) {
-                return -1;
-            }
-            long thisBucketOrd = buckets.valueToThisBucketOrd.find(value);
-            if (thisBucketOrd < 0) {
-                return -1;
-            }
-            return buckets.thisBucketOrdToGlobalOrd.get(thisBucketOrd);
+            return ords.find(owningBucketOrd, value);
         }
 
         @Override
         public long bucketsInOrd(long owningBucketOrd) {
-            if (owningBucketOrd >= owningOrdToBuckets.size()) {
-                return 0;
-            }
-            Buckets buckets = owningOrdToBuckets.get(owningBucketOrd);
-            if (buckets == null) {
-                return 0;
+            // TODO it'd be faster to count the number of buckets in a list of these ords rather than one at a time
+            long count = 0;
+            for (long i = 0; i < ords.size(); i++) {
+                if (ords.getKey1(i) == owningBucketOrd) {
+                    count++;
+                }
             }
-            return buckets.valueToThisBucketOrd.size();
+            return count;
         }
 
         @Override
         public long size() {
-            return lastGlobalOrd + 1;
+            return ords.size();
         }
 
         @Override
        public long maxOwningBucketOrd() {
-            return owningOrdToBuckets.size() - 1;
+            // TODO this is fairly expensive to compute. Can we avoid needing it?
+            long max = -1;
+            for (long i = 0; i < ords.size(); i++) {
+                max = Math.max(max, ords.getKey1(i));
+            }
+            return max;
         }
 
         @Override
         public BucketOrdsEnum ordsEnum(long owningBucketOrd) {
-            if (owningBucketOrd >= owningOrdToBuckets.size()) {
-                return BucketOrdsEnum.EMPTY;
-            }
-            Buckets buckets = owningOrdToBuckets.get(owningBucketOrd);
-            if (buckets == null) {
-                return BucketOrdsEnum.EMPTY;
-            }
+            // TODO it'd be faster to iterate many ords at once rather than one at a time
             return new BucketOrdsEnum() {
-                private long thisBucketOrd = -1;
+                private long ord = -1;
                 private long value;
-                private long ord;
 
                 @Override
                 public boolean next() {
-                    thisBucketOrd++;
-                    if (thisBucketOrd >= buckets.valueToThisBucketOrd.size()) {
-                        return false;
+                    while (true) {
+                        ord++;
+                        if (ord >= ords.size()) {
+                            return false;
+                        }
+                        if (ords.getKey1(ord) == owningBucketOrd) {
+                            value = ords.getKey2(ord);
+                            return true;
+                        }
                     }
-                    value = buckets.valueToThisBucketOrd.get(thisBucketOrd);
-                    ord = buckets.thisBucketOrdToGlobalOrd.get(thisBucketOrd);
-                    return true;
                 }
 
                 @Override
@@ -320,13 +267,7 @@ public long ord() {
 
         @Override
         public void close() {
-            for (long owningBucketOrd = 0; owningBucketOrd < owningOrdToBuckets.size(); owningBucketOrd++) {
-                Buckets buckets = owningOrdToBuckets.get(owningBucketOrd);
-                if (buckets != null) {
-                    buckets.close();
-                }
-            }
-            owningOrdToBuckets.close();
+            ords.close();
         }
     }
 }
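The replacement `ordsEnum` scans the shared hash for entries whose first key matches, instead of chasing a per-bucket `Buckets` object. A hedged sketch of the consuming side, where `buildBucketForOrd` is a hypothetical callback:

```java
// Walk every (value, global ord) pair recorded for one owning bucket.
// next() advances to the next entry whose first key equals owningBucketOrd
// and returns false once the hash is exhausted.
LongKeyedBucketOrds.BucketOrdsEnum ordsEnum = ords.ordsEnum(owningBucketOrd);
while (ordsEnum.next()) {
    long value = ordsEnum.value(); // the long key collected into this owning bucket
    long ord = ordsEnum.ord();     // ordinal assigned by add(), dense across all buckets
    buildBucketForOrd(value, ord); // hypothetical consumer
}
```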
diff --git a/server/src/test/java/org/elasticsearch/common/util/LongHashTests.java b/server/src/test/java/org/elasticsearch/common/util/LongHashTests.java
index 708e7beb861ec..31b499d5c4984 100644
--- a/server/src/test/java/org/elasticsearch/common/util/LongHashTests.java
+++ b/server/src/test/java/org/elasticsearch/common/util/LongHashTests.java
@@ -22,9 +22,10 @@
 import com.carrotsearch.hppc.LongLongHashMap;
 import com.carrotsearch.hppc.LongLongMap;
 import com.carrotsearch.hppc.cursors.LongLongCursor;
+
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.indices.breaker.NoneCircuitBreakerService;
-import org.elasticsearch.test.ESSingleNodeTestCase;
+import org.elasticsearch.test.ESTestCase;
 
 import java.util.HashMap;
 import java.util.HashSet;
@@ -32,7 +33,7 @@
 import java.util.Map;
 import java.util.Set;
 
-public class LongHashTests extends ESSingleNodeTestCase {
+public class LongHashTests extends ESTestCase {
     LongHash hash;
 
     private BigArrays randombigArrays() {
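Switching from `ESSingleNodeTestCase` to `ESTestCase` makes these plain unit tests; leak detection then comes from the mock allocator rather than a running node. A sketch of the pattern both test classes rely on (the capacity of 8 is arbitrary):

```java
// MockBigArrays tracks every allocation, and ESTestCase fails the test if an
// array is still live afterwards, hence the try-with-resources around the hash.
BigArrays bigArrays = new MockBigArrays(new MockPageCacheRecycler(Settings.EMPTY), new NoneCircuitBreakerService());
try (LongLongHash hash = new LongLongHash(8, bigArrays)) {
    hash.add(1, 2);
} // close() releases the key storage along with the id table
```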
+ */ + +package org.elasticsearch.common.util; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.indices.breaker.NoneCircuitBreakerService; +import org.elasticsearch.test.ESTestCase; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.hamcrest.Matchers.equalTo; + +public class LongLongHashTests extends ESTestCase { + private BigArrays randombigArrays() { + return new MockBigArrays(new MockPageCacheRecycler(Settings.EMPTY), new NoneCircuitBreakerService()); + } + + private LongLongHash randomHash() { + // Test high load factors to make sure that collision resolution works fine + final float maxLoadFactor = 0.6f + randomFloat() * 0.39f; + return new LongLongHash(randomIntBetween(0, 100), maxLoadFactor, randombigArrays()); + } + + public void testSimple() { + try (LongLongHash hash = randomHash()) { + assertThat(hash.add(0, 0), equalTo(0L)); + assertThat(hash.add(0, 1), equalTo(1L)); + assertThat(hash.add(0, 2), equalTo(2L)); + assertThat(hash.add(1, 0), equalTo(3L)); + assertThat(hash.add(1, 1), equalTo(4L)); + + assertThat(hash.add(0, 0), equalTo(-1L)); + assertThat(hash.add(0, 2), equalTo(-3L)); + assertThat(hash.add(1, 1), equalTo(-5L)); + + assertThat(hash.getKey1(0), equalTo(0L)); + assertThat(hash.getKey2(0), equalTo(0L)); + assertThat(hash.getKey1(4), equalTo(1L)); + assertThat(hash.getKey2(4), equalTo(1L)); + } + } + + public void testDuel() { + try (LongLongHash hash = randomHash()) { + int iters = scaledRandomIntBetween(100, 100000); + Key[] values = randomArray(1, iters, Key[]::new, () -> new Key(randomLong(), randomLong())); + Map keyToId = new HashMap<>(); + List idToKey = new ArrayList<>(); + for (int i = 0; i < iters; ++i) { + Key key = randomFrom(values); + if (keyToId.containsKey(key)) { + assertEquals(-1 - keyToId.get(key), hash.add(key.key1, key.key2)); + } else { + assertEquals(keyToId.size(), hash.add(key.key1, key.key2)); + keyToId.put(key, keyToId.size()); + idToKey.add(key); + } + } + + assertEquals(keyToId.size(), hash.size()); + for (Map.Entry entry : keyToId.entrySet()) { + assertEquals(entry.getValue().longValue(), hash.find(entry.getKey().key1, entry.getKey().key2)); + } + + assertEquals(idToKey.size(), hash.size()); + for (long i = 0; i < hash.capacity(); i++) { + long id = hash.id(i); + if (id >= 0) { + Key key = idToKey.get((int) id); + assertEquals(key.key1, hash.getKey1(id)); + assertEquals(key.key2, hash.getKey2(id)); + } + } + + for (long i = 0; i < hash.size(); i++) { + Key key = idToKey.get((int) i); + assertEquals(key.key1, hash.getKey1(i)); + assertEquals(key.key2, hash.getKey2(i)); + } + } + } + + class Key { + long key1; + long key2; + + Key(long key1, long key2) { + this.key1 = key1; + this.key2 = key2; + } + } + +}