From 25b4c745388e4450500f356ad3d703b34ec8daed Mon Sep 17 00:00:00 2001 From: TheR1sing3un Date: Fri, 27 Sep 2024 17:12:24 +0800 Subject: [PATCH] refactor: polish record size estimation in compaction 1. polish record size estimation in compaction Signed-off-by: TheR1sing3un --- .../hudi/common/model/CompactionContext.java | 1 + .../hudi/common/util/SampleEstimator.java | 8 ++++---- .../SortedAppendOnlyExternalSpillableMap.java | 17 +++++++++-------- .../hudi/common/util/TestSampleEstimator.java | 2 +- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionContext.java b/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionContext.java index 59cc6bf61f30b..b42d96a7ded7a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionContext.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/CompactionContext.java @@ -171,6 +171,7 @@ public String toString() { + ", estimatedBaseFileSize=" + estimatedBaseFileSize + ", baseFileSorted=" + baseFileSorted + ", estimatedLogFileSize=" + estimatedLogFileSize + + ", logFileNum=" + logFilePaths.size() + ", logFilePaths=" + logFilePaths + ", logFilesRealSize=" + logFilesRealSize + '}'; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/SampleEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/SampleEstimator.java index a114e69c332be..eabf0d1090357 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/SampleEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/SampleEstimator.java @@ -38,15 +38,15 @@ public class SampleEstimator implements SizeEstimator { private int sampleCount; public SampleEstimator(SizeEstimator underlyingEstimator) { - this(DEFAULT_TRIGGER_SAMPLE_THRESHOLD, DEFAULT_SAMPLE_WRIGHT, underlyingEstimator); + this(underlyingEstimator, DEFAULT_TRIGGER_SAMPLE_THRESHOLD); } public SampleEstimator(SizeEstimator underlyingEstimator, int triggerSampleThreshold) { - this(triggerSampleThreshold, DEFAULT_SAMPLE_WRIGHT, underlyingEstimator); + this(underlyingEstimator, triggerSampleThreshold, DEFAULT_SAMPLE_WRIGHT); } // TODO: configure the triggerSampleThreshold and sampleWeight in the write config - public SampleEstimator(int triggerSampleThreshold, double sampleWeight, SizeEstimator underlyingEstimator) { + public SampleEstimator(SizeEstimator underlyingEstimator, int triggerSampleThreshold, double sampleWeight) { this.triggerSampleThreshold = triggerSampleThreshold; this.sampleWeight = sampleWeight; this.underlyingEstimator = underlyingEstimator; @@ -56,7 +56,7 @@ public SampleEstimator(int triggerSampleThreshold, double sampleWeight, SizeEsti } public SampleEstimator newInstance() { - return new SampleEstimator<>(triggerSampleThreshold, sampleWeight, underlyingEstimator); + return new SampleEstimator<>(underlyingEstimator, triggerSampleThreshold, sampleWeight); } public long getPerEstimatedSize() { diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/SortedAppendOnlyExternalSpillableMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/SortedAppendOnlyExternalSpillableMap.java index 61ca9887e7c63..fd498ec12c3f7 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/SortedAppendOnlyExternalSpillableMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/SortedAppendOnlyExternalSpillableMap.java @@ -411,10 +411,10 @@ public void close() throws IOException { } } - class MemoryCombinedMap { + private class MemoryCombinedMap { Map map; CombineFunc combineFunc; - long currentMemorySize; + long estimatedAverageKeyValueSize; public MemoryCombinedMap(Option> func) { this.map = new HashMap(); @@ -429,20 +429,21 @@ public V insert(K key, V value) { map.compute(key, (k, oldValue) -> { if (oldValue == null) { V initValue = combineFunc.initCombine(key, value); - long estimatedSize = keySizeEstimator.sizeEstimate(key) + valueSizeEstimator.sizeEstimate(initValue); - currentMemorySize += estimatedSize; + sampleSize(key, initValue); return initValue; } else { V combined = combineFunc.combine(key, value, oldValue); - // NOTE: call size-estimate function to sample, but not add to currentMemorySize - keySizeEstimator.sizeEstimate(key); - valueSizeEstimator.sizeEstimate(combined); + sampleSize(key, combined); return combined; } }); return map.get(key); } + private void sampleSize(K k, V v) { + estimatedAverageKeyValueSize = keySizeEstimator.sizeEstimate(k) + valueSizeEstimator.sizeEstimate(v); + } + public V get(K key) { return map.get(key); } @@ -456,7 +457,7 @@ public int size() { } public long getCurrentMemorySize() { - return currentMemorySize; + return map.size() * estimatedAverageKeyValueSize; } public Iterator getSortedIterator() { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/TestSampleEstimator.java b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSampleEstimator.java index ef0ad36d299d5..e50ba86bf8db4 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/util/TestSampleEstimator.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/TestSampleEstimator.java @@ -31,7 +31,7 @@ public class TestSampleEstimator { @Test public void testSampleEstimator() { final AtomicLong returnValue = new AtomicLong(100); - SampleEstimator estimator = new SampleEstimator(100, 0.1, (r) -> returnValue.get()); + SampleEstimator estimator = new SampleEstimator((r) -> returnValue.get(), 100, 0.1); long estimatedSize = 0; for (int i = 0; i < 1000; i++) { estimatedSize = estimator.sizeEstimate(i);