From 0167b22ac6d4886a1c3157437a3c5b19e327723a Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 09:29:55 -0600 Subject: [PATCH] fix(deps): update module github.com/axiomhq/hyperloglog to v0.2.0 (#14722) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 +- .../github.com/axiomhq/hyperloglog/README.md | 54 ++-- vendor/github.com/axiomhq/hyperloglog/beta.go | 273 ++++++++++++++++++ .../axiomhq/hyperloglog/hyperloglog.go | 261 ++++++----------- .../axiomhq/hyperloglog/registers.go | 114 -------- .../github.com/axiomhq/hyperloglog/utils.go | 24 -- vendor/modules.txt | 4 +- 8 files changed, 394 insertions(+), 342 deletions(-) create mode 100644 vendor/github.com/axiomhq/hyperloglog/beta.go delete mode 100644 vendor/github.com/axiomhq/hyperloglog/registers.go diff --git a/go.mod b/go.mod index a2e37eca1d9f..2455994c4a49 100644 --- a/go.mod +++ b/go.mod @@ -117,7 +117,7 @@ require ( github.com/DmitriyVTitov/size v1.5.0 github.com/IBM/go-sdk-core/v5 v5.18.1 github.com/IBM/ibm-cos-sdk-go v1.11.1 - github.com/axiomhq/hyperloglog v0.0.0-20240507144631-af9851f82b27 + github.com/axiomhq/hyperloglog v0.2.0 github.com/buger/jsonparser v1.1.1 github.com/coder/quartz v0.1.2 github.com/d4l3k/messagediff v1.2.1 diff --git a/go.sum b/go.sum index 5638f0414bf2..e5b9d771add9 100644 --- a/go.sum +++ b/go.sum @@ -390,8 +390,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.16.1 h1:xsOtPAvHqhvQvBza5ohaUcfq1Lce github.com/aws/aws-sdk-go-v2/service/sts v1.16.1/go.mod h1:Aq2/Qggh2oemSfyHH+EO4UBbgWG6zFCXLHYI4ILTY7w= github.com/aws/smithy-go v1.11.1 h1:IQ+lPZVkSM3FRtyaDox41R8YS6iwPMYIreejOgPW49g= github.com/aws/smithy-go v1.11.1/go.mod h1:3xHYmszWVx2c0kIwQeEVf9uSm4fYZt67FBJnwub1bgM= -github.com/axiomhq/hyperloglog v0.0.0-20240507144631-af9851f82b27 h1:60m4tnanN1ctzIu4V3bfCNJ39BiOPSm1gHFlFjTkRE0= -github.com/axiomhq/hyperloglog v0.0.0-20240507144631-af9851f82b27/go.mod h1:k08r+Yj1PRAmuayFiRK6MYuR5Ve4IuZtTfxErMIh0+c= +github.com/axiomhq/hyperloglog v0.2.0 h1:u1XT3yyY1rjzlWuP6NQIrV4bRYHOaqZaovqjcBEvZJo= +github.com/axiomhq/hyperloglog v0.2.0/go.mod h1:GcgMjz9gaDKZ3G0UMS6Fq/VkZ4l7uGgcJyxA7M+omIM= github.com/baidubce/bce-sdk-go v0.9.197 h1:TQqa4J+FTagrywhaTQ707ffE1eG3ix1s06eSZ/K+Wk0= github.com/baidubce/bce-sdk-go v0.9.197/go.mod h1:zbYJMQwE4IZuyrJiFO8tO8NbtYiKTFTbwh4eIsqjVdg= github.com/baiyubin/aliyun-sts-go-sdk v0.0.0-20180326062324-cfa1a18b161f/go.mod h1:AuiFmCCPBSrqvVMvuqFuk0qogytodnVFVSN5CeJB8Gc= diff --git a/vendor/github.com/axiomhq/hyperloglog/README.md b/vendor/github.com/axiomhq/hyperloglog/README.md index cf5c05e842ca..5affa528733f 100644 --- a/vendor/github.com/axiomhq/hyperloglog/README.md +++ b/vendor/github.com/axiomhq/hyperloglog/README.md @@ -1,43 +1,43 @@ -HyperLogLog - an algorithm for approximating the number of distinct elements ---- +# HyperLogLog - an algorithm for approximating the number of distinct elements [![GoDoc](https://godoc.org/github.com/axiomhq/hyperloglog?status.svg)](https://godoc.org/github.com/axiomhq/hyperloglog) [![Go Report Card](https://goreportcard.com/badge/github.com/axiomhq/hyperloglog)](https://goreportcard.com/report/github.com/axiomhq/hyperloglog) [![CircleCI](https://circleci.com/gh/axiomhq/hyperloglog/tree/master.svg?style=svg)](https://circleci.com/gh/axiomhq/hyperloglog/tree/master) -An improved version of [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) for the count-distinct problem, approximating the number of distinct elements in a multiset **using 33-50% less space** than other usual HyperLogLog implementations. +An improved version of [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) for the count-distinct problem, approximating the number of distinct elements in a multiset. This implementation offers enhanced performance, flexibility, and simplicity while maintaining accuracy. -This work is based on ["Better with fewer bits: Improving the performance of cardinality estimation of large data streams - Qingjun Xiao, You Zhou, Shigang Chen"](http://cse.seu.edu.cn/PersonalPage/csqjxiao/csqjxiao_files/papers/INFOCOM17.pdf). +## Note on Implementation History -## Implementation +The initial version of this work (tagged as v0.1.0) was based on ["Better with fewer bits: Improving the performance of cardinality estimation of large data streams - Qingjun Xiao, You Zhou, Shigang Chen"](http://cse.seu.edu.cn/PersonalPage/csqjxiao/csqjxiao_files/papers/INFOCOM17.pdf). However, the current implementation has evolved significantly from this original basis, notably moving away from the tailcut method. -The core differences between this and other implementations are: -* **use metro hash** instead of xxhash -* **sparse representation** for lower cardinalities (like HyperLogLog++) -* **loglog-beta** for dynamic bias correction medium and high cardinalities. -* **4-bit register** instead of 5 (HLL) and 6 (HLL++), but most implementations use 1-byte registers out of convenience +## Current Implementation -In general it borrows a lot from [InfluxData's fork](https://github.com/influxdata/influxdb/tree/master/pkg/estimator/hll) of [Clark Duvall's HyperLogLog++ implementation](https://github.com/clarkduvall/hyperloglog), but uses **50% less space**. +The current implementation is based on the LogLog-Beta algorithm, as described in: -## Results -A direct comparison with the [HyperLogLog++ implementation used by InfluxDB](https://github.com/influxdata/influxdb/tree/master/pkg/estimator/hll) yielded the following results: +["LogLog-Beta and More: A New Algorithm for Cardinality Estimation Based on LogLog Counting"](https://arxiv.org/pdf/1612.02284) by Jason Qin, Denys Kim, and Yumei Tung (2016). -| Exact | Axiom (8.2 KB) | Influx (16.39 KB) | -| --- | --- | --- | -| 10 | 10 (0.0% off) | 10 (0.0% off) | -| 50 | 50 (0.0% off) | 50 (0.0% off) | -| 250 | 250 (0.0% off) | 250 (0.0% off) | -| 1250 | 1249 (0.08% off) | 1249 (0.08% off) | -| 6250 | 6250 (0.0% off) | 6250 (0.0% off) | -| 31250 | **31008 (0.7744% off)** | 31565 (1.0080% off) | -| 156250 | **156013 (0.1517% off)** | 156652 (0.2573% off) | -| 781250 | **782364 (0.1426% off)** | 775988 (0.6735% off) | -| 3906250 | 3869332 (0.9451% off) | **3889909 (0.4183% off)** | -| 10000000 | **9952682 (0.4732% off)** |9889556 (1.1044% off) | +Key features of the current implementation: +* **Metro hash** used instead of xxhash +* **Sparse representation** for lower cardinalities (like HyperLogLog++) +* **LogLog-Beta** for dynamic bias correction across all cardinalities +* **8-bit registers** for convenience and simplified implementation +* **Order-independent insertions and merging** for consistent results regardless of data input order +* **Removal of tailcut method** for a more straightforward approach +* **Flexible precision** allowing for 2^4 to 2^18 registers +This implementation is now more straightforward, efficient, and flexible, while remaining backwards compatible with previous versions. It provides a balance between precision, memory usage, speed, and ease of use. + +## Precision and Memory Usage + +This implementation allows for creating HyperLogLog sketches with arbitrary precision between 2^4 and 2^18 registers. The memory usage scales with the number of registers: + +* Minimum (2^4 registers): 16 bytes +* Default (2^14 registers): 16 KB +* Maximum (2^18 registers): 256 KB + +Users can choose the precision that best fits their use case, balancing memory usage against estimation accuracy. ## Note A big thank you to Prof. Shigang Chen and his team at the University of Florida who are actively conducting research around "Big Network Data". - ## Contributing Kindly check our [contributing guide](https://github.com/axiomhq/hyperloglog/blob/main/Contributing.md) on how to propose bugfixes and improvements, and submitting pull requests to the project @@ -48,4 +48,4 @@ Kindly check our [contributing guide](https://github.com/axiomhq/hyperloglog/blo Distributed under MIT License (`The MIT License`). -See [LICENSE](LICENSE) for more information. +See [LICENSE](LICENSE) for more information. \ No newline at end of file diff --git a/vendor/github.com/axiomhq/hyperloglog/beta.go b/vendor/github.com/axiomhq/hyperloglog/beta.go new file mode 100644 index 000000000000..29d560136ac5 --- /dev/null +++ b/vendor/github.com/axiomhq/hyperloglog/beta.go @@ -0,0 +1,273 @@ +package hyperloglog + +import ( + "fmt" + "math" +) + +var betaMap = map[uint8]func(float64) float64{ + 4: beta4, + 5: beta5, + 6: beta6, + 7: beta7, + 8: beta8, + 9: beta9, + 10: beta10, + 11: beta11, + 12: beta12, + 13: beta13, + 14: beta14, + 15: beta15, + 16: beta16, + 17: beta17, + 18: beta18, +} + +func beta(p uint8, ez float64) float64 { + f, ok := betaMap[p] + if !ok { + panic(fmt.Sprintf("invalid precision %d", p)) + } + return f(ez) +} + +/* +p=4 +[-0.582581413904517,-1.935300357560050,11.07932375 8035073,-22.131357446444323,22.505391846630037,-12 .000723834917984,3.220579408194167,-0.342225302271 235] +*/ +func beta4(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.582581413904517*ez + + -1.935300357560050*zl + + 11.079323758035073*math.Pow(zl, 2) + + -22.131357446444323*math.Pow(zl, 3) + + 22.505391846630037*math.Pow(zl, 4) + + -12.000723834917984*math.Pow(zl, 5) + + 3.220579408194167*math.Pow(zl, 6) + + -0.342225302271235*math.Pow(zl, 7) +} + +/* +p=5 +[-0.7518999460733967,-0.9590030077748760,5.5997371 322141607,-8.2097636999765520,6.5091254894472037,- 2.6830293734323729,0.5612891113138221,-0.046333162 2196545] +*/ +func beta5(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.7518999460733967*ez + + -0.9590030077748760*zl + + 5.5997371322141607*math.Pow(zl, 2) + + -8.2097636999765520*math.Pow(zl, 3) + + 6.5091254894472037*math.Pow(zl, 4) + + -2.6830293734323729*math.Pow(zl, 5) + + 0.5612891113138221*math.Pow(zl, 6) + + -0.0463331622196545*math.Pow(zl, 7) +} + +/* +p=6 +[29.8257900969619634,-31.3287083337725925,-10.5942 523036582283,-11.5720125689099618,3.81887543739074 92,-2.4160130328530811,0.4542208940970826,-0.05751 55452020420] +*/ +func beta6(ez float64) float64 { + zl := math.Log(ez + 1) + return 29.8257900969619634*ez + + -31.3287083337725925*zl + + -10.5942523036582283*math.Pow(zl, 2) + + -11.5720125689099618*math.Pow(zl, 3) + + 3.8188754373907492*math.Pow(zl, 4) + + -2.4160130328530811*math.Pow(zl, 5) + + 0.4542208940970826*math.Pow(zl, 6) + + -0.0575155452020420*math.Pow(zl, 7) +} + +/* +p=7 +[2.8102921290820060,-3.9780498518175995,1.31626800 41351582,-3.9252486335805901,2.0080835753946471,-0 .7527151937556955,0.1265569894242751,-0.0109946438726240] +*/ +func beta7(ez float64) float64 { + zl := math.Log(ez + 1) + return 2.8102921290820060*ez + + -3.9780498518175995*zl + + 1.3162680041351582*math.Pow(zl, 2) + + -3.9252486335805901*math.Pow(zl, 3) + + 2.0080835753946471*math.Pow(zl, 4) + + -0.7527151937556955*math.Pow(zl, 5) + + 0.1265569894242751*math.Pow(zl, 6) + + -0.0109946438726240*math.Pow(zl, 7) +} + +/* +p=8 +[1.00633544887550519,-2.00580666405112407,1.643697 49366514117,-2.70560809940566172,1.392099802442225 98,-0.46470374272183190,0.07384282377269775,-0.00578554885254223] +*/ +func beta8(ez float64) float64 { + zl := math.Log(ez + 1) + return 1.00633544887550519*ez + + -2.00580666405112407*zl + + 1.64369749366514117*math.Pow(zl, 2) + + -2.70560809940566172*math.Pow(zl, 3) + + 1.39209980244222598*math.Pow(zl, 4) + + -0.46470374272183190*math.Pow(zl, 5) + + 0.07384282377269775*math.Pow(zl, 6) + + -0.00578554885254223*math.Pow(zl, 7) +} + +/* +p=9 +[-0.09415657458167959,-0.78130975924550528,1.71514 946750712460,-1.73711250406516338,0.86441508489048 924,-0.23819027465047218,0.03343448400269076,-0.00 207858528178157] +*/ +func beta9(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.09415657458167959*ez + + -0.78130975924550528*zl + + 1.71514946750712460*math.Pow(zl, 2) + + -1.73711250406516338*math.Pow(zl, 3) + + 0.86441508489048924*math.Pow(zl, 4) + + -0.23819027465047218*math.Pow(zl, 5) + + 0.03343448400269076*math.Pow(zl, 6) + + -0.00207858528178157*math.Pow(zl, 7) +} + +/* +p=10 +[-0.25935400670790054,-0.52598301999805808,1.48933 034925876839,-1.29642714084993571,0.62284756217221615,-0.15672326770251041,0.02054415903878563,-0.00 112488483925502] +*/ +func beta10(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.25935400670790054*ez + + -0.52598301999805808*zl + + 1.48933034925876839*math.Pow(zl, 2) + + -1.29642714084993571*math.Pow(zl, 3) + + 0.62284756217221615*math.Pow(zl, 4) + + -0.15672326770251041*math.Pow(zl, 5) + + 0.02054415903878563*math.Pow(zl, 6) + + -0.00112488483925502*math.Pow(zl, 7) +} + +/* +p=11 +[-4.32325553856025e-01,-1.08450736399632e-01,6.091 56550741120e-01,-1.65687801845180e-02,-7.958293410 87617e-02,4.71830602102918e-02,-7.81372902346934e- 03,5.84268708489995e-04] +*/ +func beta11(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.432325553856025*ez + + -0.108450736399632*zl + + 0.609156550741120*math.Pow(zl, 2) + + -0.0165687801845180*math.Pow(zl, 3) + + -0.0795829341087617*math.Pow(zl, 4) + + 0.0471830602102918*math.Pow(zl, 5) + + -0.00781372902346934*math.Pow(zl, 6) + + 0.000584268708489995*math.Pow(zl, 7) +} + +/* +p=12 +[-3.84979202588598e-01,1.83162233114364e-01,1.3039 6688841854e-01,7.04838927629266e-02,-8.95893971464 453e-03,1.13010036741605e-02,-1.94285569591290e-03 ,2.25435774024964e-04] +*/ +func beta12(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.384979202588598*ez + + 0.183162233114364*zl + + 0.130396688841854*math.Pow(zl, 2) + + 0.0704838927629266*math.Pow(zl, 3) + + -0.0089589397146453*math.Pow(zl, 4) + + 0.0113010036741605*math.Pow(zl, 5) + + -0.00194285569591290*math.Pow(zl, 6) + + 0.000225435774024964*math.Pow(zl, 7) +} + +/* +p=13 +[-0.41655270946462997,-0.22146677040685156,0.38862 131236999947,0.45340979746062371,-0.36264738324476 375,0.12304650053558529,-0.01701540384555510,0.001 02750367080838] +*/ +func beta13(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.41655270946462997*ez + + -0.22146677040685156*zl + + 0.38862131236999947*math.Pow(zl, 2) + + 0.45340979746062371*math.Pow(zl, 3) + + -0.36264738324476375*math.Pow(zl, 4) + + 0.12304650053558529*math.Pow(zl, 5) + + -0.01701540384555510*math.Pow(zl, 6) + + 0.00102750367080838*math.Pow(zl, 7) +} + +/* +p=14 +[-3.71009760230692e-01,9.78811941207509e-03,1.8579 6293324165e-01,2.03015527328432e-01,-1.16710521803 686e-01,4.31106699492820e-02,-5.99583540511831e-03 ,4.49704299509437e-04] +*/ + +func beta14(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.371009760230692*ez + + 0.00978811941207509*zl + + 0.185796293324165*math.Pow(zl, 2) + + 0.203015527328432*math.Pow(zl, 3) + + -0.116710521803686*math.Pow(zl, 4) + + 0.0431106699492820*math.Pow(zl, 5) + + -0.00599583540511831*math.Pow(zl, 6) + + 0.000449704299509437*math.Pow(zl, 7) +} + +/* +p=15 +[-0.38215145543875273,-0.89069400536090837,0.37602 335774678869,0.99335977440682377,-0.65577441638318 956,0.18332342129703610,-0.02241529633062872,0.001 21399789330194] +*/ +func beta15(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.38215145543875273*ez + + -0.89069400536090837*zl + + 0.37602335774678869*math.Pow(zl, 2) + + 0.99335977440682377*math.Pow(zl, 3) + + -0.65577441638318956*math.Pow(zl, 4) + + 0.18332342129703610*math.Pow(zl, 5) + + -0.02241529633062872*math.Pow(zl, 6) + + 0.00121399789330194*math.Pow(zl, 7) +} + +/* +p=16 +[-0.37331876643753059,-1.41704077448122989,0.407291 84796612533,1.56152033906584164,-0.99242233534286128,0.26064681399483092,-0.03053811369682807,0.00155770210179105] +*/ +func beta16(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.37331876643753059*ez + + -1.41704077448122989*zl + + 0.40729184796612533*math.Pow(zl, 2) + + 1.56152033906584164*math.Pow(zl, 3) + + -0.99242233534286128*math.Pow(zl, 4) + + 0.26064681399483092*math.Pow(zl, 5) + + -0.03053811369682807*math.Pow(zl, 6) + + 0.00155770210179105*math.Pow(zl, 7) +} + +/* +p=17 +[-0.36775502299404605,0.53831422351377967,0.769702 89278767923,0.55002583586450560,-0.745755882611469 41,0.25711835785821952,-0.03437902606864149,0.0018 5949146371616] +*/ +func beta17(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.36775502299404605*ez + + 0.53831422351377967*zl + + 0.76970289278767923*math.Pow(zl, 2) + + 0.55002583586450560*math.Pow(zl, 3) + + -0.74575588261146941*math.Pow(zl, 4) + + 0.25711835785821952*math.Pow(zl, 5) + + -0.03437902606864149*math.Pow(zl, 6) + + 0.00185949146371616*math.Pow(zl, 7) +} + +/* +p=18 +[-0.36479623325960542,0.99730412328635032,1.553543 86230081221,1.25932677198028919,-1.533259482091101 63,0.47801042200056593,-0.05951025172951174,0.0029 1076804642205] +*/ +func beta18(ez float64) float64 { + zl := math.Log(ez + 1) + return -0.36479623325960542*ez + + 0.99730412328635032*zl + + 1.55354386230081221*math.Pow(zl, 2) + + 1.25932677198028919*math.Pow(zl, 3) + + -1.53325948209110163*math.Pow(zl, 4) + + 0.47801042200056593*math.Pow(zl, 5) + + -0.05951025172951174*math.Pow(zl, 6) + + 0.00291076804642205*math.Pow(zl, 7) +} diff --git a/vendor/github.com/axiomhq/hyperloglog/hyperloglog.go b/vendor/github.com/axiomhq/hyperloglog/hyperloglog.go index 58a13a5d6287..638b291cd23a 100644 --- a/vendor/github.com/axiomhq/hyperloglog/hyperloglog.go +++ b/vendor/github.com/axiomhq/hyperloglog/hyperloglog.go @@ -9,61 +9,36 @@ import ( ) const ( - capacity = uint8(16) - pp = uint8(25) - mp = uint32(1) << pp - version = 1 + pp = uint8(25) + mp = uint32(1) << pp + version = 2 ) -// Sketch is a HyperLogLog data-structure for the count-distinct problem, -// approximating the number of distinct elements in a multiset. type Sketch struct { p uint8 - b uint8 m uint32 alpha float64 tmpSet set sparseList *compressedList - regs *registers + regs []uint8 } -// New returns a HyperLogLog Sketch with 2^14 registers (precision 14) -func New() *Sketch { - return New14() -} - -// New14 returns a HyperLogLog Sketch with 2^14 registers (precision 14) -func New14() *Sketch { - sk, _ := newSketch(14, true) - return sk -} - -// New16 returns a HyperLogLog Sketch with 2^16 registers (precision 16) -func New16() *Sketch { - sk, _ := newSketch(16, true) - return sk -} - -// NewNoSparse returns a HyperLogLog Sketch with 2^14 registers (precision 14) -// that will not use a sparse representation -func NewNoSparse() *Sketch { - sk, _ := newSketch(14, false) - return sk -} +func New() *Sketch { return New14() } // New returns a HyperLogLog Sketch with 2^14 registers (precision 14) +func New14() *Sketch { return newSketchNoError(14, true) } // New14 returns a HyperLogLog Sketch with 2^14 registers (precision 14) +func New16() *Sketch { return newSketchNoError(16, true) } // New16 returns a HyperLogLog Sketch with 2^16 registers (precision 16) +func NewNoSparse() *Sketch { return newSketchNoError(14, false) } // NewNoSparse returns a HyperLogLog Sketch with 2^14 registers (precision 14) that will not use a sparse representation +func New16NoSparse() *Sketch { return newSketchNoError(16, false) } // New16NoSparse returns a HyperLogLog Sketch with 2^16 registers (precision 16) that will not use a sparse representation -// New16NoSparse returns a HyperLogLog Sketch with 2^16 registers (precision 16) -// that will not use a sparse representation -func New16NoSparse() *Sketch { - sk, _ := newSketch(16, false) +func newSketchNoError(precision uint8, sparse bool) *Sketch { + sk, _ := NewSketch(precision, sparse) return sk } -// newSketch returns a HyperLogLog Sketch with 2^precision registers -func newSketch(precision uint8, sparse bool) (*Sketch, error) { +func NewSketch(precision uint8, sparse bool) (*Sketch, error) { if precision < 4 || precision > 18 { return nil, fmt.Errorf("p has to be >= 4 and <= 18") } - m := uint32(math.Pow(2, float64(precision))) + m := uint32(1) << precision s := &Sketch{ m: m, p: precision, @@ -73,29 +48,22 @@ func newSketch(precision uint8, sparse bool) (*Sketch, error) { s.tmpSet = set{} s.sparseList = newCompressedList(0) } else { - s.regs = newRegisters(m) + s.regs = make([]uint8, m) } return s, nil } -func (sk *Sketch) sparse() bool { - return sk.sparseList != nil -} +func (sk *Sketch) sparse() bool { return sk.sparseList != nil } // Clone returns a deep copy of sk. func (sk *Sketch) Clone() *Sketch { - return &Sketch{ - b: sk.b, - p: sk.p, - m: sk.m, - alpha: sk.alpha, - tmpSet: sk.tmpSet.Clone(), - sparseList: sk.sparseList.Clone(), - regs: sk.regs.clone(), - } + clone := *sk + clone.regs = append([]uint8(nil), sk.regs...) + clone.tmpSet = sk.tmpSet.Clone() + clone.sparseList = sk.sparseList.Clone() + return &clone } -// Converts to normal if the sparse list is too large. func (sk *Sketch) maybeToNormal() { if uint32(len(sk.tmpSet))*100 > sk.m { sk.mergeSparse() @@ -105,75 +73,61 @@ func (sk *Sketch) maybeToNormal() { } } -// Merge takes another Sketch and combines it with Sketch h. -// If Sketch h is using the sparse Sketch, it will be converted -// to the normal Sketch. func (sk *Sketch) Merge(other *Sketch) error { if other == nil { - // Nothing to do return nil } - cpOther := other.Clone() - - if sk.p != cpOther.p { + if sk.p != other.p { return errors.New("precisions must be equal") } if sk.sparse() && other.sparse() { - for k := range other.tmpSet { - sk.tmpSet.add(k) - } - for iter := other.sparseList.Iter(); iter.HasNext(); { - sk.tmpSet.add(iter.Next()) - } - sk.maybeToNormal() - return nil + sk.mergeSparseSketch(other) + } else { + sk.mergeDenseSketch(other) } + return nil +} +func (sk *Sketch) mergeSparseSketch(other *Sketch) { + for k := range other.tmpSet { + sk.tmpSet.add(k) + } + for iter := other.sparseList.Iter(); iter.HasNext(); { + sk.tmpSet.add(iter.Next()) + } + sk.maybeToNormal() +} + +func (sk *Sketch) mergeDenseSketch(other *Sketch) { if sk.sparse() { sk.toNormal() } - if cpOther.sparse() { - for k := range cpOther.tmpSet { - i, r := decodeHash(k, cpOther.p, pp) + if other.sparse() { + for k := range other.tmpSet { + i, r := decodeHash(k, other.p, pp) sk.insert(i, r) } - - for iter := cpOther.sparseList.Iter(); iter.HasNext(); { - i, r := decodeHash(iter.Next(), cpOther.p, pp) + for iter := other.sparseList.Iter(); iter.HasNext(); { + i, r := decodeHash(iter.Next(), other.p, pp) sk.insert(i, r) } } else { - if sk.b < cpOther.b { - sk.regs.rebase(cpOther.b - sk.b) - sk.b = cpOther.b - } else { - cpOther.regs.rebase(sk.b - cpOther.b) - cpOther.b = sk.b - } - - for i, v := range cpOther.regs.tailcuts { - v1 := v.get(0) - if v1 > sk.regs.get(uint32(i)*2) { - sk.regs.set(uint32(i)*2, v1) - } - v2 := v.get(1) - if v2 > sk.regs.get(1+uint32(i)*2) { - sk.regs.set(1+uint32(i)*2, v2) + for i, v := range other.regs { + if v > sk.regs[i] { + sk.regs[i] = v } } } - return nil } -// Convert from sparse Sketch to dense Sketch. func (sk *Sketch) toNormal() { if len(sk.tmpSet) > 0 { sk.mergeSparse() } - sk.regs = newRegisters(sk.m) + sk.regs = make([]uint8, sk.m) for iter := sk.sparseList.Iter(); iter.HasNext(); { i, r := decodeHash(iter.Next(), sk.p, pp) sk.insert(i, r) @@ -183,81 +137,30 @@ func (sk *Sketch) toNormal() { sk.sparseList = nil } -func (sk *Sketch) insert(i uint32, r uint8) bool { - changed := false - if r-sk.b >= capacity { - //overflow - db := sk.regs.min() - if db > 0 { - sk.b += db - sk.regs.rebase(db) - changed = true - } - } - if r > sk.b { - val := r - sk.b - if c1 := capacity - 1; c1 < val { - val = c1 - } - - if val > sk.regs.get(i) { - sk.regs.set(i, val) - changed = true - } - } - return changed -} - -// Insert adds element e to sketch -func (sk *Sketch) Insert(e []byte) bool { - x := hash(e) - return sk.InsertHash(x) -} +func (sk *Sketch) insert(i uint32, r uint8) { sk.regs[i] = max(r, sk.regs[i]) } +func (sk *Sketch) Insert(e []byte) { sk.InsertHash(hash(e)) } -// InsertHash adds hash x to sketch -func (sk *Sketch) InsertHash(x uint64) bool { +func (sk *Sketch) InsertHash(x uint64) { if sk.sparse() { - changed := sk.tmpSet.add(encodeHash(x, sk.p, pp)) - if !changed { - return false - } - if uint32(len(sk.tmpSet))*100 > sk.m/2 { - sk.mergeSparse() - if uint32(sk.sparseList.Len()) > sk.m/2 { - sk.toNormal() - } + if sk.tmpSet.add(encodeHash(x, sk.p, pp)) { + sk.maybeToNormal() } - return true - } else { - i, r := getPosVal(x, sk.p) - return sk.insert(uint32(i), r) + return } + i, r := getPosVal(x, sk.p) + sk.insert(uint32(i), r) } -// Estimate returns the cardinality of the Sketch func (sk *Sketch) Estimate() uint64 { if sk.sparse() { sk.mergeSparse() return uint64(linearCount(mp, mp-sk.sparseList.count)) } - sum, ez := sk.regs.sumAndZeros(sk.b) + sum, ez := sumAndZeros(sk.regs) m := float64(sk.m) - var est float64 - - var beta func(float64) float64 - if sk.p < 16 { - beta = beta14 - } else { - beta = beta16 - } - - if sk.b == 0 { - est = sk.alpha * m * (m - ez) / (sum + beta(ez)) - } else { - est = sk.alpha * m * m / sum - } + est := sk.alpha * m * (m - ez) / (sum + beta(sk.p, ez)) return uint64(est + 0.5) } @@ -303,12 +206,13 @@ func (sk *Sketch) mergeSparse() { // MarshalBinary implements the encoding.BinaryMarshaler interface. func (sk *Sketch) MarshalBinary() (data []byte, err error) { + data = make([]byte, 0, 8+len(sk.regs)) // Marshal a version marker. data = append(data, version) // Marshal p. data = append(data, sk.p) // Marshal b - data = append(data, sk.b) + data = append(data, 0) if sk.sparse() { // It's using the sparse Sketch. @@ -333,7 +237,7 @@ func (sk *Sketch) MarshalBinary() (data []byte, err error) { data = append(data, byte(0)) // Add the dense sketch Sketch. - sz := len(sk.regs.tailcuts) + sz := len(sk.regs) data = append(data, []byte{ byte(sz >> 24), byte(sz >> 16), @@ -342,8 +246,8 @@ func (sk *Sketch) MarshalBinary() (data []byte, err error) { }...) // Marshal each element in the list. - for i := 0; i < len(sk.regs.tailcuts); i++ { - data = append(data, byte(sk.regs.tailcuts[i])) + for _, v := range sk.regs { + data = append(data, byte(v)) } return data, nil @@ -361,24 +265,23 @@ func (sk *Sketch) UnmarshalBinary(data []byte) error { // Unmarshal version. We may need this in the future if we make // non-compatible changes. - _ = data[0] + v := data[0] // Unmarshal p. p := data[1] // Unmarshal b. - sk.b = data[2] + b := data[2] // Determine if we need a sparse Sketch sparse := data[3] == byte(1) // Make a newSketch Sketch if the precision doesn't match or if the Sketch was used if sk.p != p || sk.regs != nil || len(sk.tmpSet) > 0 || (sk.sparseList != nil && sk.sparseList.Len() > 0) { - newh, err := newSketch(p, sparse) + newh, err := NewSketch(p, sparse) if err != nil { return err } - newh.b = sk.b *sk = *newh } @@ -406,19 +309,33 @@ func (sk *Sketch) UnmarshalBinary(data []byte) error { // Using the dense Sketch. sk.sparseList = nil sk.tmpSet = nil - dsz := binary.BigEndian.Uint32(data[4:8]) - sk.regs = newRegisters(dsz * 2) - data = data[8:] - - for i, val := range data { - sk.regs.tailcuts[i] = reg(val) - if uint8(sk.regs.tailcuts[i]<<4>>4) > 0 { - sk.regs.nz-- - } - if uint8(sk.regs.tailcuts[i]>>4) > 0 { - sk.regs.nz-- + + if v == 1 { + return sk.unmarshalBinaryV1(data[8:], b) + } + return sk.unmarshalBinaryV2(data) +} + +func sumAndZeros(regs []uint8) (res, ez float64) { + for _, v := range regs { + if v == 0 { + ez++ } + res += 1.0 / math.Pow(2.0, float64(v)) } + return res, ez +} + +func (sk *Sketch) unmarshalBinaryV1(data []byte, b uint8) error { + sk.regs = make([]uint8, len(data)*2) + for i, v := range data { + sk.regs[i*2] = uint8((v >> 4)) + b + sk.regs[i*2+1] = uint8((v<<4)>>4) + b + } + return nil +} +func (sk *Sketch) unmarshalBinaryV2(data []byte) error { + sk.regs = data[8:] return nil } diff --git a/vendor/github.com/axiomhq/hyperloglog/registers.go b/vendor/github.com/axiomhq/hyperloglog/registers.go deleted file mode 100644 index 19bb5d47f462..000000000000 --- a/vendor/github.com/axiomhq/hyperloglog/registers.go +++ /dev/null @@ -1,114 +0,0 @@ -package hyperloglog - -import ( - "math" -) - -type reg uint8 -type tailcuts []reg - -type registers struct { - tailcuts - nz uint32 -} - -func (r *reg) set(offset, val uint8) bool { - var isZero bool - if offset == 0 { - isZero = *r < 16 - tmpVal := uint8((*r) << 4 >> 4) - *r = reg(tmpVal | (val << 4)) - } else { - isZero = *r&0x0f == 0 - tmpVal := uint8((*r) >> 4 << 4) - *r = reg(tmpVal | val) - } - return isZero -} - -func (r *reg) get(offset uint8) uint8 { - if offset == 0 { - return uint8((*r) >> 4) - } - return uint8((*r) << 4 >> 4) -} - -func newRegisters(size uint32) *registers { - return ®isters{ - tailcuts: make(tailcuts, size/2), - nz: size, - } -} - -func (rs *registers) clone() *registers { - if rs == nil { - return nil - } - tc := make([]reg, len(rs.tailcuts)) - copy(tc, rs.tailcuts) - return ®isters{ - tailcuts: tc, - nz: rs.nz, - } -} - -func (rs *registers) rebase(delta uint8) { - nz := uint32(len(rs.tailcuts)) * 2 - for i := range rs.tailcuts { - for j := uint8(0); j < 2; j++ { - val := rs.tailcuts[i].get(j) - if val >= delta { - rs.tailcuts[i].set(j, val-delta) - if val-delta > 0 { - nz-- - } - } - } - } - rs.nz = nz -} - -func (rs *registers) set(i uint32, val uint8) { - offset, index := uint8(i)&1, i/2 - if rs.tailcuts[index].set(offset, val) { - rs.nz-- - } -} - -func (rs *registers) get(i uint32) uint8 { - offset, index := uint8(i)&1, i/2 - return rs.tailcuts[index].get(offset) -} - -func (rs *registers) sumAndZeros(base uint8) (res, ez float64) { - for _, r := range rs.tailcuts { - for j := uint8(0); j < 2; j++ { - v := float64(base + r.get(j)) - if v == 0 { - ez++ - } - res += 1.0 / math.Pow(2.0, v) - } - } - rs.nz = uint32(ez) - return res, ez -} - -func (rs *registers) min() uint8 { - if rs.nz > 0 { - return 0 - } - min := uint8(math.MaxUint8) - for _, r := range rs.tailcuts { - if r == 0 || min == 0 { - return 0 - } - if val := uint8(r << 4 >> 4); val < min { - min = val - } - if val := uint8(r >> 4); val < min { - min = val - } - } - return min -} diff --git a/vendor/github.com/axiomhq/hyperloglog/utils.go b/vendor/github.com/axiomhq/hyperloglog/utils.go index 58cd80cee5a5..c032c88d4ba0 100644 --- a/vendor/github.com/axiomhq/hyperloglog/utils.go +++ b/vendor/github.com/axiomhq/hyperloglog/utils.go @@ -9,30 +9,6 @@ import ( var hash = hashFunc -func beta14(ez float64) float64 { - zl := math.Log(ez + 1) - return -0.370393911*ez + - 0.070471823*zl + - 0.17393686*math.Pow(zl, 2) + - 0.16339839*math.Pow(zl, 3) + - -0.09237745*math.Pow(zl, 4) + - 0.03738027*math.Pow(zl, 5) + - -0.005384159*math.Pow(zl, 6) + - 0.00042419*math.Pow(zl, 7) -} - -func beta16(ez float64) float64 { - zl := math.Log(ez + 1) - return -0.37331876643753059*ez + - -1.41704077448122989*zl + - 0.40729184796612533*math.Pow(zl, 2) + - 1.56152033906584164*math.Pow(zl, 3) + - -0.99242233534286128*math.Pow(zl, 4) + - 0.26064681399483092*math.Pow(zl, 5) + - -0.03053811369682807*math.Pow(zl, 6) + - 0.00155770210179105*math.Pow(zl, 7) -} - func alpha(m float64) float64 { switch m { case 16: diff --git a/vendor/modules.txt b/vendor/modules.txt index 8068ab95ca4c..147776f446d7 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -486,8 +486,8 @@ github.com/aws/smithy-go/rand github.com/aws/smithy-go/time github.com/aws/smithy-go/transport/http github.com/aws/smithy-go/transport/http/internal/io -# github.com/axiomhq/hyperloglog v0.0.0-20240507144631-af9851f82b27 -## explicit; go 1.12 +# github.com/axiomhq/hyperloglog v0.2.0 +## explicit; go 1.21 github.com/axiomhq/hyperloglog # github.com/baidubce/bce-sdk-go v0.9.197 ## explicit; go 1.11