- 浏览: 120888 次
- 性别:
- 来自: 杭州
文章分类
最新评论
package hllctest
import java.util
import org.scalatest.{FlatSpec}
import org.spark.sqludf.HLLCounter
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
class HllcCrossSetTest extends FlatSpec {
val ramdom = new Random()
val m = 18
// 用于验证hllc 的错误率
def errorRateCal(sampleCount: Int) = {
errorRate(0.01, sampleCount)
errorRate(0.05, sampleCount)
errorRate(0.1, sampleCount)
errorRate(0.2, sampleCount)
errorRate(0.5, sampleCount)
}
"hllc test" should "hllc merge,mix error rate" in {
errorRateCal(1000)
errorRateCal(2000)
errorRateCal(5000)
errorRateCal(10000)
errorRateCal(20000)
errorRateCal(50000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
}
// 不放回抽样 测试集合生成
def getRandomStr(setCollection: mutable.HashSet[String], totalIntArray: Array[Int]): Unit = {
val str = getTestString(totalIntArray)
if (!setCollection.contains(str)) setCollection.add(str)
else
getRandomStr(setCollection, totalIntArray)
}
//5% 100w 级别集合 计算总量
def errorRate(r: Double, testSetLength: Int) = {
println(s" ********************begin r :{$r},testSetLength:{$testSetLength} ,m:{$m} ***********************")
var setA = new mutable.HashSet[String]()
var setB = new mutable.HashSet[String]()
val tatolCount = (testSetLength / r).toInt
var b = System.currentTimeMillis()
val totalIntArray = (tatolCount + "").toCharArray.map(x => x.toString.toInt)
var timeRecord = System.currentTimeMillis()
for (i <- 0 until testSetLength) {
getRandomStr(setA, totalIntArray)
getRandomStr(setB, totalIntArray)
// if (i%5000 == 0) {
// println(s" generate data ${i} ,cost: ${System.currentTimeMillis()-timeRecord}) ")
// timeRecord = System.currentTimeMillis()
// }
}
println(s"tatolCount: ${tatolCount} ,r : ${r} setA size: ${setA.size} , setB size: ${setB.size} ")
var e = System.currentTimeMillis()
println(s" generate data cost time: ${e - b} ")
/*
* realMix 交集
* hllcMix
* realMerge
* hllcMerge
* */
b = System.currentTimeMillis()
val realMixCnt = realMix(setA, setB)
val mixRate = realMixRate(realMixCnt, setA)
println(s" realMixCnt: ${realMixCnt} , mixRate:${mixRate}")
e = System.currentTimeMillis()
println(s" Map collection cost time: ${e - b} ")
b = System.currentTimeMillis()
val hllcMixCnt = hllcMix(setA, setB)
val mixRatehllc = realMixRate(hllcMixCnt, setA)
println(s" hllcMixCnt: ${hllcMixCnt} , mixRatehllc:${mixRatehllc}")
val distinct = mixRatehllc - mixRate
println(f" mixRatehllc - mixRate: $distinct%1.6f ")
println(f" hllcMixCnt - realMixCnt: ${hllcMixCnt - realMixCnt} ")
e = System.currentTimeMillis()
println(s" hllc cost time: ${e - b} ")
}
def realMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
var mixCount = 0
setB.foreach(str => if (hashSet.contains(str)) mixCount += 1)
mixCount
}
def realMixRate(mixCount: Int, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def realMixRate(mixCount: Long, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def hllcMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]): Long = {
val hllc16A = new HLLCounter(m)
setA.foreach(item => hllc16A.add(item))
val hllc16B = new HLLCounter(m)
setA.foreach(item => hllc16B.add(item))
hllc16A.getCountEstimate + hllc16B.getCountEstimate - hllcMerge(setA, setB)
}
def hllcMerge(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hllc16 = new HLLCounter(m)
setA.foreach(item => hllc16.add(item))
setB.foreach(item => hllc16.add(item))
hllc16.getCountEstimate
}
def realMerge(setA: ArrayBuffer[String], setB: ArrayBuffer[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
setB.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
hashSet.size()
}
def getTestString(totalCountArray: Array[Int]) = {
val sbf = new StringBuffer()
//没一位的数字是几, 然后根据这个来生成随机数
totalCountArray.foreach(s => {
if (!0.equals(s))
sbf.append(getRamdomStringS(s))
else sbf.append(getRamdomStringS(10))
})
sbf.toString
}
// n -> 10 ^^n
def getRamdomString(length: Int): String = {
val sbf = new StringBuffer()
for (i <- 0 until length) sbf.append((ramdom.nextInt(10) + 97).toChar)
sbf.toString
}
// n -> 10 ^^n
def getRamdomStringS(l: Int): String = {
(ramdom.nextInt(l) + 97).toChar.toString
}
}
********************begin r :{0.01},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.01 setA size: 1000 , setB size: 1000
generate data cost time: 48
realMixCnt: 12 , mixRate:0.012
Map collection cost time: 6
hllcMixCnt: 8 , mixRatehllc:0.008
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -4
hllc cost time: 131
********************begin r :{0.05},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.05 setA size: 1000 , setB size: 1000
generate data cost time: 4
realMixCnt: 49 , mixRate:0.049
Map collection cost time: 3
hllcMixCnt: 48 , mixRatehllc:0.048
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 19
********************begin r :{0.1},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.1 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 108 , mixRate:0.108
Map collection cost time: 5
hllcMixCnt: 107 , mixRatehllc:0.107
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 15
********************begin r :{0.2},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 5000 ,r : 0.2 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 196 , mixRate:0.196
Map collection cost time: 1
hllcMixCnt: 195 , mixRatehllc:0.195
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 16
********************begin r :{0.5},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 2000 ,r : 0.5 setA size: 1000 , setB size: 1000
generate data cost time: 7
realMixCnt: 489 , mixRate:0.489
Map collection cost time: 1
hllcMixCnt: 490 , mixRatehllc:0.49
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 1
hllc cost time: 8
********************begin r :{0.01},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.01 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 11 , mixRate:0.0055
Map collection cost time: 0
hllcMixCnt: 19 , mixRatehllc:0.0095
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 24
********************begin r :{0.05},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.05 setA size: 2000 , setB size: 2000
generate data cost time: 5
realMixCnt: 102 , mixRate:0.051
Map collection cost time: 1
hllcMixCnt: 110 , mixRatehllc:0.055
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 11
********************begin r :{0.1},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.1 setA size: 2000 , setB size: 2000
generate data cost time: 4
realMixCnt: 192 , mixRate:0.096
Map collection cost time: 0
hllcMixCnt: 192 , mixRatehllc:0.096
mixRatehllc - mixRate: 0.000000
hllcMixCnt - realMixCnt: 0
hllc cost time: 11
********************begin r :{0.2},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.2 setA size: 2000 , setB size: 2000
generate data cost time: 3
realMixCnt: 395 , mixRate:0.1975
Map collection cost time: 1
hllcMixCnt: 387 , mixRatehllc:0.1935
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -8
hllc cost time: 12
********************begin r :{0.5},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 4000 ,r : 0.5 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 986 , mixRate:0.493
Map collection cost time: 1
hllcMixCnt: 981 , mixRatehllc:0.4905
mixRatehllc - mixRate: -0.002500
hllcMixCnt - realMixCnt: -5
hllc cost time: 16
********************begin r :{0.01},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.01 setA size: 5000 , setB size: 5000
generate data cost time: 17
realMixCnt: 48 , mixRate:0.0096
Map collection cost time: 2
hllcMixCnt: 41 , mixRatehllc:0.0082
mixRatehllc - mixRate: -0.001400
hllcMixCnt - realMixCnt: -7
hllc cost time: 13
********************begin r :{0.05},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.05 setA size: 5000 , setB size: 5000
generate data cost time: 7
realMixCnt: 263 , mixRate:0.0526
Map collection cost time: 1
hllcMixCnt: 271 , mixRatehllc:0.0542
mixRatehllc - mixRate: 0.001600
hllcMixCnt - realMixCnt: 8
hllc cost time: 13
********************begin r :{0.1},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.1 setA size: 5000 , setB size: 5000
generate data cost time: 6
realMixCnt: 527 , mixRate:0.1054
Map collection cost time: 0
hllcMixCnt: 526 , mixRatehllc:0.1052
mixRatehllc - mixRate: -0.000200
hllcMixCnt - realMixCnt: -1
hllc cost time: 26
********************begin r :{0.2},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 25000 ,r : 0.2 setA size: 5000 , setB size: 5000
generate data cost time: 16
realMixCnt: 2505 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 2496 , mixRatehllc:0.4992
mixRatehllc - mixRate: -0.001800
hllcMixCnt - realMixCnt: -9
hllc cost time: 16
********************begin r :{0.5},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.5 setA size: 5000 , setB size: 5000
generate data cost time: 8
realMixCnt: 2499 , mixRate:0.4998
Map collection cost time: 1
hllcMixCnt: 2505 , mixRatehllc:0.501
mixRatehllc - mixRate: 0.001200
hllcMixCnt - realMixCnt: 6
hllc cost time: 14
********************begin r :{0.01},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.01 setA size: 10000 , setB size: 10000
generate data cost time: 15
realMixCnt: 103 , mixRate:0.0103
Map collection cost time: 2
hllcMixCnt: 62 , mixRatehllc:0.0062
mixRatehllc - mixRate: -0.004100
hllcMixCnt - realMixCnt: -41
hllc cost time: 24
********************begin r :{0.05},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.05 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 484 , mixRate:0.0484
Map collection cost time: 2
hllcMixCnt: 467 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.001700
hllcMixCnt - realMixCnt: -17
hllc cost time: 18
********************begin r :{0.1},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.1 setA size: 10000 , setB size: 10000
generate data cost time: 11
realMixCnt: 938 , mixRate:0.0938
Map collection cost time: 1
hllcMixCnt: 967 , mixRatehllc:0.0967
mixRatehllc - mixRate: 0.002900
hllcMixCnt - realMixCnt: 29
hllc cost time: 12
********************begin r :{0.2},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.2 setA size: 10000 , setB size: 10000
generate data cost time: 10
realMixCnt: 1997 , mixRate:0.1997
Map collection cost time: 5
hllcMixCnt: 1999 , mixRatehllc:0.1999
mixRatehllc - mixRate: 0.000200
hllcMixCnt - realMixCnt: 2
hllc cost time: 15
********************begin r :{0.5},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.5 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 5010 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 4990 , mixRatehllc:0.499
mixRatehllc - mixRate: -0.002000
hllcMixCnt - realMixCnt: -20
hllc cost time: 23
********************begin r :{0.01},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.01 setA size: 20000 , setB size: 20000
generate data cost time: 41
realMixCnt: 218 , mixRate:0.0109
Map collection cost time: 4
hllcMixCnt: 134 , mixRatehllc:0.0067
mixRatehllc - mixRate: -0.004200
hllcMixCnt - realMixCnt: -84
hllc cost time: 36
********************begin r :{0.05},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.05 setA size: 20000 , setB size: 20000
generate data cost time: 19
realMixCnt: 946 , mixRate:0.0473
Map collection cost time: 3
hllcMixCnt: 949 , mixRatehllc:0.04745
mixRatehllc - mixRate: 0.000150
hllcMixCnt - realMixCnt: 3
hllc cost time: 23
********************begin r :{0.1},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.1 setA size: 20000 , setB size: 20000
generate data cost time: 26
realMixCnt: 2001 , mixRate:0.10005
Map collection cost time: 10
hllcMixCnt: 2080 , mixRatehllc:0.104
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 56
********************begin r :{0.2},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.2 setA size: 20000 , setB size: 20000
generate data cost time: 28
realMixCnt: 4034 , mixRate:0.2017
Map collection cost time: 7
hllcMixCnt: 4113 , mixRatehllc:0.20565
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 25
********************begin r :{0.5},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.5 setA size: 20000 , setB size: 20000
generate data cost time: 24
realMixCnt: 9975 , mixRate:0.49875
Map collection cost time: 8
hllcMixCnt: 9994 , mixRatehllc:0.4997
mixRatehllc - mixRate: 0.000950
hllcMixCnt - realMixCnt: 19
hllc cost time: 23
********************begin r :{0.01},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.01 setA size: 50000 , setB size: 50000
generate data cost time: 88
realMixCnt: 468 , mixRate:0.00936
Map collection cost time: 48
hllcMixCnt: 603 , mixRatehllc:0.01206
mixRatehllc - mixRate: 0.002700
hllcMixCnt - realMixCnt: 135
hllc cost time: 163
********************begin r :{0.05},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.05 setA size: 50000 , setB size: 50000
generate data cost time: 99
realMixCnt: 2381 , mixRate:0.04762
Map collection cost time: 17
hllcMixCnt: 2335 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.000920
hllcMixCnt - realMixCnt: -46
hllc cost time: 59
********************begin r :{0.1},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.1 setA size: 50000 , setB size: 50000
generate data cost time: 52
realMixCnt: 5091 , mixRate:0.10182
Map collection cost time: 19
hllcMixCnt: 5116 , mixRatehllc:0.10232
mixRatehllc - mixRate: 0.000500
hllcMixCnt - realMixCnt: 25
hllc cost time: 88
********************begin r :{0.2},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 250000 ,r : 0.2 setA size: 50000 , setB size: 50000
generate data cost time: 72
realMixCnt: 24889 , mixRate:0.49778
Map collection cost time: 18
hllcMixCnt: 25002 , mixRatehllc:0.50004
mixRatehllc - mixRate: 0.002260
hllcMixCnt - realMixCnt: 113
hllc cost time: 61
********************begin r :{0.5},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.5 setA size: 50000 , setB size: 50000
generate data cost time: 86
realMixCnt: 25140 , mixRate:0.5028
Map collection cost time: 15
hllcMixCnt: 25190 , mixRatehllc:0.5038
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 50
hllc cost time: 71
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 154
realMixCnt: 1051 , mixRate:0.01051
Map collection cost time: 29
hllcMixCnt: 811 , mixRatehllc:0.00811
mixRatehllc - mixRate: -0.002400
hllcMixCnt - realMixCnt: -240
hllc cost time: 232
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 171
realMixCnt: 4903 , mixRate:0.04903
Map collection cost time: 19
hllcMixCnt: 5095 , mixRatehllc:0.05095
mixRatehllc - mixRate: 0.001920
hllcMixCnt - realMixCnt: 192
hllc cost time: 122
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 131
realMixCnt: 9931 , mixRate:0.09931
Map collection cost time: 42
hllcMixCnt: 10136 , mixRatehllc:0.10136
mixRatehllc - mixRate: 0.002050
hllcMixCnt - realMixCnt: 205
hllc cost time: 155
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 117
realMixCnt: 20148 , mixRate:0.20148
Map collection cost time: 35
hllcMixCnt: 20414 , mixRatehllc:0.20414
mixRatehllc - mixRate: 0.002660
hllcMixCnt - realMixCnt: 266
hllc cost time: 111
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 130
realMixCnt: 49964 , mixRate:0.49964
Map collection cost time: 35
hllcMixCnt: 50268 , mixRatehllc:0.50268
mixRatehllc - mixRate: 0.003040
hllcMixCnt - realMixCnt: 304
hllc cost time: 133
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 260
realMixCnt: 2035 , mixRate:0.010175
Map collection cost time: 83
hllcMixCnt: 1247 , mixRatehllc:0.006235
mixRatehllc - mixRate: -0.003940
hllcMixCnt - realMixCnt: -788
hllc cost time: 389
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 311
realMixCnt: 10159 , mixRate:0.050795
Map collection cost time: 94
hllcMixCnt: 10030 , mixRatehllc:0.05015
mixRatehllc - mixRate: -0.000645
hllcMixCnt - realMixCnt: -129
hllc cost time: 308
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 255
realMixCnt: 20009 , mixRate:0.100045
Map collection cost time: 133
hllcMixCnt: 19539 , mixRatehllc:0.097695
mixRatehllc - mixRate: -0.002350
hllcMixCnt - realMixCnt: -470
hllc cost time: 235
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 229
realMixCnt: 39946 , mixRate:0.19973
Map collection cost time: 92
hllcMixCnt: 41310 , mixRatehllc:0.20655
mixRatehllc - mixRate: 0.006820
hllcMixCnt - realMixCnt: 1364
hllc cost time: 271
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 357
realMixCnt: 100095 , mixRate:0.500475
Map collection cost time: 93
hllcMixCnt: 100242 , mixRatehllc:0.50121
mixRatehllc - mixRate: 0.000735
hllcMixCnt - realMixCnt: 147
hllc cost time: 422
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 758
realMixCnt: 5084 , mixRate:0.010168
Map collection cost time: 211
hllcMixCnt: 2978 , mixRatehllc:0.005956
mixRatehllc - mixRate: -0.004212
hllcMixCnt - realMixCnt: -2106
hllc cost time: 844
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 721
realMixCnt: 25296 , mixRate:0.050592
Map collection cost time: 222
hllcMixCnt: 23440 , mixRatehllc:0.04688
mixRatehllc - mixRate: -0.003712
hllcMixCnt - realMixCnt: -1856
hllc cost time: 699
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 688
realMixCnt: 50178 , mixRate:0.100356
Map collection cost time: 200
hllcMixCnt: 45070 , mixRatehllc:0.09014
mixRatehllc - mixRate: -0.010216
hllcMixCnt - realMixCnt: -5108
hllc cost time: 701
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 897
realMixCnt: 249899 , mixRate:0.499798
Map collection cost time: 223
hllcMixCnt: 250263 , mixRatehllc:0.500526
mixRatehllc - mixRate: 0.000728
hllcMixCnt - realMixCnt: 364
hllc cost time: 658
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 868
realMixCnt: 249895 , mixRate:0.49979
Map collection cost time: 245
hllcMixCnt: 249916 , mixRatehllc:0.499832
mixRatehllc - mixRate: 0.000042
hllcMixCnt - realMixCnt: 21
hllc cost time: 724
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 110
realMixCnt: 1026 , mixRate:0.01026
Map collection cost time: 28
hllcMixCnt: 569 , mixRatehllc:0.00569
mixRatehllc - mixRate: -0.004570
hllcMixCnt - realMixCnt: -457
hllc cost time: 95
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 91
realMixCnt: 5024 , mixRate:0.05024
Map collection cost time: 26
hllcMixCnt: 5439 , mixRatehllc:0.05439
mixRatehllc - mixRate: 0.004150
hllcMixCnt - realMixCnt: 415
hllc cost time: 131
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 93
realMixCnt: 9925 , mixRate:0.09925
Map collection cost time: 28
hllcMixCnt: 10201 , mixRatehllc:0.10201
mixRatehllc - mixRate: 0.002760
hllcMixCnt - realMixCnt: 276
hllc cost time: 141
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 90
realMixCnt: 19983 , mixRate:0.19983
Map collection cost time: 32
hllcMixCnt: 19936 , mixRatehllc:0.19936
mixRatehllc - mixRate: -0.000470
hllcMixCnt - realMixCnt: -47
hllc cost time: 128
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 121
realMixCnt: 50027 , mixRate:0.50027
Map collection cost time: 35
hllcMixCnt: 49726 , mixRatehllc:0.49726
mixRatehllc - mixRate: -0.003010
hllcMixCnt - realMixCnt: -301
hllc cost time: 137
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 247
realMixCnt: 1991 , mixRate:0.009955
Map collection cost time: 38
hllcMixCnt: 2118 , mixRatehllc:0.01059
mixRatehllc - mixRate: 0.000635
hllcMixCnt - realMixCnt: 127
hllc cost time: 200
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 225
realMixCnt: 10000 , mixRate:0.05
Map collection cost time: 71
hllcMixCnt: 9751 , mixRatehllc:0.048755
mixRatehllc - mixRate: -0.001245
hllcMixCnt - realMixCnt: -249
hllc cost time: 273
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 224
realMixCnt: 19974 , mixRate:0.09987
Map collection cost time: 71
hllcMixCnt: 19810 , mixRatehllc:0.09905
mixRatehllc - mixRate: -0.000820
hllcMixCnt - realMixCnt: -164
hllc cost time: 300
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 243
realMixCnt: 40093 , mixRate:0.200465
Map collection cost time: 82
hllcMixCnt: 40549 , mixRatehllc:0.202745
mixRatehllc - mixRate: 0.002280
hllcMixCnt - realMixCnt: 456
hllc cost time: 297
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 283
realMixCnt: 99874 , mixRate:0.49937
Map collection cost time: 88
hllcMixCnt: 99730 , mixRatehllc:0.49865
mixRatehllc - mixRate: -0.000720
hllcMixCnt - realMixCnt: -144
hllc cost time: 306
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 678
realMixCnt: 5148 , mixRate:0.010296
Map collection cost time: 181
hllcMixCnt: 3895 , mixRatehllc:0.00779
mixRatehllc - mixRate: -0.002506
hllcMixCnt - realMixCnt: -1253
hllc cost time: 673
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 820
realMixCnt: 25131 , mixRate:0.050262
Map collection cost time: 185
hllcMixCnt: 24850 , mixRatehllc:0.0497
mixRatehllc - mixRate: -0.000562
hllcMixCnt - realMixCnt: -281
hllc cost time: 647
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 691
realMixCnt: 49911 , mixRate:0.099822
Map collection cost time: 187
hllcMixCnt: 50951 , mixRatehllc:0.101902
mixRatehllc - mixRate: 0.002080
hllcMixCnt - realMixCnt: 1040
hllc cost time: 690
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 888
realMixCnt: 250250 , mixRate:0.5005
Map collection cost time: 212
hllcMixCnt: 249358 , mixRatehllc:0.498716
mixRatehllc - mixRate: -0.001784
hllcMixCnt - realMixCnt: -892
hllc cost time: 608
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 840
realMixCnt: 249691 , mixRate:0.499382
Map collection cost time: 230
hllcMixCnt: 249833 , mixRatehllc:0.499666
mixRatehllc - mixRate: 0.000284
hllcMixCnt - realMixCnt: 142
hllc cost time: 714
Process finished with exit code 0
import java.util
import org.scalatest.{FlatSpec}
import org.spark.sqludf.HLLCounter
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
class HllcCrossSetTest extends FlatSpec {
val ramdom = new Random()
val m = 18
// 用于验证hllc 的错误率
def errorRateCal(sampleCount: Int) = {
errorRate(0.01, sampleCount)
errorRate(0.05, sampleCount)
errorRate(0.1, sampleCount)
errorRate(0.2, sampleCount)
errorRate(0.5, sampleCount)
}
"hllc test" should "hllc merge,mix error rate" in {
errorRateCal(1000)
errorRateCal(2000)
errorRateCal(5000)
errorRateCal(10000)
errorRateCal(20000)
errorRateCal(50000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
}
// 不放回抽样 测试集合生成
def getRandomStr(setCollection: mutable.HashSet[String], totalIntArray: Array[Int]): Unit = {
val str = getTestString(totalIntArray)
if (!setCollection.contains(str)) setCollection.add(str)
else
getRandomStr(setCollection, totalIntArray)
}
//5% 100w 级别集合 计算总量
def errorRate(r: Double, testSetLength: Int) = {
println(s" ********************begin r :{$r},testSetLength:{$testSetLength} ,m:{$m} ***********************")
var setA = new mutable.HashSet[String]()
var setB = new mutable.HashSet[String]()
val tatolCount = (testSetLength / r).toInt
var b = System.currentTimeMillis()
val totalIntArray = (tatolCount + "").toCharArray.map(x => x.toString.toInt)
var timeRecord = System.currentTimeMillis()
for (i <- 0 until testSetLength) {
getRandomStr(setA, totalIntArray)
getRandomStr(setB, totalIntArray)
// if (i%5000 == 0) {
// println(s" generate data ${i} ,cost: ${System.currentTimeMillis()-timeRecord}) ")
// timeRecord = System.currentTimeMillis()
// }
}
println(s"tatolCount: ${tatolCount} ,r : ${r} setA size: ${setA.size} , setB size: ${setB.size} ")
var e = System.currentTimeMillis()
println(s" generate data cost time: ${e - b} ")
/*
* realMix 交集
* hllcMix
* realMerge
* hllcMerge
* */
b = System.currentTimeMillis()
val realMixCnt = realMix(setA, setB)
val mixRate = realMixRate(realMixCnt, setA)
println(s" realMixCnt: ${realMixCnt} , mixRate:${mixRate}")
e = System.currentTimeMillis()
println(s" Map collection cost time: ${e - b} ")
b = System.currentTimeMillis()
val hllcMixCnt = hllcMix(setA, setB)
val mixRatehllc = realMixRate(hllcMixCnt, setA)
println(s" hllcMixCnt: ${hllcMixCnt} , mixRatehllc:${mixRatehllc}")
val distinct = mixRatehllc - mixRate
println(f" mixRatehllc - mixRate: $distinct%1.6f ")
println(f" hllcMixCnt - realMixCnt: ${hllcMixCnt - realMixCnt} ")
e = System.currentTimeMillis()
println(s" hllc cost time: ${e - b} ")
}
def realMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
var mixCount = 0
setB.foreach(str => if (hashSet.contains(str)) mixCount += 1)
mixCount
}
def realMixRate(mixCount: Int, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def realMixRate(mixCount: Long, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def hllcMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]): Long = {
val hllc16A = new HLLCounter(m)
setA.foreach(item => hllc16A.add(item))
val hllc16B = new HLLCounter(m)
setA.foreach(item => hllc16B.add(item))
hllc16A.getCountEstimate + hllc16B.getCountEstimate - hllcMerge(setA, setB)
}
def hllcMerge(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hllc16 = new HLLCounter(m)
setA.foreach(item => hllc16.add(item))
setB.foreach(item => hllc16.add(item))
hllc16.getCountEstimate
}
def realMerge(setA: ArrayBuffer[String], setB: ArrayBuffer[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
setB.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
hashSet.size()
}
def getTestString(totalCountArray: Array[Int]) = {
val sbf = new StringBuffer()
//没一位的数字是几, 然后根据这个来生成随机数
totalCountArray.foreach(s => {
if (!0.equals(s))
sbf.append(getRamdomStringS(s))
else sbf.append(getRamdomStringS(10))
})
sbf.toString
}
// n -> 10 ^^n
def getRamdomString(length: Int): String = {
val sbf = new StringBuffer()
for (i <- 0 until length) sbf.append((ramdom.nextInt(10) + 97).toChar)
sbf.toString
}
// n -> 10 ^^n
def getRamdomStringS(l: Int): String = {
(ramdom.nextInt(l) + 97).toChar.toString
}
}
********************begin r :{0.01},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.01 setA size: 1000 , setB size: 1000
generate data cost time: 48
realMixCnt: 12 , mixRate:0.012
Map collection cost time: 6
hllcMixCnt: 8 , mixRatehllc:0.008
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -4
hllc cost time: 131
********************begin r :{0.05},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.05 setA size: 1000 , setB size: 1000
generate data cost time: 4
realMixCnt: 49 , mixRate:0.049
Map collection cost time: 3
hllcMixCnt: 48 , mixRatehllc:0.048
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 19
********************begin r :{0.1},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.1 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 108 , mixRate:0.108
Map collection cost time: 5
hllcMixCnt: 107 , mixRatehllc:0.107
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 15
********************begin r :{0.2},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 5000 ,r : 0.2 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 196 , mixRate:0.196
Map collection cost time: 1
hllcMixCnt: 195 , mixRatehllc:0.195
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 16
********************begin r :{0.5},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 2000 ,r : 0.5 setA size: 1000 , setB size: 1000
generate data cost time: 7
realMixCnt: 489 , mixRate:0.489
Map collection cost time: 1
hllcMixCnt: 490 , mixRatehllc:0.49
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 1
hllc cost time: 8
********************begin r :{0.01},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.01 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 11 , mixRate:0.0055
Map collection cost time: 0
hllcMixCnt: 19 , mixRatehllc:0.0095
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 24
********************begin r :{0.05},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.05 setA size: 2000 , setB size: 2000
generate data cost time: 5
realMixCnt: 102 , mixRate:0.051
Map collection cost time: 1
hllcMixCnt: 110 , mixRatehllc:0.055
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 11
********************begin r :{0.1},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.1 setA size: 2000 , setB size: 2000
generate data cost time: 4
realMixCnt: 192 , mixRate:0.096
Map collection cost time: 0
hllcMixCnt: 192 , mixRatehllc:0.096
mixRatehllc - mixRate: 0.000000
hllcMixCnt - realMixCnt: 0
hllc cost time: 11
********************begin r :{0.2},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.2 setA size: 2000 , setB size: 2000
generate data cost time: 3
realMixCnt: 395 , mixRate:0.1975
Map collection cost time: 1
hllcMixCnt: 387 , mixRatehllc:0.1935
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -8
hllc cost time: 12
********************begin r :{0.5},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 4000 ,r : 0.5 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 986 , mixRate:0.493
Map collection cost time: 1
hllcMixCnt: 981 , mixRatehllc:0.4905
mixRatehllc - mixRate: -0.002500
hllcMixCnt - realMixCnt: -5
hllc cost time: 16
********************begin r :{0.01},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.01 setA size: 5000 , setB size: 5000
generate data cost time: 17
realMixCnt: 48 , mixRate:0.0096
Map collection cost time: 2
hllcMixCnt: 41 , mixRatehllc:0.0082
mixRatehllc - mixRate: -0.001400
hllcMixCnt - realMixCnt: -7
hllc cost time: 13
********************begin r :{0.05},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.05 setA size: 5000 , setB size: 5000
generate data cost time: 7
realMixCnt: 263 , mixRate:0.0526
Map collection cost time: 1
hllcMixCnt: 271 , mixRatehllc:0.0542
mixRatehllc - mixRate: 0.001600
hllcMixCnt - realMixCnt: 8
hllc cost time: 13
********************begin r :{0.1},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.1 setA size: 5000 , setB size: 5000
generate data cost time: 6
realMixCnt: 527 , mixRate:0.1054
Map collection cost time: 0
hllcMixCnt: 526 , mixRatehllc:0.1052
mixRatehllc - mixRate: -0.000200
hllcMixCnt - realMixCnt: -1
hllc cost time: 26
********************begin r :{0.2},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 25000 ,r : 0.2 setA size: 5000 , setB size: 5000
generate data cost time: 16
realMixCnt: 2505 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 2496 , mixRatehllc:0.4992
mixRatehllc - mixRate: -0.001800
hllcMixCnt - realMixCnt: -9
hllc cost time: 16
********************begin r :{0.5},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.5 setA size: 5000 , setB size: 5000
generate data cost time: 8
realMixCnt: 2499 , mixRate:0.4998
Map collection cost time: 1
hllcMixCnt: 2505 , mixRatehllc:0.501
mixRatehllc - mixRate: 0.001200
hllcMixCnt - realMixCnt: 6
hllc cost time: 14
********************begin r :{0.01},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.01 setA size: 10000 , setB size: 10000
generate data cost time: 15
realMixCnt: 103 , mixRate:0.0103
Map collection cost time: 2
hllcMixCnt: 62 , mixRatehllc:0.0062
mixRatehllc - mixRate: -0.004100
hllcMixCnt - realMixCnt: -41
hllc cost time: 24
********************begin r :{0.05},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.05 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 484 , mixRate:0.0484
Map collection cost time: 2
hllcMixCnt: 467 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.001700
hllcMixCnt - realMixCnt: -17
hllc cost time: 18
********************begin r :{0.1},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.1 setA size: 10000 , setB size: 10000
generate data cost time: 11
realMixCnt: 938 , mixRate:0.0938
Map collection cost time: 1
hllcMixCnt: 967 , mixRatehllc:0.0967
mixRatehllc - mixRate: 0.002900
hllcMixCnt - realMixCnt: 29
hllc cost time: 12
********************begin r :{0.2},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.2 setA size: 10000 , setB size: 10000
generate data cost time: 10
realMixCnt: 1997 , mixRate:0.1997
Map collection cost time: 5
hllcMixCnt: 1999 , mixRatehllc:0.1999
mixRatehllc - mixRate: 0.000200
hllcMixCnt - realMixCnt: 2
hllc cost time: 15
********************begin r :{0.5},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.5 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 5010 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 4990 , mixRatehllc:0.499
mixRatehllc - mixRate: -0.002000
hllcMixCnt - realMixCnt: -20
hllc cost time: 23
********************begin r :{0.01},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.01 setA size: 20000 , setB size: 20000
generate data cost time: 41
realMixCnt: 218 , mixRate:0.0109
Map collection cost time: 4
hllcMixCnt: 134 , mixRatehllc:0.0067
mixRatehllc - mixRate: -0.004200
hllcMixCnt - realMixCnt: -84
hllc cost time: 36
********************begin r :{0.05},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.05 setA size: 20000 , setB size: 20000
generate data cost time: 19
realMixCnt: 946 , mixRate:0.0473
Map collection cost time: 3
hllcMixCnt: 949 , mixRatehllc:0.04745
mixRatehllc - mixRate: 0.000150
hllcMixCnt - realMixCnt: 3
hllc cost time: 23
********************begin r :{0.1},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.1 setA size: 20000 , setB size: 20000
generate data cost time: 26
realMixCnt: 2001 , mixRate:0.10005
Map collection cost time: 10
hllcMixCnt: 2080 , mixRatehllc:0.104
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 56
********************begin r :{0.2},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.2 setA size: 20000 , setB size: 20000
generate data cost time: 28
realMixCnt: 4034 , mixRate:0.2017
Map collection cost time: 7
hllcMixCnt: 4113 , mixRatehllc:0.20565
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 25
********************begin r :{0.5},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.5 setA size: 20000 , setB size: 20000
generate data cost time: 24
realMixCnt: 9975 , mixRate:0.49875
Map collection cost time: 8
hllcMixCnt: 9994 , mixRatehllc:0.4997
mixRatehllc - mixRate: 0.000950
hllcMixCnt - realMixCnt: 19
hllc cost time: 23
********************begin r :{0.01},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.01 setA size: 50000 , setB size: 50000
generate data cost time: 88
realMixCnt: 468 , mixRate:0.00936
Map collection cost time: 48
hllcMixCnt: 603 , mixRatehllc:0.01206
mixRatehllc - mixRate: 0.002700
hllcMixCnt - realMixCnt: 135
hllc cost time: 163
********************begin r :{0.05},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.05 setA size: 50000 , setB size: 50000
generate data cost time: 99
realMixCnt: 2381 , mixRate:0.04762
Map collection cost time: 17
hllcMixCnt: 2335 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.000920
hllcMixCnt - realMixCnt: -46
hllc cost time: 59
********************begin r :{0.1},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.1 setA size: 50000 , setB size: 50000
generate data cost time: 52
realMixCnt: 5091 , mixRate:0.10182
Map collection cost time: 19
hllcMixCnt: 5116 , mixRatehllc:0.10232
mixRatehllc - mixRate: 0.000500
hllcMixCnt - realMixCnt: 25
hllc cost time: 88
********************begin r :{0.2},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 250000 ,r : 0.2 setA size: 50000 , setB size: 50000
generate data cost time: 72
realMixCnt: 24889 , mixRate:0.49778
Map collection cost time: 18
hllcMixCnt: 25002 , mixRatehllc:0.50004
mixRatehllc - mixRate: 0.002260
hllcMixCnt - realMixCnt: 113
hllc cost time: 61
********************begin r :{0.5},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.5 setA size: 50000 , setB size: 50000
generate data cost time: 86
realMixCnt: 25140 , mixRate:0.5028
Map collection cost time: 15
hllcMixCnt: 25190 , mixRatehllc:0.5038
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 50
hllc cost time: 71
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 154
realMixCnt: 1051 , mixRate:0.01051
Map collection cost time: 29
hllcMixCnt: 811 , mixRatehllc:0.00811
mixRatehllc - mixRate: -0.002400
hllcMixCnt - realMixCnt: -240
hllc cost time: 232
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 171
realMixCnt: 4903 , mixRate:0.04903
Map collection cost time: 19
hllcMixCnt: 5095 , mixRatehllc:0.05095
mixRatehllc - mixRate: 0.001920
hllcMixCnt - realMixCnt: 192
hllc cost time: 122
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 131
realMixCnt: 9931 , mixRate:0.09931
Map collection cost time: 42
hllcMixCnt: 10136 , mixRatehllc:0.10136
mixRatehllc - mixRate: 0.002050
hllcMixCnt - realMixCnt: 205
hllc cost time: 155
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 117
realMixCnt: 20148 , mixRate:0.20148
Map collection cost time: 35
hllcMixCnt: 20414 , mixRatehllc:0.20414
mixRatehllc - mixRate: 0.002660
hllcMixCnt - realMixCnt: 266
hllc cost time: 111
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 130
realMixCnt: 49964 , mixRate:0.49964
Map collection cost time: 35
hllcMixCnt: 50268 , mixRatehllc:0.50268
mixRatehllc - mixRate: 0.003040
hllcMixCnt - realMixCnt: 304
hllc cost time: 133
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 260
realMixCnt: 2035 , mixRate:0.010175
Map collection cost time: 83
hllcMixCnt: 1247 , mixRatehllc:0.006235
mixRatehllc - mixRate: -0.003940
hllcMixCnt - realMixCnt: -788
hllc cost time: 389
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 311
realMixCnt: 10159 , mixRate:0.050795
Map collection cost time: 94
hllcMixCnt: 10030 , mixRatehllc:0.05015
mixRatehllc - mixRate: -0.000645
hllcMixCnt - realMixCnt: -129
hllc cost time: 308
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 255
realMixCnt: 20009 , mixRate:0.100045
Map collection cost time: 133
hllcMixCnt: 19539 , mixRatehllc:0.097695
mixRatehllc - mixRate: -0.002350
hllcMixCnt - realMixCnt: -470
hllc cost time: 235
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 229
realMixCnt: 39946 , mixRate:0.19973
Map collection cost time: 92
hllcMixCnt: 41310 , mixRatehllc:0.20655
mixRatehllc - mixRate: 0.006820
hllcMixCnt - realMixCnt: 1364
hllc cost time: 271
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 357
realMixCnt: 100095 , mixRate:0.500475
Map collection cost time: 93
hllcMixCnt: 100242 , mixRatehllc:0.50121
mixRatehllc - mixRate: 0.000735
hllcMixCnt - realMixCnt: 147
hllc cost time: 422
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 758
realMixCnt: 5084 , mixRate:0.010168
Map collection cost time: 211
hllcMixCnt: 2978 , mixRatehllc:0.005956
mixRatehllc - mixRate: -0.004212
hllcMixCnt - realMixCnt: -2106
hllc cost time: 844
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 721
realMixCnt: 25296 , mixRate:0.050592
Map collection cost time: 222
hllcMixCnt: 23440 , mixRatehllc:0.04688
mixRatehllc - mixRate: -0.003712
hllcMixCnt - realMixCnt: -1856
hllc cost time: 699
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 688
realMixCnt: 50178 , mixRate:0.100356
Map collection cost time: 200
hllcMixCnt: 45070 , mixRatehllc:0.09014
mixRatehllc - mixRate: -0.010216
hllcMixCnt - realMixCnt: -5108
hllc cost time: 701
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 897
realMixCnt: 249899 , mixRate:0.499798
Map collection cost time: 223
hllcMixCnt: 250263 , mixRatehllc:0.500526
mixRatehllc - mixRate: 0.000728
hllcMixCnt - realMixCnt: 364
hllc cost time: 658
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 868
realMixCnt: 249895 , mixRate:0.49979
Map collection cost time: 245
hllcMixCnt: 249916 , mixRatehllc:0.499832
mixRatehllc - mixRate: 0.000042
hllcMixCnt - realMixCnt: 21
hllc cost time: 724
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 110
realMixCnt: 1026 , mixRate:0.01026
Map collection cost time: 28
hllcMixCnt: 569 , mixRatehllc:0.00569
mixRatehllc - mixRate: -0.004570
hllcMixCnt - realMixCnt: -457
hllc cost time: 95
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 91
realMixCnt: 5024 , mixRate:0.05024
Map collection cost time: 26
hllcMixCnt: 5439 , mixRatehllc:0.05439
mixRatehllc - mixRate: 0.004150
hllcMixCnt - realMixCnt: 415
hllc cost time: 131
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 93
realMixCnt: 9925 , mixRate:0.09925
Map collection cost time: 28
hllcMixCnt: 10201 , mixRatehllc:0.10201
mixRatehllc - mixRate: 0.002760
hllcMixCnt - realMixCnt: 276
hllc cost time: 141
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 90
realMixCnt: 19983 , mixRate:0.19983
Map collection cost time: 32
hllcMixCnt: 19936 , mixRatehllc:0.19936
mixRatehllc - mixRate: -0.000470
hllcMixCnt - realMixCnt: -47
hllc cost time: 128
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 121
realMixCnt: 50027 , mixRate:0.50027
Map collection cost time: 35
hllcMixCnt: 49726 , mixRatehllc:0.49726
mixRatehllc - mixRate: -0.003010
hllcMixCnt - realMixCnt: -301
hllc cost time: 137
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 247
realMixCnt: 1991 , mixRate:0.009955
Map collection cost time: 38
hllcMixCnt: 2118 , mixRatehllc:0.01059
mixRatehllc - mixRate: 0.000635
hllcMixCnt - realMixCnt: 127
hllc cost time: 200
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 225
realMixCnt: 10000 , mixRate:0.05
Map collection cost time: 71
hllcMixCnt: 9751 , mixRatehllc:0.048755
mixRatehllc - mixRate: -0.001245
hllcMixCnt - realMixCnt: -249
hllc cost time: 273
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 224
realMixCnt: 19974 , mixRate:0.09987
Map collection cost time: 71
hllcMixCnt: 19810 , mixRatehllc:0.09905
mixRatehllc - mixRate: -0.000820
hllcMixCnt - realMixCnt: -164
hllc cost time: 300
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 243
realMixCnt: 40093 , mixRate:0.200465
Map collection cost time: 82
hllcMixCnt: 40549 , mixRatehllc:0.202745
mixRatehllc - mixRate: 0.002280
hllcMixCnt - realMixCnt: 456
hllc cost time: 297
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 283
realMixCnt: 99874 , mixRate:0.49937
Map collection cost time: 88
hllcMixCnt: 99730 , mixRatehllc:0.49865
mixRatehllc - mixRate: -0.000720
hllcMixCnt - realMixCnt: -144
hllc cost time: 306
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 678
realMixCnt: 5148 , mixRate:0.010296
Map collection cost time: 181
hllcMixCnt: 3895 , mixRatehllc:0.00779
mixRatehllc - mixRate: -0.002506
hllcMixCnt - realMixCnt: -1253
hllc cost time: 673
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 820
realMixCnt: 25131 , mixRate:0.050262
Map collection cost time: 185
hllcMixCnt: 24850 , mixRatehllc:0.0497
mixRatehllc - mixRate: -0.000562
hllcMixCnt - realMixCnt: -281
hllc cost time: 647
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 691
realMixCnt: 49911 , mixRate:0.099822
Map collection cost time: 187
hllcMixCnt: 50951 , mixRatehllc:0.101902
mixRatehllc - mixRate: 0.002080
hllcMixCnt - realMixCnt: 1040
hllc cost time: 690
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 888
realMixCnt: 250250 , mixRate:0.5005
Map collection cost time: 212
hllcMixCnt: 249358 , mixRatehllc:0.498716
mixRatehllc - mixRate: -0.001784
hllcMixCnt - realMixCnt: -892
hllc cost time: 608
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 840
realMixCnt: 249691 , mixRate:0.499382
Map collection cost time: 230
hllcMixCnt: 249833 , mixRatehllc:0.499666
mixRatehllc - mixRate: 0.000284
hllcMixCnt - realMixCnt: 142
hllc cost time: 714
Process finished with exit code 0
发表评论
-
hllc 不同M 的 小基数的误差率
2017-12-22 14:20 435结论: testHllcError(10, tt) ... -
newExecuteStatementOperation single session
2017-10-16 09:52 536var udfNotInited = true o ... -
yarn spark
2017-09-19 14:08 239--master yarn --deploy-mode cli ... -
test code 09-18-2
2017-09-18 18:47 285object FunnelUtil { var gson ... -
交流 code 09-18
2017-09-18 18:45 277object DataProcess extends Ap ... -
mvn + scala support
2017-09-15 10:00 373<build> <p ... -
THREAD TEST
2017-09-12 18:07 321val THREAD_POOL_SIZE = 10 ... -
json
2017-09-07 10:21 364val gson: Gson = new GsonBuil ... -
curreying function
2017-08-09 15:27 264benchmark2("hllc") ... -
file io
2017-02-04 20:10 279window 下通过 source 读文件各种鬼 改用 Buf ...
相关推荐
基数排序算法 java实现 还有基数排序的原理文档
基数排序算法,这是分不错的算法,实现起来有点难,用静态链表实现,每次改变结点的指向,希望有用,多多指教
基数排序算法在Linux下实现 Ubuntu13.04
该算法用C++语言实现了基数排序算法,已经调试通过,在Linux系统环境中运行结果正常
C++写基数排序算法,数据结构课的上机题代码
这个是在vc6.0下利用递归算法实现的基数排序算法的程序,注释很详细,方便初学者读
用STl实现基数排序算法 功能强大非常适合数据结构课程设计
这个是一个基数排序的算法,算法演示,方便大家学习使用。
基数排序不是基于比较的算法、而是基于统计学角度来观察,此资源包含了基数排序的算法、python代码以及流程图等,欢迎来下载。
随便写的一个基数排序算法,很简单,用STL实现
快速排序算法结构简单,平均性能较佳; 基数排序性能较稳定。结合快速排序和基数排序,本文提出超快速排序算法,通过理论分析和实验表明,新算法的性能优于快速排序算法和基数排序算法。
这边所要介绍的「基数排序法」(radix sort)则是属于「分配式排序」(distribution sort),基数排序法又称「桶子法」(bucket sort)或bin sort,顾名思义,它是透过键值的部份资讯,将要排序的元素分配至某些「桶...
基数排序是一种非比较型整数排序算法,它通过按数字的各个位数进行排序来实现整体序列的有序化。该算法首先确定待排序序列中最大数的位数,然后从最低位(个位)开始,依次对每一位进行排序。在排序过程中,通过创建...
随机产生100个0到999的整数存放于分别用于快速排序和堆排序的2个整型数组和一个用于链式基数排序的静态链表之中。为整数序列的输出定义一个输出函数。依据基数排序的算法分别编写函数程序。
本基数排序算法首先要把数据分为正数和负数部分,正数和负数部分又分为带小数部分,然后都转化成正整数的方法排序,最后取反或者都除以某个数;这里包括了基数算法原理word版本和python代码来实现(每行都有注释!)
基于python的基数排序算法设计与实现
主要介绍了SQL Server中关于基数估计计算预估行数的一些方法探讨,需要的朋友可以参考下
利用关键字序列,打印输出基数排序的每一趟结果。