@@ -4,53 +4,40 @@ import org.apache.spark.sql.functions.when

object SkewJoinApp extends SparkApp {

- // ./bin/spark-shell --master spark://spark-master:7077 --driver-memory 4g --executor-memory 1024mb --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.adaptive.enabled=false
+ // ./bin/spark-shell --master spark://spark-master:7077 --driver-memory 3g --executor-memory 1024mb --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.adaptive.enabled=false
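+ // autoBroadcastJoinThreshold=-1 disables broadcast joins so the join below runs
+ // as a shuffle (sort-merge) join, and adaptive.enabled=false keeps AQE from
+ // mitigating the skew on its own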

spark.sparkContext.setLogLevel("WARN")

import spark.implicits._

spark.sparkContext.setJobGroup("skewed data", "skewed data")

- val skewedData = spark
-   .range(0, 10000000) // 10M
-   .withColumn("key", when($"id" < 10, $"id").otherwise(999))
-   .withColumn("value", $"id")
-
val uniformData = spark
-   .range(0, 1000000) // 1M
+   .range(0, 10000000) // 10M
  .withColumn("key", $"id")
  .withColumn("value", $"id")

- val joined = skewedData.join(uniformData, "key")
+ val skewedData = spark
+   .range(0, 200000000) // 200M
+   .withColumn("key", when($"id" < 10000000, $"id").otherwise(999))
+   .withColumn("value", $"id")

- val res = joined.filter($"key" === 999).count()
- println(s"Count for skew key (999): $res")
+ skewedData.join(uniformData, "key").count()
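+ // with AQE off, the one task that owns key 999 has to process those ~190M rows
+ // alone and should show up as a long-running straggler in the Spark UI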

spark.sparkContext.clearJobGroup()

spark.sparkContext.setJobGroup("adaptive query execution", "adaptive query execution")

spark.conf.set("spark.sql.adaptive.enabled", "true")
+ spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", "1")
+ spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "20MB")
+ spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "15MB")

- import org.apache.spark.sql.functions._
-
- val skewedDataAQE = spark
-   .range(0, 10000000) // 10M
-   .withColumn("key", when($"id" < 10, $"id").otherwise(999))
-   .withColumn("value", $"id")
-
- val uniformDataAQE = spark
-   .range(0, 1000000) // 1M
-   .withColumn("key", $"id")
-   .withColumn("value", $"id")
-
- val joinedAQE = skewedDataAQE.join(uniformDataAQE, "key")
+ val joinedAQE = skewedData.join(uniformData, "key")

joinedAQE.explain(true)

- val resAQE = joinedAQE.filter($"key" === 999).count()
- println(s"Count for skew key (999): $resAQE")
+ joinedAQE.count()
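+ // note: explain(true) above prints the initial plan only; the AQE-optimized
+ // plan (SortMergeJoin marked skew=true) is visible in the Spark UI once this
+ // action has run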

spark.sparkContext.clearJobGroup()