@@ -36,7 +36,7 @@ Sparkit-learn introduces two important distributed data formats:
36
36
rdd = sc.parallelize(data, 2 ) # each partition with 10 elements
37
37
# ArrayRDD
38
38
# each partition will contain blocks with 5 elements
39
- X = ArrayRDD(rdd, block_size = 5 ) # 4 blocks, 2 in each partition
39
+ X = ArrayRDD(rdd, bsize = 5 ) # 4 blocks, 2 in each partition
40
40
41
41
Basic operations:
42
42
@@ -71,7 +71,7 @@ Sparkit-learn introduces two important distributed data formats:
71
71
# array([ 0, 1, 2, ... 17, 18, 19])
72
72
73
73
# pyspark.rdd operations will still work
74
- X.numPartitions () # 2 - number of partitions
74
+ X.getNumPartitions () # 2 - number of partitions
75
75
76
76
- **DictRDD:**
77
77
@@ -84,19 +84,19 @@ Sparkit-learn introduces two important distributed data formats:
84
84
X = range (20 )
85
85
y = range (2 ) * 10
86
86
# PySpark RDD with 2 partitions
87
- X_rdd = sc.parallelize(data_X , 2 ) # each partition with 10 elements
88
- y_rdd = sc.parallelize(data_y , 2 ) # each partition with 10 elements
87
+ X_rdd = sc.parallelize(X , 2 ) # each partition with 10 elements
88
+ y_rdd = sc.parallelize(y , 2 ) # each partition with 10 elements
89
89
zipped_rdd = X_rdd.zip(y_rdd) # zip the two rdd's together
90
90
# DictRDD
91
91
# each partition will contain blocks with 5 elements
92
- Z = DictRDD(zipped_rdd, columns = ('X', 'y'), block_size = 5 ) # 4 blocks, 2/partition
92
+ Z = DictRDD(zipped_rdd, columns = ('X', 'y'), bsize = 5 ) # 4 blocks, 2/partition
93
93
94
94
# or:
95
95
import numpy as np
96
96
97
97
data = np.array([range (20 ), range (2 )* 10 ]).T
98
98
rdd = sc.parallelize(data, 2 )
99
- Z = DictRDD(rdd, columns = ('X', 'y'), block_size = 5 )
99
+ Z = DictRDD(rdd, columns = ('X', 'y'), bsize = 5 )
100
100
101
101
Basic operations:
102
102
0 commit comments