
Commit d6f0b09
Author: Prashanth Govindarajan
Changes to support Arrow 2.0 and Spark 3.0 (#711)
Parent: 1bcd37b

5 files changed: +78 −33 lines

src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs
Lines changed: 2 additions & 2 deletions

```diff
@@ -290,7 +290,7 @@ public void TestDataFrameVectorUdf()
         }
     }
 
-    [SkipIfSparkVersionIsGreaterOrEqualTo(Versions.V3_0_0)]
+    [Fact]
     public void TestGroupedMapUdf()
     {
         DataFrame df = _spark
@@ -368,7 +368,7 @@ private static RecordBatch ArrowBasedCountCharacters(RecordBatch records)
             returnLength);
     }
 
-    [SkipIfSparkVersionIsGreaterOrEqualTo(Versions.V3_0_0)]
+    [Fact]
    public void TestDataFrameGroupedMapUdf()
    {
        DataFrame df = _spark
```
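With Spark 3.0 support in place, both grouped-map tests drop their version gate and run as plain `[Fact]`s. For context, a skip attribute like the removed `SkipIfSparkVersionIsGreaterOrEqualTo` is conventionally a small `FactAttribute` subclass; the sketch below is illustrative only, and the `SparkSettings` stub stands in for however the E2E suite actually exposes the Spark version under test.

```csharp
using System;
using Xunit;

// Stub standing in for the test suite's real Spark-version discovery.
static class SparkSettings
{
    public static Version Version { get; } = new Version(3, 0, 0);
}

// Illustrative version-gated fact: xUnit reports a test as skipped
// whenever FactAttribute.Skip is set to a non-null reason.
public sealed class SkipIfSparkVersionIsGreaterOrEqualToAttribute : FactAttribute
{
    public SkipIfSparkVersionIsGreaterOrEqualToAttribute(string version)
    {
        if (SparkSettings.Version >= new Version(version))
        {
            Skip = $"Not supported on Spark {SparkSettings.Version}";
        }
    }
}
```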

src/csharp/Microsoft.Spark.UnitTest/WorkerFunctionTests.cs
Lines changed: 5 additions & 5 deletions

```diff
@@ -110,7 +110,7 @@ public void TestArrowWorkerFunctionForBool()
     new ArrowUdfWrapper<StringArray, BooleanArray, BooleanArray>(
         (strings, flags) => (BooleanArray)ToArrowArray(
             Enumerable.Range(0, strings.Length)
-                .Select(i => flags.GetBoolean(i) || strings.GetString(i).Contains("true"))
+                .Select(i => flags.GetValue(i).Value || strings.GetString(i).Contains("true"))
                 .ToArray())).Execute);
 
 IArrowArray[] input = new[]
@@ -120,10 +120,10 @@ public void TestArrowWorkerFunctionForBool()
 };
 var results = (BooleanArray)func.Func(input, new[] { 0, 1 });
 Assert.Equal(4, results.Length);
-Assert.True(results.GetBoolean(0));
-Assert.True(results.GetBoolean(1));
-Assert.True(results.GetBoolean(2));
-Assert.False(results.GetBoolean(3));
+Assert.True(results.GetValue(0).Value);
+Assert.True(results.GetValue(1).Value);
+Assert.True(results.GetValue(2).Value);
+Assert.False(results.GetValue(3).Value);
 }
 
 /// <summary>
```
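These test updates track an Arrow API change: the `BooleanArray.GetBoolean(int)` accessor from 0.15.x is gone, and Arrow 2.0 exposes `GetValue(int)` returning a nullable `bool?` so a null slot is distinguishable from `false`. A minimal sketch of the nullable read path, assuming Apache.Arrow 2.0's `BooleanArray.Builder`:

```csharp
using System;
using Apache.Arrow;

class BooleanArrayDemo
{
    static void Main()
    {
        // Build a BooleanArray containing a null slot.
        BooleanArray flags = new BooleanArray.Builder()
            .Append(true)
            .AppendNull()
            .Build();

        bool? first = flags.GetValue(0);  // true
        bool? second = flags.GetValue(1); // null: the slot is unset

        // The tests above call .Value directly because their inputs are
        // known to be non-null; a null-tolerant read coalesces instead.
        Console.WriteLine(first ?? false);  // True
        Console.WriteLine(second ?? false); // False
    }
}
```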

src/csharp/Microsoft.Spark.Worker.UnitTest/CommandExecutorTests.cs
Lines changed: 33 additions & 7 deletions

```diff
@@ -702,7 +702,7 @@ public void TestDataFrameSqlCommandExecutorWithEmptyInput(
     IpcOptions ipcOptions)
 {
     var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
-        (strings) => strings.Apply(cur=> $"udf: {cur}"));
+        (strings) => strings.Apply(cur => $"udf: {cur}"));
 
     var command = new SqlCommand()
     {
@@ -874,15 +874,28 @@ await arrowWriter.WriteRecordBatchAsync(
     RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
 
     Assert.Equal(numRows, outputBatch.Length);
-    Assert.Equal(2, outputBatch.ColumnCount);
+    StringArray stringArray;
+    Int64Array longArray;
+    if (sparkVersion < new Version(Versions.V3_0_0))
+    {
+        Assert.Equal(2, outputBatch.ColumnCount);
+        stringArray = (StringArray)outputBatch.Column(0);
+        longArray = (Int64Array)outputBatch.Column(1);
+    }
+    else
+    {
+        Assert.Equal(1, outputBatch.ColumnCount);
+        var structArray = (StructArray)outputBatch.Column(0);
+        Assert.Equal(2, structArray.Fields.Count);
+        stringArray = (StringArray)structArray.Fields[0];
+        longArray = (Int64Array)structArray.Fields[1];
+    }
 
-    var stringArray = (StringArray)outputBatch.Column(0);
     for (int i = 0; i < numRows; ++i)
     {
         Assert.Equal($"udf: {i}", stringArray.GetString(i));
     }
 
-    var longArray = (Int64Array)outputBatch.Column(1);
     for (int i = 0; i < numRows; ++i)
     {
         Assert.Equal(100 + i, longArray.Values[i]);
@@ -981,15 +994,28 @@ await arrowWriter.WriteRecordBatchAsync(
     RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
 
     Assert.Equal(numRows, outputBatch.Length);
-    Assert.Equal(2, outputBatch.ColumnCount);
+    StringArray stringArray;
+    DoubleArray doubleArray;
+    if (sparkVersion < new Version(Versions.V3_0_0))
+    {
+        Assert.Equal(2, outputBatch.ColumnCount);
+        stringArray = (StringArray)outputBatch.Column(0);
+        doubleArray = (DoubleArray)outputBatch.Column(1);
+    }
+    else
+    {
+        Assert.Equal(1, outputBatch.ColumnCount);
+        var structArray = (StructArray)outputBatch.Column(0);
+        Assert.Equal(2, structArray.Fields.Count);
+        stringArray = (StringArray)structArray.Fields[0];
+        doubleArray = (DoubleArray)structArray.Fields[1];
+    }
 
-    var stringArray = (StringArray)outputBatch.Column(0);
     for (int i = 0; i < numRows; ++i)
     {
         Assert.Equal($"udf: {i}", stringArray.GetString(i));
     }
 
-    var doubleArray = (DoubleArray)outputBatch.Column(1);
     for (int i = 0; i < numRows; ++i)
     {
         Assert.Equal(100 + i, doubleArray.Values[i]);
```
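The extra branch in both tests reflects a Spark 3.0 protocol change: grouped-map results now arrive as a single struct column whose fields are the logical result columns, where Spark 2.x delivered them at the top level of the batch. A hypothetical helper capturing the same unwrap logic (the name `UnwrapColumns` and the boolean flag are illustrative, not part of the commit):

```csharp
using System.Linq;
using Apache.Arrow;

static class GroupedMapOutput
{
    // Pre-3.0 batches carry the result columns at the top level, while
    // Spark 3.0+ wraps them as the fields of one StructArray column.
    public static IArrowArray[] UnwrapColumns(RecordBatch batch, bool isSpark3OrLater)
    {
        if (!isSpark3OrLater)
        {
            return Enumerable.Range(0, batch.ColumnCount)
                .Select(batch.Column)
                .ToArray();
        }

        var structArray = (StructArray)batch.Column(0);
        return structArray.Fields.ToArray();
    }
}
```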

src/csharp/Microsoft.Spark.Worker/Command/SqlCommandExecutor.cs
Lines changed: 36 additions & 14 deletions

```diff
@@ -422,8 +422,7 @@ private CommandExecutorStat ExecuteArrowSqlCommand(
 
         var recordBatch = new RecordBatch(resultSchema, results, numEntries);
 
-        // TODO: Remove sync-over-async once WriteRecordBatch exists.
-        writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
+        writer.WriteRecordBatch(recordBatch);
     }
 
     WriteEnd(outputStream, ipcOptions);
@@ -468,8 +467,7 @@ private CommandExecutorStat ExecuteDataFrameSqlCommand(
                 new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
         }
 
-        // TODO: Remove sync-over-async once WriteRecordBatch exists.
-        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
+        writer.WriteRecordBatch(result);
     }
 }
@@ -737,6 +735,29 @@ protected internal override CommandExecutorStat ExecuteCore(
     return ExecuteArrowGroupedMapCommand(inputStream, outputStream, commands);
 }
 
+private RecordBatch WrapColumnsInStructIfApplicable(RecordBatch batch)
+{
+    if (_version >= new Version(Versions.V3_0_0))
+    {
+        var fields = new Field[batch.Schema.Fields.Count];
+        for (int i = 0; i < batch.Schema.Fields.Count; ++i)
+        {
+            fields[i] = batch.Schema.GetFieldByIndex(i);
+        }
+
+        var structType = new StructType(fields);
+        var structArray = new StructArray(
+            structType,
+            batch.Length,
+            batch.Arrays.Cast<Apache.Arrow.Array>(),
+            ArrowBuffer.Empty);
+        Schema schema = new Schema.Builder().Field(new Field("Struct", structType, false)).Build();
+        return new RecordBatch(schema, new[] { structArray }, batch.Length);
+    }
+
+    return batch;
+}
+
 private CommandExecutorStat ExecuteArrowGroupedMapCommand(
     Stream inputStream,
     Stream outputStream,
@@ -754,19 +775,19 @@ private CommandExecutorStat ExecuteArrowGroupedMapCommand(
     ArrowStreamWriter writer = null;
     foreach (RecordBatch input in GetInputIterator(inputStream))
     {
-        RecordBatch result = worker.Func(input);
+        RecordBatch batch = worker.Func(input);
 
-        int numEntries = result.Length;
+        RecordBatch final = WrapColumnsInStructIfApplicable(batch);
+        int numEntries = final.Length;
         stat.NumEntriesProcessed += numEntries;
 
         if (writer == null)
         {
             writer =
-                new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
+                new ArrowStreamWriter(outputStream, final.Schema, leaveOpen: true, ipcOptions);
         }
 
-        // TODO: Remove sync-over-async once WriteRecordBatch exists.
-        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
+        writer.WriteRecordBatch(final);
     }
 
     WriteEnd(outputStream, ipcOptions);
@@ -794,20 +815,21 @@ private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
     {
         FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
         FxDataFrame resultDataFrame = worker.Func(dataFrame);
+
         IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();
 
-        foreach (RecordBatch result in recordBatches)
+        foreach (RecordBatch batch in recordBatches)
         {
-            stat.NumEntriesProcessed += result.Length;
+            RecordBatch final = WrapColumnsInStructIfApplicable(batch);
+            stat.NumEntriesProcessed += final.Length;
 
             if (writer == null)
             {
                 writer =
-                    new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
+                    new ArrowStreamWriter(outputStream, final.Schema, leaveOpen: true, ipcOptions);
             }
 
-            // TODO: Remove sync-over-async once WriteRecordBatch exists.
-            writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
+            writer.WriteRecordBatch(final);
         }
     }
```
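`WrapColumnsInStructIfApplicable` is the writer-side half of that protocol change: on Spark 3.0+ every outgoing grouped-map batch is rewritten as one non-nullable struct column whose fields are the original columns, with `ArrowBuffer.Empty` passed as the validity bitmap since no struct slot is null. The deleted TODO comments mark the other Arrow 2.0 win: `ArrowStreamWriter` now ships a synchronous `WriteRecordBatch`, so the sync-over-async workaround goes away. Below is a standalone sketch of the same wrap applied to a hand-built two-column batch, assuming only Apache.Arrow 2.0 builder APIs; `WrapDemo` and the column names are illustrative.

```csharp
using System;
using System.Linq;
using Apache.Arrow;
using Apache.Arrow.Types;

class WrapDemo
{
    static void Main()
    {
        // Hand-build a two-column batch standing in for a UDF result.
        StringArray names = new StringArray.Builder().Append("a").Append("b").Build();
        Int64Array counts = new Int64Array.Builder().Append(1).Append(2).Build();
        Schema flat = new Schema.Builder()
            .Field(new Field("name", StringType.Default, false))
            .Field(new Field("count", Int64Type.Default, false))
            .Build();
        var batch = new RecordBatch(flat, new IArrowArray[] { names, counts }, 2);

        // Collect the flat schema's fields, as the executor does.
        var fields = new Field[batch.Schema.Fields.Count];
        for (int i = 0; i < fields.Length; ++i)
        {
            fields[i] = batch.Schema.GetFieldByIndex(i);
        }

        // Re-emit the columns as the children of a single struct column.
        var structType = new StructType(fields);
        var structArray = new StructArray(
            structType,
            batch.Length,
            batch.Arrays.Cast<Apache.Arrow.Array>(),
            ArrowBuffer.Empty); // empty validity bitmap: no null structs

        Schema wrappedSchema = new Schema.Builder()
            .Field(new Field("Struct", structType, false))
            .Build();
        var wrapped = new RecordBatch(wrappedSchema, new IArrowArray[] { structArray }, batch.Length);

        Console.WriteLine(wrapped.ColumnCount); // 1
    }
}
```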

src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
Lines changed: 2 additions & 5 deletions

```diff
@@ -28,7 +28,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <PackageReference Include="Apache.Arrow" Version="0.15.1" />
+    <PackageReference Include="Apache.Arrow" Version="2.0.0" />
     <PackageReference Include="Microsoft.CSharp" Version="4.5.0" />
     <PackageReference Include="Microsoft.Data.Analysis" Version="0.4.0" />
     <PackageReference Include="Newtonsoft.Json" Version="11.0.2" />
@@ -37,10 +37,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <Content Include="..\..\scala\microsoft-spark-*\target\microsoft-spark-*.jar"
-             Link="jars\%(Filename)%(Extension)"
-             Pack="true"
-             PackagePath="jars\%(Filename)%(Extension)" />
+    <Content Include="..\..\scala\microsoft-spark-*\target\microsoft-spark-*.jar" Link="jars\%(Filename)%(Extension)" Pack="true" PackagePath="jars\%(Filename)%(Extension)" />
     <Content Include="build\**" Pack="true" PackagePath="build" />
   </ItemGroup>
```
4643
