
Commit 6e0a666

#62: Implement splitPath (#64)
* `splitPath` function implemented and used
* Updated UTs
* Added `keepEmptyFields: Boolean` to `splitPath`
1 parent 435b3c8 commit 6e0a666

File tree

5 files changed: +73 -9 lines changed

README.md

Lines changed: 8 additions & 0 deletions
@@ -74,6 +74,14 @@ _SchemaUtils_ provides methods for working with schemas, its comparison and alignment
     SchemaUtils.appendPath(path, fieldName)
     ```
 
+5. Separates the field name components of a fully qualified column name as their hierarchy goes from root down to the
+   deepest one.
+
+    ```scala
+    SchemaUtils.splitPath(columnName, keepEmptyFields = true)
+    ```
+
+
 ### JsonUtils
 
 _Json Utils_ provides methods for working with Json, both on input and output.
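
For reference, a minimal sketch of the call documented above, based on the `splitPath` implementation added in this commit (the example values are illustrative):

```scala
import za.co.absa.spark.commons.utils.SchemaUtils

// Split a fully qualified column name into its per-level field names.
val levels = SchemaUtils.splitPath("com.my.package.xyz")
// levels == List("com", "my", "package", "xyz")

// With keepEmptyFields = false, empty segments (e.g. from a leading '.') are dropped.
val cleaned = SchemaUtils.splitPath(".com.my", keepEmptyFields = false)
// cleaned == List("com", "my")
```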

spark-commons/src/main/scala/za/co/absa/spark/commons/implicits/StructTypeImplicits.scala

Lines changed: 6 additions & 5 deletions
@@ -22,6 +22,7 @@ import org.apache.spark.sql.types._
 import za.co.absa.spark.commons.adapters.TransformAdapter
 import za.co.absa.spark.commons.implicits.DataTypeImplicits.DataTypeEnhancements
 import za.co.absa.spark.commons.implicits.StructFieldImplicits.StructFieldEnhancements
+import za.co.absa.spark.commons.utils.SchemaUtils
 import za.co.absa.spark.commons.utils.SchemaUtils.{getAllArraySubPaths, isCommonSubPath}
 
 import scala.annotation.tailrec
@@ -102,7 +103,7 @@ object StructTypeImplicits {
       }
     }
 
-    val pathTokens = path.split('.').toList
+    val pathTokens = SchemaUtils.splitPath(path)
     Try{
       examineStructField(pathTokens.tail, schema(pathTokens.head))
     }.getOrElse(None)
@@ -192,7 +193,7 @@ object StructTypeImplicits {
      * @return true if the column is the only column in a struct
      */
    def isOnlyField(path: String): Boolean = {
-     val pathSegments = path.split('.')
+     val pathSegments = SchemaUtils.splitPath(path)
      evaluateConditionsForField(schema, pathSegments, path, applyArrayHelper = false, applyLeafCondition = true,
        field => field.fields.length == 1)
    }
@@ -330,7 +331,7 @@ object StructTypeImplicits {
       }
     }
 
-    val pathToks = path.split('.')
+    val pathToks = SchemaUtils.splitPath(path)
     helper(pathToks, Seq()).mkString(".")
   }
 
@@ -358,7 +359,7 @@ object StructTypeImplicits {
       }
     }
 
-    val pathToks = path.split("\\.")
+    val pathToks = SchemaUtils.splitPath(path)
     helper(pathToks, Seq(), Seq())
   }
 
@@ -411,7 +412,7 @@ object StructTypeImplicits {
    * @return true if a field is an array that is not nested in another array
    */
   def isNonNestedArray(path: String): Boolean = {
-    val pathSegments = path.split('.')
+    val pathSegments = SchemaUtils.splitPath(path)
    evaluateConditionsForField(schema, pathSegments, path, applyArrayHelper = false)
  }
}
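
Each hunk above swaps a raw `String.split` for the new helper. A small sketch of where the two differ, assuming the `splitPath` behavior defined later in this commit:

```scala
import za.co.absa.spark.commons.utils.SchemaUtils

// String.split on an empty string yields a single empty token...
val raw = "".split('.').toList            // List("")

// ...while splitPath returns an empty list, and always returns List[String]
// (no .toList conversion or "\\." regex escaping needed at the call sites).
val viaHelper = SchemaUtils.splitPath("") // List.empty
```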

spark-commons/src/main/scala/za/co/absa/spark/commons/utils/ExplodeTools.scala

Lines changed: 3 additions & 3 deletions
@@ -252,7 +252,7 @@ object ExplodeTools {
     }
 
     val newFieldName = inputDf.schema.getClosestUniqueName(deconstructedColumnName)
-    val resultDf = inputDf.select(processStruct(inputDf.schema, columnName.split('.'), None)
+    val resultDf = inputDf.select(processStruct(inputDf.schema, SchemaUtils.splitPath(columnName), None)
       :+ col(columnName).as(newFieldName): _*)
     DeconstructedNestedField(resultDf, newFieldName, transientColName)
   }
@@ -287,7 +287,7 @@ object ExplodeTools {
       val newFields2 = if (isColumnToFound) newFields else newFields :+ col(columnFrom).as(columnTo)
       inputDf.select(newFields2: _*)
     } else {
-      putFieldIntoNestedStruct(inputDf, columnFrom, columnTo.split('.'), positionColumn)
+      putFieldIntoNestedStruct(inputDf, columnFrom, SchemaUtils.splitPath(columnTo), positionColumn)
     }
   }
 
@@ -342,7 +342,7 @@ object ExplodeTools {
 
   private def addSuperTransientField(inputDf: DataFrame, arrayColPathName: String): (DataFrame, String) = {
     val colName = inputDf.schema.getClosestUniqueName(superTransientColumnName)
-    val nestedColName = (arrayColPathName.split('.').dropRight(1) :+ colName).mkString(".")
+    val nestedColName = (SchemaUtils.splitPath(arrayColPathName).dropRight(1) :+ colName).mkString(".")
     val df = inputDf.nestedWithColumn(nestedColName, lit(null))
     (df, nestedColName)
   }
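
The last hunk rewrites the sibling-path construction in `addSuperTransientField`; a worked example of that expression, with illustrative names:

```scala
import za.co.absa.spark.commons.utils.SchemaUtils

// Replace the leaf of a nested path with a new column name:
// "a.b.c" -> List("a", "b", "c") -> drop the leaf -> append the new name.
val nested = (SchemaUtils.splitPath("a.b.c").dropRight(1) :+ "tmp").mkString(".")
// nested == "a.b.tmp"
```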

spark-commons/src/main/scala/za/co/absa/spark/commons/utils/SchemaUtils.scala

Lines changed: 39 additions & 1 deletion
@@ -74,7 +74,7 @@ object SchemaUtils {
     }
 
     var isParentCommon = true // For Seq() the property holds by [my] convention
-    var restOfPaths: Seq[Seq[String]] = paths.map(_.split('.').toSeq).filter(_.nonEmpty)
+    var restOfPaths: Seq[Seq[String]] = paths.map(SchemaUtils.splitPath).filter(_.nonEmpty)
     while (isParentCommon && restOfPaths.nonEmpty) {
       val parent = restOfPaths.head.head
       isParentCommon = restOfPaths.forall(path => path.head == parent)
@@ -100,6 +100,44 @@ object SchemaUtils {
     }
   }
 
+
+  /**
+    * Separates the field name components of a fully qualified column name as their hierarchy goes from the root down
+    * to the deepest one. No validation of the field names is done.
+    * Example: `"com.my.package.xyz"` -> `List("com", "my", "package", "xyz")`
+    * A trailing '.' is ignored; a leading one is not.
+    *
+    * @param columnName A fully qualified column name
+    * @return The field name of each level, in the order they go from the root to the deepest one
+    */
+  def splitPath(columnName: String): List[String] = splitPath(columnName, keepEmptyFields = true)
+
+  /**
+    * Separates the field name components of a fully qualified column name as their hierarchy goes from the root down
+    * to the deepest one. No validation of the field names is done.
+    * The function is overloaded rather than using a default parameter, for easier use in functions like `map`.
+    * Example: `"com.my.package.xyz"` -> `List("com", "my", "package", "xyz")`
+    * A trailing '.' is ignored; a leading one is not.
+    *
+    * @param columnName      A fully qualified column name
+    * @param keepEmptyFields If `false`, any empty field names are removed from the result list; otherwise they are kept
+    * @return The field name of each level, in the order they go from the root to the deepest one
+    */
+  def splitPath(columnName: String, keepEmptyFields: Boolean): List[String] = {
+    val stripped = columnName.stripSuffix(".")
+
+    if (stripped.isEmpty) {
+      List.empty
+    } else {
+      val segments = stripped.split('.').toList
+      if (keepEmptyFields) {
+        segments
+      } else {
+        segments.filterNot(_.isEmpty)
+      }
+    }
+  }
+
   private def columnPathAndCore(columnName: String): (String, String) = {
     val index = columnName.lastIndexOf('.')
     if (index >= 0) {
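
The scaladoc above calls out why `splitPath` is overloaded instead of taking a default parameter; a short illustration of that design choice (the `paths` value is illustrative):

```scala
val paths = Seq("a.b", "a.b.c")

// The single-argument overload eta-expands cleanly into a String => List[String],
// exactly as used in isCommonSubPath above:
val tokens = paths.map(SchemaUtils.splitPath)

// With `keepEmptyFields: Boolean = true` as a default parameter instead,
// the same call site would not compile and would need an explicit lambda:
// paths.map(p => SchemaUtils.splitPath(p))
```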

spark-commons/src/test/scala/za/co/absa/spark/commons/utils/SchemaUtilsTest.scala

Lines changed: 17 additions & 0 deletions
@@ -46,4 +46,21 @@ class SchemaUtilsTest extends AnyFunSuite with Matchers {
     assert (!isCommonSubPath("a.b.c.d.e.f", "a.b.c.x", "a.b.c", "a.b", "a"))
   }
 
+  test("Test splitPath") {
+    assertResult(List("a", "b", "c", "d", "e"))(splitPath("a.b.c.d.e"))
+    assertResult(List("a"))(splitPath("a"))
+    assertResult(List("a", "bcd"))(splitPath("a.bcd"))
+    assertResult(List("a", "bcd"))(splitPath("a.bcd."))
+    assertResult(List("", "a", "bcd"))(splitPath(".a.bcd"))
+    assertResult(List.empty[String])(splitPath(""))
+    assertResult(List.empty[String])(splitPath("."))
+  }
+
+  test("Test splitPath with removing empty fields") {
+    assertResult(List("a", "b", "c", "d", "e"))(splitPath("a.b.c.d.e", keepEmptyFields = false))
+    assertResult(List("a", "e"))(splitPath("a....e", keepEmptyFields = false))
+    assertResult(List("a", "bcd"))(splitPath(".a.bcd", keepEmptyFields = false))
+    assertResult(List.empty[String])(splitPath("", keepEmptyFields = false))
+    assertResult(List.empty[String])(splitPath(".", keepEmptyFields = false))
+  }
 }
