#258 Remove dependency on scodec.

yruslan · yruslan · commit 3346e8dc1b20 · 2025-01-08T10:29:05.000+01:00
diff --git a/README.md b/README.md
@@ -233,23 +233,22 @@ of the dependencies.
 
 #### Getting all Cobrix dependencies
 
-Cobrix's `spark-cobol` data source depends on the COBOL parser that is a part of Cobrix itself and on `scodec` libraries
-to decode various binary formats.
+Cobrix's `spark-cobol` data source depends on the COBOL parser that is a part of Cobrix itself.
 
 The jars that you need to get are:
 
-* spark-cobol_2.12-2.7.10.jar
-* cobol-parser_2.12-2.7.10.jar
-* scodec-core_2.12-1.10.3.jar
-* scodec-bits_2.12-1.1.4.jar
+* spark-cobol_2.12-2.8.0.jar
+* cobol-parser_2.12-2.8.0.jar
+
+> Versions older than 2.8.0 also need `scodec-core_2.12-1.10.3.jar` and `scodec-bits_2.12-1.1.4.jar`.
 
 > Versions older than 2.7.1 also need `antlr4-runtime-4.8.jar`.
 
 After that you can specify these jars in `spark-shell` command line. Here is an example:
 ```
 $ spark-shell --packages za.co.absa.cobrix:spark-cobol_2.12:2.7.10
 or 
-$ spark-shell --master yarn --deploy-mode client --driver-cores 4 --driver-memory 4G --jars spark-cobol_2.12-2.7.10.jar,cobol-parser_2.12-2.7.10.jar,scodec-core_2.12-1.10.3.jar,scodec-bits_2.12-1.1.4.jar
+$ spark-shell --master yarn --deploy-mode client --driver-cores 4 --driver-memory 4G --jars spark-cobol_2.12-2.8.0.jar,cobol-parser_2.12-2.8.0.jar
 
 Setting default log level to "WARN".
 To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
diff --git a/build.sbt b/build.sbt
@@ -151,10 +151,6 @@ lazy val assemblySettings = Seq(
   assembly / assemblyShadeRules:= Seq(
     // Spark may rely on a different version of ANTLR runtime. Renaming the package helps avoid the binary incompatibility
     ShadeRule.rename("org.antlr.**" -> "za.co.absa.cobrix.cobol.parser.shaded.org.antlr.@1").inAll,
-    // Shading all 3rd party libraries used by 'spark-cobol' in order to avoid binary conflicts.
-    ShadeRule.rename("macrocompat.**" -> "za.co.absa.cobrix.spark.cobol.shaded.macrocompat.@1").inAll,
-    ShadeRule.rename("scodec.**" -> "za.co.absa.cobrix.spark.cobol.shaded.scodec.@1").inAll,
-    ShadeRule.rename("shapeless.**" -> "za.co.absa.cobrix.spark.cobol.shaded.shapeless.@1").inAll,
     // The SLF4j API and implementation are provided by Spark
     ShadeRule.zap("org.slf4j.**").inAll
   ),
diff --git a/cobol-parser/pom.xml b/cobol-parser/pom.xml
@@ -30,11 +30,6 @@
     <packaging>jar</packaging>
 
     <dependencies>
-        <!-- binary codecs -->
-        <dependency>
-            <groupId>org.scodec</groupId>
-            <artifactId>scodec-core_${scala.compat.version}</artifactId>
-        </dependency>
         <!-- parser -->
         <dependency>
             <groupId>org.antlr</groupId>
@@ -46,6 +41,12 @@
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-api</artifactId>
         </dependency>
+        <!-- binary codecs (used only by tests) -->
+        <dependency>
+            <groupId>org.scodec</groupId>
+            <artifactId>scodec-core_${scala.compat.version}</artifactId>
+            <scope>test</scope>
+        </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-simple</artifactId>
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryUtils.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/BinaryUtils.scala
@@ -16,23 +16,13 @@
 
 package za.co.absa.cobrix.cobol.parser.decoders
 
-import scodec.Codec
-import scodec.bits.BitVector
 import za.co.absa.cobrix.cobol.parser.ast.datatype._
 import za.co.absa.cobrix.cobol.parser.common.Constants
 import za.co.absa.cobrix.cobol.parser.encoding.{EBCDIC, Encoding}
 
-import scala.util.control.NonFatal
-
 /** Utilites for decoding Cobol binary data files **/
 //noinspection RedundantBlock
 object BinaryUtils {
-
-  lazy val floatB: Codec[Float] = scodec.codecs.float
-  lazy val floatL: Codec[Float] = scodec.codecs.floatL
-  lazy val doubleB: Codec[Double] = scodec.codecs.double
-  lazy val doubleL: Codec[Double] = scodec.codecs.doubleL
-
   /**
     * This is the EBCDIC to ASCII conversion table. This is an "invariant" subset of EBCDIC code pages.
     * For full EBCDIC code pages support please use [[za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage]]
@@ -105,25 +95,6 @@ object BinaryUtils {
   /** Convert an ASCII character to EBCDIC */
   def asciiToEbcdic(char: Char): Byte = ascii2ebcdic(char.toByte)
 
-  /** Get the bit count of a cobol data type
-    *
-    * @param codec     EBCDIC / ASCII
-    * @param comp      A type of compact stirage
-    * @param precision The precision (the number of digits) of the type
-    * @return
-    */
-  def getBitCount(codec: Codec[_ <: AnyVal], comp: Option[Int], precision: Int): Int = {
-    comp match {
-      case Some(value) =>
-        value match {
-          case compact if compact == 3 =>
-            (precision + 1) * codec.sizeBound.lowerBound.toInt //bcd
-          case _ => codec.sizeBound.lowerBound.toInt // bin/float/floatL
-        }
-      case None => precision * codec.sizeBound.lowerBound.toInt
-    }
-  }
-
   def getBytesCount(compression: Option[Usage], precision: Int, isSigned: Boolean, isExplicitDecimalPt: Boolean, isSignSeparate: Boolean): Int = {
     import Constants._
     val isRealSigned = if (isSignSeparate) false else isSigned
@@ -273,32 +244,4 @@ object BinaryUtils {
     }
     addDecimalPoint(value.toString, scale, scaleFactor)
   }
-
-  /**
-    * A decoder for IEEE-754 big endian floats
-    *
-    * @param bytes A byte array that represents the binary data
-    * @return A boxed float
-    */
-  def decodeFloat(bytes: Array[Byte]): java.lang.Float = {
-    try {
-      floatB.decode(BitVector(bytes)).require.value
-    } catch {
-      case NonFatal(_) => null
-    }
-  }
-
-  /**
-    * A decoder for IEEE-754 big endian doubles
-    *
-    * @param bytes A byte array that represents the binary data
-    * @return A boxed double
-    */
-  def decodeDouble(bytes: Array[Byte]): java.lang.Double = {
-    try {
-      doubleB.decode(BitVector(bytes)).require.value
-    } catch {
-      case NonFatal(_) => null
-    }
-  }
 }
diff --git a/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/FloatingPointDecoders.scala b/cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/FloatingPointDecoders.scala
@@ -16,23 +16,72 @@
 
 package za.co.absa.cobrix.cobol.parser.decoders
 
-import scodec.Codec
-import scodec.bits.BitVector
-
+import java.nio.{ByteBuffer, ByteOrder}
 import scala.util.control.NonFatal
 
 object FloatingPointDecoders {
-  private val floatB: Codec[Float] = scodec.codecs.float
-  private val floatL: Codec[Float] = scodec.codecs.floatL
-  private val doubleB: Codec[Double] = scodec.codecs.double
-  private val doubleL: Codec[Double] = scodec.codecs.doubleL
-
   private val BIT_COUNT_MAGIC = 0x000055AFL
 
+  /**
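+    * A decoder for IEEE-754 32 bit big endian floats. Fails the `require` check unless the input is exactly 4 bytes.
+    *
+    * @param bytes A byte array of exactly 4 bytes that represents the binary data
+    * @return The decoded float value (not boxed)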
+    */
+  def decodeFloatB(bytes: Array[Byte]): Float = {
+    require(bytes.length == 4, "Input must be exactly 4 bytes for a 32-bit float")
+
+    val byteBuffer = ByteBuffer.wrap(bytes)
+    byteBuffer.order(ByteOrder.BIG_ENDIAN)
+    byteBuffer.getFloat
+  }
+
+  /**
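+    * A decoder for IEEE-754 32 bit little endian floats. Fails the `require` check unless the input is exactly 4 bytes.
+    *
+    * @param bytes A byte array of exactly 4 bytes that represents the binary data
+    * @return The decoded float value (not boxed)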
+    */
+  def decodeFloatL(bytes: Array[Byte]): Float = {
+    require(bytes.length == 4, "Input must be exactly 4 bytes for a 32-bit float")
+
+    val byteBuffer = ByteBuffer.wrap(bytes)
+    byteBuffer.order(ByteOrder.LITTLE_ENDIAN)
+    byteBuffer.getFloat
+  }
+
+  /**
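+    * A decoder for IEEE-754 64 bit big endian doubles. Fails the `require` check unless the input is exactly 8 bytes.
+    *
+    * @param bytes A byte array of exactly 8 bytes that represents the binary data
+    * @return The decoded double value (not boxed)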
+    */
+  def decodeDoubleB(bytes: Array[Byte]): Double = {
+    require(bytes.length == 8, "Input must be exactly 8 bytes for a 64-bit float")
+
+    val byteBuffer = ByteBuffer.wrap(bytes)
+    byteBuffer.order(ByteOrder.BIG_ENDIAN)
+    byteBuffer.getDouble
+  }
+
+  /**
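+    * A decoder for IEEE-754 64 bit little endian doubles. Fails the `require` check unless the input is exactly 8 bytes.
+    *
+    * @param bytes A byte array of exactly 8 bytes that represents the binary data
+    * @return The decoded double value (not boxed)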
+    */
+  def decodeDoubleL(bytes: Array[Byte]): Double = {
+    require(bytes.length == 8, "Input must be exactly 8 bytes for a 64-bit float")
+
+    val byteBuffer = ByteBuffer.wrap(bytes)
+    byteBuffer.order(ByteOrder.LITTLE_ENDIAN)
+    byteBuffer.getDouble
+  }
+
   /** Decode IEEE754 single precision big endian encoded number. */
   def decodeIeee754SingleBigEndian(bytes: Array[Byte]): java.lang.Float = {
     try {
-      floatB.decode(BitVector(bytes)).require.value
+      decodeFloatB(bytes)
     } catch {
       case NonFatal(_) => null
     }
@@ -41,7 +90,7 @@ object FloatingPointDecoders {
   /** Decode IEEE754 double precision big endian encoded number. */
   def decodeIeee754DoubleBigEndian(bytes: Array[Byte]): java.lang.Double = {
     try {
-      doubleB.decode(BitVector(bytes)).require.value
+      decodeDoubleB(bytes)
     } catch {
       case NonFatal(_) => null
     }
@@ -50,7 +99,7 @@ object FloatingPointDecoders {
   /** Decode IEEE754 single precision little endian encoded number. */
   def decodeIeee754SingleLittleEndian(bytes: Array[Byte]): java.lang.Float = {
     try {
-      floatL.decode(BitVector(bytes)).require.value
+      decodeFloatL(bytes)
     } catch {
       case NonFatal(_) => null
     }
@@ -59,7 +108,7 @@ object FloatingPointDecoders {
   /** Decode IEEE754 double precision little endian encoded number. */
   def decodeIeee754DoubleLittleEndian(bytes: Array[Byte]): java.lang.Double = {
     try {
-      doubleL.decode(BitVector(bytes)).require.value
+      decodeDoubleL(bytes)
     } catch {
       case NonFatal(_) => null
     }
diff --git a/pom.xml b/pom.xml
@@ -117,8 +117,7 @@
         <jackson.version>2.13.1</jackson.version>
         <mockito.version>4.11.0</mockito.version>
         <scala_logging.version>3.7.2</scala_logging.version>
-        <scodec_bits.version>1.1.4</scodec_bits.version>
-        <scodec_core.version>1.10.3</scodec_core.version>
+        <scodec_core.version>1.11.10</scodec_core.version>
         <slf4j.version>1.7.25</slf4j.version>
     </properties>
 
@@ -227,15 +226,11 @@
                 <artifactId>jul-to-slf4j</artifactId>
                 <version>${slf4j.version}</version>
             </dependency>
-            <dependency>
-                <groupId>org.scodec</groupId>
-                <artifactId>scodec-bits_${scala.compat.version}</artifactId>
-                <version>${scodec_bits.version}</version>
-            </dependency>
             <dependency>
                 <groupId>org.scodec</groupId>
                 <artifactId>scodec-core_${scala.compat.version}</artifactId>
                 <version>${scodec_core.version}</version>
+                <scope>test</scope>
             </dependency>
 
             <!-- Test scope dependencies -->
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
@@ -72,14 +72,14 @@ object Dependencies {
 
   val CobolParserDependencies: Seq[ModuleID] = Seq(
     // compile
-    "org.scodec"   %% "scodec-core"    % scodecCoreVersion excludeAll(ExclusionRule(organization = "org.scala-lang")),
     "org.antlr"     % "antlr4-runtime" % antlrValue,
     "org.slf4j"     % "slf4j-api"      % slf4jVersion,
 
     // test
-    "org.scalatest" %% "scalatest"      % scalatestVersion % Test,
-    "org.mockito"    % "mockito-core"   % mockitoVersion   % Test,
-    "org.slf4j"      % "slf4j-simple"   % slf4jVersion     % Test
+    "org.scalatest" %% "scalatest"      % scalatestVersion  % Test,
+    "org.mockito"    % "mockito-core"   % mockitoVersion    % Test,
+    "org.scodec"    %% "scodec-core"    % scodecCoreVersion % Test,
+    "org.slf4j"      % "slf4j-simple"   % slf4jVersion      % Test
   )
 
   val CobolParserShadedDependencies: Set[ModuleID] = Set(