From f17d8a86ab819959bbf6725f347ce3ce2280b0ef Mon Sep 17 00:00:00 2001 From: Andrew Law Date: Thu, 1 Oct 2020 18:14:32 -0700 Subject: [PATCH 1/4] Support for multiple branched CaseWhen --- src/enclave/Enclave/ExpressionEvaluation.h | 39 +++++++++++++++++++ src/flatbuffers/Expr.fbs | 5 +++ .../edu/berkeley/cs/rise/opaque/Utils.scala | 17 ++++++++ 3 files changed, 61 insertions(+) diff --git a/src/enclave/Enclave/ExpressionEvaluation.h b/src/enclave/Enclave/ExpressionEvaluation.h index 7aa805b5d7..924e0c1b58 100644 --- a/src/enclave/Enclave/ExpressionEvaluation.h +++ b/src/enclave/Enclave/ExpressionEvaluation.h @@ -742,6 +742,45 @@ class FlatbuffersExpressionEvaluator { } } + case tuix::ExprUnion_CaseWhen: + { + auto e = expr->expr_as_CaseWhen(); + size_t num_children = e->children()->size(); + + // Evaluate to the first value whose predicate is true. + // Short circuit on the earliest branch possible. + for (size_t i = 0; i < num_children - 1; i += 2) { + auto predicate_offset = eval_helper(row, (*e->children())[i]); + auto true_value_offset = eval_helper(row, (*e->children())[i+1]); + const tuix::Field *predicate = + flatbuffers::GetTemporaryPointer(builder, predicate_offset); + const tuix::Field *true_value = + flatbuffers::GetTemporaryPointer(builder, true_value_offset); + if (predicate->value_type() != tuix::FieldUnion_BooleanField) { + throw std::runtime_error( + std::string("tuix::CaseWhen requires predicate to return Boolean, not ") + + std::string(tuix::EnumNameFieldUnion(predicate->value_type()))); + } + if (!predicate->is_null()) { + bool pred_val = static_cast<const tuix::BooleanField *>(predicate->value())->value(); + if (pred_val) { + return GetOffset(builder, true_value); + } + } + } + + // Getting here means that none of the predicates are true. + // Return the else value if it exists, or NULL if it doesn't. 
+ if (num_children % 2 == 1) { + auto else_value_offset = eval_helper(row, (*e->children())[num_children-1]); + const tuix::Field *else_value = + flatbuffers::GetTemporaryPointer(builder, else_value_offset); + return GetOffset(builder, else_value); + } + + return NULL; + } + // Null expressions case tuix::ExprUnion_IsNull: { diff --git a/src/flatbuffers/Expr.fbs b/src/flatbuffers/Expr.fbs index 28be6c867a..6b4b420b78 100644 --- a/src/flatbuffers/Expr.fbs +++ b/src/flatbuffers/Expr.fbs @@ -24,6 +24,7 @@ union ExprUnion { Add, Subtract, If, + CaseWhen, Cast, Year, VectorAdd, @@ -136,6 +137,10 @@ table If { false_value:Expr; } +table CaseWhen { + children:[Expr]; +} + // Date expressions table Year { child:Expr; diff --git a/src/main/scala/edu/berkeley/cs/rise/opaque/Utils.scala b/src/main/scala/edu/berkeley/cs/rise/opaque/Utils.scala index 9ab50842eb..1531a3bad7 100644 --- a/src/main/scala/edu/berkeley/cs/rise/opaque/Utils.scala +++ b/src/main/scala/edu/berkeley/cs/rise/opaque/Utils.scala @@ -938,6 +938,23 @@ object Utils extends Logging { tuix.If.createIf( builder, predOffset, trueOffset, falseOffset)) + case (CaseWhen(branches, elseValue), childrenOffsets) => + println("HERE") + println(branches) + println(elseValue) + println(childrenOffsets) + println(branches.getClass) + println(elseValue.getClass) + println(childrenOffsets.getClass) + tuix.Expr.createExpr( + builder, + tuix.ExprUnion.CaseWhen, + tuix.CaseWhen.createCaseWhen( + builder, + tuix.CaseWhen.createChildrenVector( + builder, + childrenOffsets.toArray))) + // Null expressions case (IsNull(child), Seq(childOffset)) => tuix.Expr.createExpr( From 52bdeed660967ec29011a24ee74b8953f47ec9c2 Mon Sep 17 00:00:00 2001 From: Andrew Law Date: Mon, 29 Mar 2021 16:59:17 -0700 Subject: [PATCH 2/4] First draft integrity docs --- docs/src/integrity/integrity.rst | 60 ++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 docs/src/integrity/integrity.rst diff --git 
a/docs/src/integrity/integrity.rst b/docs/src/integrity/integrity.rst new file mode 100644 index 0000000000..b7cbd216ca --- /dev/null +++ b/docs/src/integrity/integrity.rst @@ -0,0 +1,60 @@ +*********************** +Computational Integrity +*********************** + +The integrity module of Opaque ensures that the untrusted job driver hosted on the cloud service schedules tasks in the manner computed by Spark's Catalyst query optimizer. +Opaque runs on Spark, which utilizes data partitioning to speed up computation. +Specifically, Catalyst will compute a physical query plan for a given dataframe query and delegate Spark workers (run on enclaves) to compute Spark SQL operations on data partitions. +Each of these individual units is trusted, but the intermediary steps in which the units communicate is controlled by the job driver, running as untrusted code in the cloud. +The integrity module will detect if the job driver has deviated from the query plan computed by Catalyst. + +Overview +-------- +The main idea behind integrity support is to tag each step of computation with a MAC, attached by the enclave worker when it has completed its computation. +All MACs received by all previous enclave workers are logged. In the end, these MACs are compared and reconstructed into a graph. +This graph is compared to that computed by Catalyst. +If the graphs are isomorphic, then no tampering has occurred. +Else, the result of the query returned by the cloud is rejected. + +Implementation +-------------- +Two main extensions were made to support integrity - one in enclave code, and one in the Scala client application. + +Enclave Code +^^^^^^^^^^^^ +In the enclave code (C++), modifications were made to the ``FlatbuffersWriters.cpp`` file. +Attached to every output of an ``EncryptedBlocks``` object is a MAC over the output. +No further modifications need to be made to the application logic since this functionality hooks into how Opaque workers output their data. 
+ +Scala/Application Code +^^^^^^^^^^^^^^^^^^^^^^ +The main extension supporting Integrity is the ```JobVerificationEngine`` which is a piece of Scala code that broadly carries out three tasks: + +1. Reconstruct the flow of information between enclave workers. + +2. Compute the corresponding DAG of ecalls for a given query. + +3. Compare the two DAGs and output "accept" or "reject." + +These happen in the "verify" function of the JobVerificationEngine class. + +Reconstructing the executed DAG of ecalls involves iterating through the MACs attached by enclave workers, provided in the "LogEntryChain" object in the Job Verification Engine. +This object is filled by Opaque when Spark's ``collect`` method is called when a query is executed. + +Output MACs of parents correspond to input MACs of their child. Using this information, the DAG is created. + +The "expected" DAG is created from Spark's ``dataframe.queryPlan.executedPlan`` object which is a recursive tree node of Spark Operators. +The Job Verification Engine contains the logic to transform this tree of operators into a tree of ecalls. + +Adding Integrity Support for New Operators +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +To support new operators, if they are added, one should make changes to the Enclave code and the Job Verification Engine code. + +In the enclave, make sure that the enclave context's "finish_ecall" method is called before returning in ``Enclave.cpp```. + +In the Job Verification Engine, add the logic to transform the operator into a list of ecalls that the operator uses in ``generateJobNodes``. +This amounts to adding a case in the switch statement of this function. + +Furthermore, add the logic to connect the ecalls together in ``linkEcalls``. +As above, this amounts to adding a case in the switch statement of this function, but requires knowledge of how each ecall communicates the transfer of data partitions to its successor ecall +(broadcast, all to one, one to all, etc.). 
\ No newline at end of file From be57507445ddfc7950b2b49ffa5c8aa7ad12bb64 Mon Sep 17 00:00:00 2001 From: Andrew Law Date: Tue, 30 Mar 2021 14:53:01 -0700 Subject: [PATCH 3/4] Resolve Chester comments --- docs/src/integrity/integrity.rst | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/src/integrity/integrity.rst b/docs/src/integrity/integrity.rst index b7cbd216ca..cc712bb705 100644 --- a/docs/src/integrity/integrity.rst +++ b/docs/src/integrity/integrity.rst @@ -6,13 +6,14 @@ The integrity module of Opaque ensures that the untrusted job driver hosted on t Opaque runs on Spark, which utilizes data partitioning to speed up computation. Specifically, Catalyst will compute a physical query plan for a given dataframe query and delegate Spark workers (run on enclaves) to compute Spark SQL operations on data partitions. Each of these individual units is trusted, but the intermediary steps in which the units communicate is controlled by the job driver, running as untrusted code in the cloud. -The integrity module will detect if the job driver has deviated from the query plan computed by Catalyst. +The integrity module will detect foul play by the job driver, including deviation from the query plan computed by Catalyst, +shuffling data in an unexpected manner across data partitions, spoofing extra data between ecalls, or dropping output between ecalls. Overview -------- -The main idea behind integrity support is to tag each step of computation with a MAC, attached by the enclave worker when it has completed its computation. -All MACs received by all previous enclave workers are logged. In the end, these MACs are compared and reconstructed into a graph. -This graph is compared to that computed by Catalyst. +The main idea behind integrity support is to tag each step of computation with a MAC over individual enclave workers' encrypted output, attached by the enclave worker when it has completed its computation. 
+All MACs received by all previous enclave workers are logged. In the end during post verification, these MACs, which each represent an ecall at a data partition, are compared and reconstructed into a graph. +This graph is compared to the DAG of the query plan computed by Catalyst. If the graphs are isomorphic, then no tampering has occurred. Else, the result of the query returned by the cloud is rejected. @@ -22,8 +23,9 @@ Two main extensions were made to support integrity - one in enclave code, and on Enclave Code ^^^^^^^^^^^^ -In the enclave code (C++), modifications were made to the ``FlatbuffersWriters.cpp`` file. -Attached to every output of an ``EncryptedBlocks``` object is a MAC over the output. +In the enclave code (C++), modifications were made to the ``FlatbuffersWriters.cpp`` file and ``FlatbuffersReaders.cpp`` file. +The "write" change attaches a MAC over the ``EncryptedBlocks`` object to the output. +The "read" change checks whether all blocks that were output from the previous ecall were received by the subsequent ecall. No further modifications need to be made to the application logic since this functionality hooks into how Opaque workers output their data. Scala/Application Code @@ -57,4 +59,10 @@ This amounts to adding a case in the switch statement of this function. Furthermore, add the logic to connect the ecalls together in ``linkEcalls``. As above, this amounts to adding a case in the switch statement of this function, but requires knowledge of how each ecall communicates the transfer of data partitions to its successor ecall -(broadcast, all to one, one to all, etc.). \ No newline at end of file +(broadcast, all to one, one to all, etc.). + +Usage +^^^^^ +To use the Job Verification Engine as a black box, make sure that its state is flushed by calling its ``resetForNextJob`` function. +Then, you can call ``Utils.verifyJob`` on the query dataframe, which will return a boolean indicating whether the job has passed post verification. 
+It returns ``True`` if the job passed, else it returns ``False``. \ No newline at end of file From 2db9389ec417950bb684ae0170b4e3ae1bce1de3 Mon Sep 17 00:00:00 2001 From: Andrew Law Date: Wed, 31 Mar 2021 17:29:36 -0700 Subject: [PATCH 4/4] Add more detail --- docs/src/integrity/integrity.rst | 69 +++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/docs/src/integrity/integrity.rst b/docs/src/integrity/integrity.rst index cc712bb705..a90bdf1af2 100644 --- a/docs/src/integrity/integrity.rst +++ b/docs/src/integrity/integrity.rst @@ -11,12 +11,59 @@ shuffling data in an unexpected manner across data partitions, spoofing extra da Overview -------- -The main idea behind integrity support is to tag each step of computation with a MAC over individual enclave workers' encrypted output, attached by the enclave worker when it has completed its computation. -All MACs received by all previous enclave workers are logged. In the end during post verification, these MACs, which each represent an ecall at a data partition, are compared and reconstructed into a graph. +The main idea behind integrity support is to tag each step of computation with a log over individual enclave workers' encrypted output, attached by the enclave worker when it has completed its computation. +During execution, each enclave worker checks its input, which contains logs of the previous ecall's output, to make sure that no rows were tampered with, dropped, or spoofed by the job driver. +This is done using cryptographic MAC functions, whose output can only be computed by the enclave workers sharing a private key with the client. +The job driver or server is unable to tamper with the data without being detected, since they are unable to forge a well formed MAC without the private key. 
+In the end during post verification, these log objects, called "Crumbs" which each represent an ecall at a data partition, are compared and used to reconstruct a graph representing the flow of information during query execution. +Specifically, the ``input_macs`` field is matched to other ``all_outputs_mac`` fields of other ecalls to create edges between ecalls and their predecessors. This graph is compared to the DAG of the query plan computed by Catalyst. If the graphs are isomorphic, then no tampering has occurred. Else, the result of the query returned by the cloud is rejected. +Logging +------- +Below are the flatbuffers schemas of the relevant logging objects used for integrity, which can be found under ``src/flatbuffers/EncryptedBlock.fbs``. + +:: + + table EncryptedBlocks { + blocks:[EncryptedBlock]; + log:LogEntryChain; + log_mac:[Mac]; + all_outputs_mac:[ubyte]; + } + + table LogEntry { + ecall:int; // ecall executed + num_macs:int; // Number of EncryptedBlock's in this EncryptedBlocks - checked during runtime + mac_lst:[ubyte]; // List of all MACs. 
one from each EncryptedBlocks - checked during runtime + mac_lst_mac:[ubyte]; // MAC(mac_lst) - checked during runtime + input_macs:[ubyte]; // List of input EncryptedBlocks' all_output_mac's + num_input_macs:int; // Number of input_macs + } + + table LogEntryChain { + curr_entries:[LogEntry]; + past_entries:[Crumb]; + } + + // Contains information about an ecall, which will be pieced together during post verification to verify the DAG + // A crumb is created at an ecall for each previous ecall that sent some data to this ecall + table Crumb { + input_macs:[ubyte]; // List of EncryptedBlocks all_output_mac's, from LogEntry + num_input_macs:int; // Number of input_macs + all_outputs_mac:[ubyte]; // MAC over all outputs of ecall from which this EncryptedBlocks came from, of size OE_HMAC_SIZE + ecall:int; // Ecall executed + log_mac:[ubyte]; // MAC over the LogEntryChain from this EncryptedBlocks, of size OE_HMAC_SIZE + } + +The ``EncryptedBlocks`` object is what is produced from an enclave worker and passed to the next ecall. +The ``LogEntry`` object contains information about the current ecall, including its unique integer identifier, MAC outputs over each ``EncryptedBlocks`` it produced, and the ``input_macs`` field, which is a list of the output MACs of its predecessor ecall. +The ``LogEntryChain`` contains a list of log entries for a single data partition. There will be as many ``LogEntryChain`` objects for a given query as there are data partitions. +The ``JobVerificationEngine`` has access to a list of ``LogEntryChain``\s. +The ``Crumb`` object contains the ``LogEntry`` information of previous ecalls, stored in the ``LogEntryChain``. + Implementation -------------- Two main extensions were made to support integrity - one in enclave code, and one in the Scala client application. 
@@ -24,13 +71,13 @@ Two main extensions were made to support integrity - one in enclave code, and on Enclave Code ^^^^^^^^^^^^ In the enclave code (C++), modifications were made to the ``FlatbuffersWriters.cpp`` file and ``FlatbuffersReaders.cpp`` file. -The "write" change attaches a MAC over the ``EncryptedBlocks`` object to the output. -The "read" change checks whether all blocks that were output from the previous ecall were received by the subsequent ecall. +The "write" change attaches a log to the ``EncryptedBlocks`` object, which contains the enclave worker's encrypted output. +The "read" change is a runtime check verifying whether all blocks that were output from the previous ecall were received by the subsequent ecall. No further modifications need to be made to the application logic since this functionality hooks into how Opaque workers output their data. Scala/Application Code ^^^^^^^^^^^^^^^^^^^^^^ -The main extension supporting Integrity is the ```JobVerificationEngine`` which is a piece of Scala code that broadly carries out three tasks: +The main extension supporting Integrity is the ``JobVerificationEngine`` which is a piece of Scala code that broadly carries out three tasks: 1. Reconstruct the flow of information between enclave workers. @@ -38,10 +85,10 @@ The main extension supporting Integrity is the ```JobVerificationEngine`` which 3. Compare the two DAGs and output "accept" or "reject." -These happen in the "verify" function of the JobVerificationEngine class. +These happen in the ``verify`` function of the JobVerificationEngine class. -Reconstructing the executed DAG of ecalls involves iterating through the MACs attached by enclave workers, provided in the "LogEntryChain" object in the Job Verification Engine. -This object is filled by Opaque when Spark's ``collect`` method is called when a query is executed. 
+Reconstructing the executed DAG of ecalls involves iterating through the MACs attached by enclave workers, which are fields in the ``Crumb`` and ``LogEntry`` objects stored in each ``LogEntryChain`` in the Job Verification Engine. +The list of ``LogEntryChain``\s is filled by Opaque when Spark's ``collect`` method is called when a query is executed. Output MACs of parents correspond to input MACs of their child. Using this information, the DAG is created. @@ -52,13 +99,13 @@ Adding Integrity Support for New Operators ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ To support new operators, if they are added, one should make changes to the Enclave code and the Job Verification Engine code. -In the enclave, make sure that the enclave context's "finish_ecall" method is called before returning in ``Enclave.cpp```. +In the enclave, make sure that the enclave context's ``finish_ecall`` method is called before returning in ``Enclave.cpp``. In the Job Verification Engine, add the logic to transform the operator into a list of ecalls that the operator uses in ``generateJobNodes``. -This amounts to adding a case in the switch statement of this function. +This amounts to adding a case in the cascading if/else statement of this function. Furthermore, add the logic to connect the ecalls together in ``linkEcalls``. -As above, this amounts to adding a case in the switch statement of this function, but requires knowledge of how each ecall communicates the transfer of data partitions to its successor ecall +As above, this amounts to adding a case in the cascading if/else statement of this function, but requires knowledge of how each ecall communicates the transfer of data partitions to its successor ecall (broadcast, all to one, one to all, etc.). Usage