diff --git a/omniscidb/CMakeLists.txt b/omniscidb/CMakeLists.txt
index bc2ec9295..2828807d0 100644
--- a/omniscidb/CMakeLists.txt
+++ b/omniscidb/CMakeLists.txt
@@ -617,6 +617,19 @@ else()
   set(PROFILER_LIBS "")
 endif()
 
+find_library(libhash_table
+  NAMES
+  hash_table
+  PATHS
+  SOMEPATH/l0_physops/build/hash_table
+)
+
+if ( NOT libhash_table )
+  message( WARNING "hash_table library NOT FOUND - the respective targets won't be built")
+else()
+  message( STATUS "hash_table library : ${libhash_table}")
+endif( NOT libhash_table )
+
 add_subdirectory(SqliteConnector)
 add_subdirectory(StringDictionary)
diff --git a/omniscidb/QueryEngine/CMakeLists.txt b/omniscidb/QueryEngine/CMakeLists.txt
index 727fc36af..8eedec6fe 100644
--- a/omniscidb/QueryEngine/CMakeLists.txt
+++ b/omniscidb/QueryEngine/CMakeLists.txt
@@ -322,6 +322,7 @@
 list(APPEND QUERY_ENGINE_LIBS ${llvm_libs} ${ZLIB_LIBRARIES})
 
 add_subdirectory(CostModel)
 list(APPEND QUERY_ENGINE_LIBS CostModel)
+list(APPEND QUERY_ENGINE_LIBS ${libhash_table})
 
 target_link_libraries(QueryEngine ${QUERY_ENGINE_LIBS})
diff --git a/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp b/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
index 27f222e3b..24940af77 100644
--- a/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
+++ b/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
@@ -64,4 +64,21 @@ DEVICE const GENERIC_ADDR_SPACE int64_t* init_shared_mem_nop(
     const int32_t groups_buffer_size) {
   return groups_buffer;
 }
+
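+// Thin wrappers over the probing templates this patch adds to genx.cpp; they
+// mirror the baseline-join entry points the CUDA runtime already exports.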
+DEVICE ALWAYS_INLINE int64_t
+baseline_hash_join_idx_32(GENERIC_ADDR_SPACE const int8_t* hash_buff,
+                          GENERIC_ADDR_SPACE const int8_t* key,
+                          const size_t key_bytes,
+                          const size_t entry_count) {
+  return baseline_hash_join_idx_impl<int32_t>(hash_buff, key, key_bytes, entry_count);
+}
+
+NEVER_INLINE DEVICE int64_t
+get_composite_key_index_32(GENERIC_ADDR_SPACE const int32_t* key,
+                           const size_t key_component_count,
+                           GENERIC_ADDR_SPACE const int32_t* composite_key_dict,
+                           const size_t entry_count) {
+  return get_composite_key_index_impl(
+      key, key_component_count, composite_key_dict, entry_count);
+}
 }
diff --git a/omniscidb/QueryEngine/Compiler/genx.cpp b/omniscidb/QueryEngine/Compiler/genx.cpp
index 66e3c1895..40e018c2a 100644
--- a/omniscidb/QueryEngine/Compiler/genx.cpp
+++ b/omniscidb/QueryEngine/Compiler/genx.cpp
@@ -7,8 +7,117 @@
 
 #include
 #include
 
+#include "../GpuRtConstants.h"
+#include "CommonRuntimeDefs.h"
+#include "QueryEngine/MurmurHash1Inl.h"
 #include "Shared/funcannotations.h"
 
+template <typename T>
+inline DEVICE T SUFFIX(get_invalid_key)() {
+  return EMPTY_KEY_64;
+}
+
+template <>
+inline DEVICE int32_t SUFFIX(get_invalid_key)() {
+  return EMPTY_KEY_32;
+}
+
+DEVICE bool compare_to_key(GENERIC_ADDR_SPACE const int8_t* entry,
+                           GENERIC_ADDR_SPACE const int8_t* key,
+                           const size_t key_bytes) {
+  for (size_t i = 0; i < key_bytes; ++i) {
+    if (entry[i] != key[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+inline bool keys_are_equal(GENERIC_ADDR_SPACE const T* key1,
+                           GENERIC_ADDR_SPACE const T* key2,
+                           const size_t key_component_count) {
+  for (size_t i = 0; i < key_component_count; ++i) {
+    if (key1[i] != key2[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+namespace {
+
+const int kNoMatch = -1;
+const int kNotPresent = -2;
+
+}  // namespace
+
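+// Probes a single slot of a baseline hash table: returns the payload stored
+// after the key on a match, kNotPresent if the slot still holds the invalid
+// (empty) key, and kNoMatch if the slot is occupied by a different key.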
+template <typename T>
+DEVICE int64_t get_matching_slot(GENERIC_ADDR_SPACE const int8_t* hash_buff,
+                                 const uint32_t h,
+                                 GENERIC_ADDR_SPACE const int8_t* key,
+                                 const size_t key_bytes) {
+  const auto lookup_result_ptr = hash_buff + h * (key_bytes + sizeof(T));
+  if (compare_to_key(lookup_result_ptr, key, key_bytes)) {
+    return *reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr +
+                                                          key_bytes);
+  }
+  if (*reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr) ==
+      SUFFIX(get_invalid_key)<typename remove_addr_space<T>::type>()) {
+    return kNotPresent;
+  }
+  return kNoMatch;
+}
+
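+// Hashes the key with MurmurHash to pick a start bucket, then probes linearly
+// with wraparound until a match, an empty slot, or a full cycle back to the
+// start bucket.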
+template <typename T>
+DEVICE int64_t baseline_hash_join_idx_impl(GENERIC_ADDR_SPACE const int8_t* hash_buff,
+                                           GENERIC_ADDR_SPACE const int8_t* key,
+                                           const size_t key_bytes,
+                                           const size_t entry_count) {
+  if (!entry_count) {
+    return kNoMatch;
+  }
+  const uint32_t h = MurmurHash1Impl(key, key_bytes, 0) % entry_count;
+  int64_t matching_slot = get_matching_slot<T>(hash_buff, h, key, key_bytes);
+  if (matching_slot != kNoMatch) {
+    return matching_slot;
+  }
+  uint32_t h_probe = (h + 1) % entry_count;
+  while (h_probe != h) {
+    matching_slot = get_matching_slot<T>(hash_buff, h_probe, key, key_bytes);
+    if (matching_slot != kNoMatch) {
+      return matching_slot;
+    }
+    h_probe = (h_probe + 1) % entry_count;
+  }
+  return kNoMatch;
+}
+
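+// Same linear-probing scheme over a dictionary of composite keys: returns the
+// dictionary index of the key, or -1 if it is absent.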
+template <typename T>
+FORCE_INLINE DEVICE int64_t get_composite_key_index_impl(const T* key,
+                                                         const size_t key_component_count,
+                                                         const T* composite_key_dict,
+                                                         const size_t entry_count) {
+  const uint32_t h =
+      MurmurHash1Impl(key, key_component_count * sizeof(T), 0) % entry_count;
+  uint32_t off = h * key_component_count;
+  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
+    return h;
+  }
+  uint32_t h_probe = (h + 1) % entry_count;
+  while (h_probe != h) {
+    off = h_probe * key_component_count;
+    if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
+      return h_probe;
+    }
+    if (composite_key_dict[off] ==
+        SUFFIX(get_invalid_key)<typename remove_addr_space<T>::type>()) {
+      return -1;
+    }
+    h_probe = (h_probe + 1) % entry_count;
+  }
+  return -1;
+}
+
 extern "C" {
 int64_t atomic_cas_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t, int64_t);
 int32_t atomic_cas_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t, int32_t);
diff --git a/omniscidb/QueryEngine/IRCodegen.cpp b/omniscidb/QueryEngine/IRCodegen.cpp
index 4d87f6e60..bc3798e33 100644
--- a/omniscidb/QueryEngine/IRCodegen.cpp
+++ b/omniscidb/QueryEngine/IRCodegen.cpp
@@ -825,10 +825,10 @@ std::shared_ptr<HashJoin> Executor::buildCurrentLevelHashTable(
   check_valid_join_qual(qual_bin_oper);
   JoinHashTableOrError hash_table_or_error;
   if (!current_level_hash_table) {
-    if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
-        getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
-      throw QueryMustRunOnCpu();
-    }
+    // if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
+    //     getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
+    //   throw QueryMustRunOnCpu();
+    // }
     hash_table_or_error = buildHashTableForQualifier(
         qual_bin_oper,
         query_infos,
diff --git a/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h b/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
index 040f3ba01..e2237fbfe 100644
--- a/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
+++ b/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
@@ -33,7 +33,7 @@ class BaselineHashTable : public HashTable {
                     const size_t hash_table_size)
       : cpu_hash_table_buff_size_(hash_table_size)
       , gpu_hash_table_buff_(nullptr)
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
       , device_id_(0)
       , buffer_provider_(nullptr)
 #endif
@@ -51,14 +51,14 @@ class BaselineHashTable : public HashTable {
                     const size_t hash_table_size,
                     const size_t device_id)
       : gpu_hash_table_buff_(nullptr)
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
       , device_id_(device_id)
       , buffer_provider_(buffer_provider)
 #endif
       , layout_(layout)
       , entry_count_(entry_count)
       , emitted_keys_count_(emitted_keys_count) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     CHECK(buffer_provider_);
     gpu_hash_table_buff_ = GpuAllocator::allocGpuAbstractBuffer(
         buffer_provider_, hash_table_size, device_id_);
@@ -68,7 +68,7 @@ class BaselineHashTable : public HashTable {
   }
 
   ~BaselineHashTable() override {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     if (gpu_hash_table_buff_) {
       CHECK(buffer_provider_);
       buffer_provider_->free(gpu_hash_table_buff_);
@@ -108,7 +108,7 @@ class BaselineHashTable : public HashTable {
 
   size_t cpu_hash_table_buff_size_;
   Data_Namespace::AbstractBuffer* gpu_hash_table_buff_;
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   const size_t device_id_;
   BufferProvider* buffer_provider_;
 #endif
diff --git a/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp b/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
index d9a2e6b9a..0dd60aea1 100644
--- a/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
+++ b/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
@@ -165,7 +165,7 @@ std::string BaselineJoinHashTable::toString(const ExecutorDeviceType device_type
   auto hash_table = hash_tables_for_device_[device_id];
   CHECK(hash_table);
   auto buffer_size = hash_table->getHashTableBufferSize(device_type);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
@@ -204,7 +204,7 @@ std::set<DecodedJoinHashBufferEntry> BaselineJoinHashTable::toSet(
   auto hash_table = getHashTableForDevice(device_id);
   CHECK(hash_table);
   auto buffer_size = hash_table->getHashTableBufferSize(device_type);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
@@ -375,7 +375,7 @@ std::pair<size_t, size_t> BaselineJoinHashTable::approximateTupleCount(
     }
     return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
   }
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::vector<std::vector<uint8_t>> host_hll_buffers(device_count_);
   for (auto& host_hll_buffer : host_hll_buffers) {
@@ -409,11 +409,19 @@
         nullptr);
     const auto key_handler_gpu = transfer_flat_object_to_gpu(key_handler, allocator);
+#ifdef HAVE_CUDA
     approximate_distinct_tuples_on_device(
         reinterpret_cast<uint8_t*>(device_hll_buffer),
         count_distinct_desc.bitmap_sz_bits,
         key_handler_gpu,
         columns_for_device.join_columns[0].num_elems);
+#else
+    approximate_distinct_tuples_on_l0(reinterpret_cast<uint8_t*>(device_hll_buffer),
+                                      nullptr,
+                                      count_distinct_desc.bitmap_sz_bits,
+                                      columns_for_device.join_columns[0].num_elems,
+                                      key_handler_gpu);
+#endif
 
     auto& host_hll_buffer = host_hll_buffers[device_id];
     buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&host_hll_buffer[0]),
@@ -675,7 +683,7 @@ int BaselineJoinHashTable::initHashTableForDevice(
     // but the query runs on GPU (join on dictionary encoded columns).
     // Don't transfer the buffer if there was an error since we'll bail anyway.
     if (memory_level_ == Data_Namespace::GPU_LEVEL && !err) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
       BaselineJoinHashTableBuilder builder;
 
       builder.allocateDeviceMemory(hashtable_layout,
@@ -706,7 +714,7 @@
 #endif
     }
   } else {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     BaselineJoinHashTableBuilder builder;
 
     GpuAllocator allocator(executor_->getBufferProvider(), device_id);
diff --git a/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h b/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h
index 7cf6c0668..b522201ea 100644
--- a/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h
+++ b/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h
@@ -21,6 +21,7 @@
 #include "QueryEngine/JoinHashTable/BaselineJoinHashTable.h"
 #include "QueryEngine/JoinHashTable/Runtime/HashJoinKeyHandlers.h"
 #include "QueryEngine/JoinHashTable/Runtime/JoinHashTableGpuUtils.h"
+#include "SOME_PATH/l0_physops/hash_table/BaselineHashTable/BaselineHashTableBuilder.h"
 #include "Shared/thread_count.h"
@@ ... @@ template <class KEY_HANDLER>
     auto buffer_provider = executor->getBufferProvider();
     GpuAllocator allocator(buffer_provider, device_id);
+#ifdef HAVE_CUDA
     auto dev_err_buff = reinterpret_cast<CUdeviceptr>(allocator.alloc(sizeof(int)));
+#else
+    auto dev_err_buff = reinterpret_cast<int8_t*>(allocator.alloc(sizeof(int)));
+#endif
+
     buffer_provider->copyToDevice(reinterpret_cast<int8_t*>(dev_err_buff),
                                   reinterpret_cast<const int8_t*>(&err),
                                   sizeof(err),
@@ -431,24 +437,41 @@
     const auto key_handler_gpu = transfer_flat_object_to_gpu(*key_handler, allocator);
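+    // CUDA builds keep the *_on_device entry points; L0 builds dispatch to the
+    // *_on_l0 counterparts pulled in from the external hash_table library,
+    // which are assumed to mirror the device signatures.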
     switch (key_component_width) {
       case 4:
+#ifdef HAVE_CUDA
         init_baseline_hash_join_buff_on_device_32(gpu_hash_table_buff,
                                                   keyspace_entry_count,
                                                   key_component_count,
                                                   layout == HashType::OneToOne,
                                                   -1);
+#else
+        init_baseline_hash_join_buff_on_l0(gpu_hash_table_buff,
+                                           keyspace_entry_count,
+                                           key_component_count,
+                                           layout == HashType::OneToOne,
+                                           -1);
+#endif
         break;
       case 8:
+#ifdef HAVE_CUDA
         init_baseline_hash_join_buff_on_device_64(gpu_hash_table_buff,
                                                   keyspace_entry_count,
                                                   key_component_count,
                                                   layout == HashType::OneToOne,
                                                   -1);
+#else
+        init_baseline_hash_join_buff_on_l0(gpu_hash_table_buff,
+                                           keyspace_entry_count,
+                                           key_component_count,
+                                           layout == HashType::OneToOne,
+                                           -1);
+#endif
         break;
       default:
         UNREACHABLE();
     }
     switch (key_component_width) {
       case 4: {
+#ifdef HAVE_CUDA
         fill_baseline_hash_join_buff_on_device(
             gpu_hash_table_buff,
             keyspace_entry_count,
@@ -459,6 +482,17 @@
             reinterpret_cast<int*>(dev_err_buff),
             key_handler_gpu,
             join_columns.front().num_elems);
+#else
+        fill_baseline_hash_join_buff_on_l0(gpu_hash_table_buff,
+                                           keyspace_entry_count,
+                                           -1,
+                                           for_semi_join,
+                                           key_component_count,
+                                           layout == HashType::OneToOne,
+                                           reinterpret_cast<int*>(dev_err_buff),
+                                           key_handler_gpu,
+                                           join_columns.front().num_elems);
+#endif
         buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&err),
                                         reinterpret_cast<const int8_t*>(dev_err_buff),
                                         sizeof(err),
         break;
       }
       case 8: {
+#ifdef HAVE_CUDA
         fill_baseline_hash_join_buff_on_device(
             gpu_hash_table_buff,
             keyspace_entry_count,
@@ -476,6 +511,18 @@
             reinterpret_cast<int*>(dev_err_buff),
             key_handler_gpu,
             join_columns.front().num_elems);
+#else
+        fill_baseline_hash_join_buff_on_l0(
+            gpu_hash_table_buff,
+            keyspace_entry_count,
+            -1,
+            for_semi_join,
+            key_component_count,
+            layout == HashType::OneToOne,
+            reinterpret_cast<int*>(dev_err_buff),
+            key_handler_gpu,
+            join_columns.front().num_elems);
+#endif
         buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&err),
                                         reinterpret_cast<const int8_t*>(dev_err_buff),
                                         sizeof(err),
@@ -492,11 +539,17 @@
     const auto entry_size = key_component_count * key_component_width;
     auto one_to_many_buff = reinterpret_cast<int32_t*>(
         gpu_hash_table_buff + keyspace_entry_count * entry_size);
+#ifdef HAVE_CUDA
     init_hash_join_buff_on_device(one_to_many_buff, keyspace_entry_count, -1);
+#else
+    init_hash_join_buff_on_l0(one_to_many_buff, keyspace_entry_count, -1);
+#endif
+
     setHashLayout(layout);
     switch (key_component_width) {
       case 4: {
         const auto composite_key_dict = reinterpret_cast<int32_t*>(gpu_hash_table_buff);
+#ifdef HAVE_CUDA
         fill_one_to_many_baseline_hash_table_on_device(
             one_to_many_buff,
             composite_key_dict,
@@ -505,11 +558,19 @@
             key_component_count,
             key_handler_gpu,
             join_columns.front().num_elems);
-
+#else
+        fill_one_to_many_baseline_hash_table_on_l0(one_to_many_buff,
+                                                   composite_key_dict,
+                                                   keyspace_entry_count,
+                                                   -1,
+                                                   key_handler_gpu,
+                                                   join_columns.front().num_elems);
+#endif
         break;
       }
       case 8: {
         const auto composite_key_dict = reinterpret_cast<int64_t*>(gpu_hash_table_buff);
+#ifdef HAVE_CUDA
         fill_one_to_many_baseline_hash_table_on_device(
             one_to_many_buff,
             composite_key_dict,
@@ -518,7 +579,14 @@
             key_component_count,
             key_handler_gpu,
             join_columns.front().num_elems);
-
+#else
+        fill_one_to_many_baseline_hash_table_on_l0(one_to_many_buff,
+                                                   composite_key_dict,
+                                                   keyspace_entry_count,
+                                                   -1,
+                                                   key_handler_gpu,
+                                                   join_columns.front().num_elems);
+#endif
         break;
       }
       default:
diff --git a/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h b/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h
index 71929c0e8..652c1c988 100644
--- a/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h
+++ b/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "QueryEngine/JoinHashTable/PerfectHashTable.h"
+#include "SOME_PATH/l0_physops/hash_table/PerfectHashTable/PerfectHashTableBuilder.h"
 
 #include "Shared/scope.h"
 
@@ -30,7 +31,7 @@ class PerfectJoinHashTableBuilder {
                             const int device_id,
                             const int device_count,
                             const Executor* executor) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     const size_t total_count = layout == HashType::OneToOne
                                    ? hash_entry_info.getNormalizedHashEntryCount()
@@ -48,7 +49,7 @@ class PerfectJoinHashTableBuilder {
 #endif  // HAVE_CUDA
   }
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   void initHashTableOnGpu(const ChunkKey& chunk_key,
                           const JoinColumn& join_column,
                           const ExpressionRange& col_range,
@@ -69,8 +70,13 @@
       buffer_provider->free(gpu_hash_table_err_buff);
     };
     CHECK(gpu_hash_table_err_buff);
-    auto dev_err_buff =
-        reinterpret_cast<CUdeviceptr>(gpu_hash_table_err_buff->getMemoryPtr());
+#ifdef HAVE_CUDA
+    auto dev_err_buff =
+        reinterpret_cast<CUdeviceptr>(gpu_hash_table_err_buff->getMemoryPtr());
+#else
+    auto dev_err_buff =
+        reinterpret_cast<int8_t*>(gpu_hash_table_err_buff->getMemoryPtr());
+#endif
     int err{0};
     buffer_provider->copyToDevice(reinterpret_cast<int8_t*>(dev_err_buff),
                                   reinterpret_cast<const int8_t*>(&err),
                                   sizeof(err),
@@ -79,10 +85,16 @@
     CHECK(hash_table_);
     auto gpu_hash_table_buff = hash_table_->getGpuBuffer();
-
+#ifdef HAVE_CUDA
     init_hash_join_buff_on_device(reinterpret_cast<int32_t*>(gpu_hash_table_buff),
                                   hash_entry_info.getNormalizedHashEntryCount(),
                                   hash_join_invalid_val);
+#else
+    init_hash_join_buff_on_l0(reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+                              hash_entry_info.getNormalizedHashEntryCount(),
+                              hash_join_invalid_val);
+#endif
+
     if (chunk_key.empty()) {
       return;
     }
@@ -101,6 +113,7 @@
                                  get_join_column_type_kind(type)};
     auto use_bucketization = inner_col->type()->isDate();
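+    // Same CUDA/L0 split for the perfect-hash fill kernels; note the L0
+    // bucketized fill additionally takes placeholder arguments (NULL, 0)
+    // and the device error buffer explicitly.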
     if (layout == HashType::OneToOne) {
+#ifdef HAVE_CUDA
       fill_hash_join_buff_on_device_bucketized(
           reinterpret_cast<int32_t*>(gpu_hash_table_buff),
           hash_join_invalid_val,
@@ -109,22 +122,54 @@
           join_column,
           type_info,
           hash_entry_info.bucket_normalization);
+#else
+      fill_hash_join_buff_bucketized_on_l0(
+          reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+          hash_join_invalid_val,
+          for_semi_anti_join(join_type),
+          join_column,
+          type_info,
+          NULL,
+          0,
+          hash_entry_info.bucket_normalization,
+          reinterpret_cast<int*>(dev_err_buff));
+#endif
     } else {
       if (use_bucketization) {
+#ifdef HAVE_CUDA
         fill_one_to_many_hash_table_on_device_bucketized(
            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
            hash_entry_info,
            hash_join_invalid_val,
            join_column,
            type_info);
+#else
+        fill_one_to_many_hash_table_on_l0_bucketized(
+            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+            hash_entry_info,
+            hash_join_invalid_val,
+            join_column,
+            type_info);
+#endif
       } else {
+#ifdef HAVE_CUDA
         fill_one_to_many_hash_table_on_device(
            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
            hash_entry_info,
            hash_join_invalid_val,
            join_column,
            type_info);
+#else
+        fill_one_to_many_hash_table_on_l0(
+            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+            hash_entry_info,
+            hash_join_invalid_val,
+            join_column,
+            type_info);
+#endif
       }
     }
     buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&err),
                                     reinterpret_cast<const int8_t*>(dev_err_buff),
diff --git a/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp b/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp
index c76b3c524..5501b0538 100644
--- a/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp
+++ b/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp
@@ -363,13 +363,17 @@ int64_t HashJoin::getJoinHashBuffer(const ExecutorDeviceType device_type,
   }
   CHECK(hash_tables_for_device_[device_id]);
   auto hash_table = hash_tables_for_device_[device_id].get();
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   if (device_type == ExecutorDeviceType::CPU) {
     return reinterpret_cast<int64_t>(hash_table->getCpuBuffer());
   } else {
     CHECK(hash_table);
     const auto gpu_buff = hash_table->getGpuBuffer();
+#ifdef HAVE_CUDA
     return reinterpret_cast<CUdeviceptr>(gpu_buff);
+#else
+    return reinterpret_cast<int64_t>(gpu_buff);
+#endif
   }
 #else
   CHECK(device_type == ExecutorDeviceType::CPU);
diff --git a/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp b/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
index 61f650fee..dfbfcf789 100644
--- a/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
+++ b/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
@@ -529,7 +529,7 @@ int PerfectJoinHashTable::initHashTableForDevice(
     return 0;
   }
 
-#ifndef HAVE_CUDA
+#if !defined(HAVE_CUDA) && !defined(HAVE_L0)
   CHECK_EQ(Data_Namespace::CPU_LEVEL, effective_memory_level);
 #endif
   int err{0};
@@ -636,7 +636,7 @@
   // Transfer the hash table on the GPU if we've only built it on CPU
   // but the query runs on GPU (join on dictionary encoded columns).
   if (memory_level_ == Data_Namespace::GPU_LEVEL) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     auto buffer_provider = executor_->getBufferProvider();
     auto type = inner_col->type();
     CHECK(type->isString() || type->isExtDictionary());
@@ -674,7 +674,7 @@
       hash_tables_for_device_[device_id] = hash_table;
     }
   } else {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     PerfectJoinHashTableBuilder builder;
     CHECK_EQ(Data_Namespace::GPU_LEVEL, effective_memory_level);
     builder.allocateDeviceMemory(join_column,
@@ -895,7 +895,7 @@ std::string PerfectJoinHashTable::toString(const ExecutorDeviceType device_type,
   auto buffer = getJoinHashBuffer(device_type, device_id);
   auto buffer_size = getJoinHashBufferSize(device_type, device_id);
   auto hash_table = getHashTableForDevice(device_id);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
@@ -932,7 +932,7 @@ std::set<DecodedJoinHashBufferEntry> PerfectJoinHashTable::toSet(
   auto buffer = getJoinHashBuffer(device_type, device_id);
   auto buffer_size = getJoinHashBufferSize(device_type, device_id);
   auto hash_table = getHashTableForDevice(device_id);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
diff --git a/omniscidb/Tests/IntelGPUEnablingTest.cpp b/omniscidb/Tests/IntelGPUEnablingTest.cpp
index 2ab58b202..0fad397d7 100644
--- a/omniscidb/Tests/IntelGPUEnablingTest.cpp
+++ b/omniscidb/Tests/IntelGPUEnablingTest.cpp
@@ -56,6 +56,16 @@ struct ExecuteTestBase {
     }
   }
 
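+  // Inserts each tuple into both the SQLite reference database and HDK so the
+  // two sides stay in sync for result comparison.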
insertCsvValues("test_inner_loop_join", "9,7,10"); + const std::vector tuples{ + "7,43,2", + "8,2,11", + "9,7,10", + "7,9,10", + }; + insertSqliteAndHDK("test_inner_loop_join", tuples); } static void createSmallTestsTable() { @@ -300,6 +310,13 @@ TEST_F(JoinTest, SimpleJoin) { g_dt); } +TEST_F(JoinTest, HashJoin) { + // Baseline OneToOne + c("SELECT a.x FROM test_inner_loop_join as a, small_tests as b WHERE (a.x = b.x) AND " + "(a.y = b.x) ", + g_dt); +} + class AggregationTest : public ExecuteTestBase, public ::testing::Test {}; TEST_F(AggregationTest, StandaloneCount) {