diff --git a/omniscidb/CMakeLists.txt b/omniscidb/CMakeLists.txt
index bc2ec9295..2828807d0 100644
--- a/omniscidb/CMakeLists.txt
+++ b/omniscidb/CMakeLists.txt
@@ -617,6 +617,19 @@ else()
   set(PROFILER_LIBS "")
 endif()
 
+find_library(libhash_table
+  NAMES
+  hash_table
+  PATHS
+  SOMEPATH/l0_physops/build/hash_table
+)
+
+if ( NOT libhash_table )
+  message( WARNING "hash_table library NOT FOUND - the respective targets won't be built")
+else()
+  message( STATUS "hash_table library : ${libhash_table}")
+endif( NOT libhash_table )
+
 add_subdirectory(SqliteConnector)
 add_subdirectory(StringDictionary)
diff --git a/omniscidb/QueryEngine/CMakeLists.txt b/omniscidb/QueryEngine/CMakeLists.txt
index 727fc36af..8eedec6fe 100644
--- a/omniscidb/QueryEngine/CMakeLists.txt
+++ b/omniscidb/QueryEngine/CMakeLists.txt
@@ -322,6 +322,7 @@
 list(APPEND QUERY_ENGINE_LIBS ${llvm_libs} ${ZLIB_LIBRARIES})
 
 add_subdirectory(CostModel)
 list(APPEND QUERY_ENGINE_LIBS CostModel)
+list(APPEND QUERY_ENGINE_LIBS ${libhash_table})
 
 target_link_libraries(QueryEngine ${QUERY_ENGINE_LIBS})
diff --git a/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp b/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
index 27f222e3b..24940af77 100644
--- a/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
+++ b/omniscidb/QueryEngine/Compiler/CommonGpuRuntime.cpp
@@ -64,4 +64,21 @@ DEVICE const GENERIC_ADDR_SPACE int64_t* init_shared_mem_nop(
     const int32_t groups_buffer_size) {
   return groups_buffer;
 }
+
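+// Thin wrappers over the probing templates this patch adds to genx.cpp; they
+// mirror the baseline-join entry points the CUDA runtime already exports.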
+DEVICE ALWAYS_INLINE int64_t
+baseline_hash_join_idx_32(GENERIC_ADDR_SPACE const int8_t* hash_buff,
+                          GENERIC_ADDR_SPACE const int8_t* key,
+                          const size_t key_bytes,
+                          const size_t entry_count) {
+  return baseline_hash_join_idx_impl<int32_t>(hash_buff, key, key_bytes, entry_count);
+}
+
+NEVER_INLINE DEVICE int64_t
+get_composite_key_index_32(GENERIC_ADDR_SPACE const int32_t* key,
+                           const size_t key_component_count,
+                           GENERIC_ADDR_SPACE const int32_t* composite_key_dict,
+                           const size_t entry_count) {
+  return get_composite_key_index_impl(
+      key, key_component_count, composite_key_dict, entry_count);
+}
 }
diff --git a/omniscidb/QueryEngine/Compiler/genx.cpp b/omniscidb/QueryEngine/Compiler/genx.cpp
index 66e3c1895..40e018c2a 100644
--- a/omniscidb/QueryEngine/Compiler/genx.cpp
+++ b/omniscidb/QueryEngine/Compiler/genx.cpp
@@ -7,8 +7,117 @@
 
 #include
 #include
 
+#include "../GpuRtConstants.h"
+#include "CommonRuntimeDefs.h"
+#include "QueryEngine/MurmurHash1Inl.h"
 #include "Shared/funcannotations.h"
 
+template <typename T>
+inline DEVICE T SUFFIX(get_invalid_key)() {
+  return EMPTY_KEY_64;
+}
+
+template <>
+inline DEVICE int32_t SUFFIX(get_invalid_key)() {
+  return EMPTY_KEY_32;
+}
+
+DEVICE bool compare_to_key(GENERIC_ADDR_SPACE const int8_t* entry,
+                           GENERIC_ADDR_SPACE const int8_t* key,
+                           const size_t key_bytes) {
+  for (size_t i = 0; i < key_bytes; ++i) {
+    if (entry[i] != key[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+inline bool keys_are_equal(GENERIC_ADDR_SPACE const T* key1,
+                           GENERIC_ADDR_SPACE const T* key2,
+                           const size_t key_component_count) {
+  for (size_t i = 0; i < key_component_count; ++i) {
+    if (key1[i] != key2[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+namespace {
+
+const int kNoMatch = -1;
+const int kNotPresent = -2;
+
+}  // namespace
+
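+// Probes a single slot of a baseline hash table: returns the payload stored
+// after the key on a match, kNotPresent if the slot still holds the invalid
+// (empty) key, and kNoMatch if the slot is occupied by a different key.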
+template <typename T>
+DEVICE int64_t get_matching_slot(GENERIC_ADDR_SPACE const int8_t* hash_buff,
+                                 const uint32_t h,
+                                 GENERIC_ADDR_SPACE const int8_t* key,
+                                 const size_t key_bytes) {
+  const auto lookup_result_ptr = hash_buff + h * (key_bytes + sizeof(T));
+  if (compare_to_key(lookup_result_ptr, key, key_bytes)) {
+    return *reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr +
+                                                          key_bytes);
+  }
+  if (*reinterpret_cast<GENERIC_ADDR_SPACE const T*>(lookup_result_ptr) ==
+      SUFFIX(get_invalid_key)<typename remove_addr_space<T>::type>()) {
+    return kNotPresent;
+  }
+  return kNoMatch;
+}
+
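+// Hashes the key with MurmurHash to pick a start bucket, then probes linearly
+// with wraparound until a match, an empty slot, or a full cycle back to the
+// start bucket.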
+template <typename T>
+DEVICE int64_t baseline_hash_join_idx_impl(GENERIC_ADDR_SPACE const int8_t* hash_buff,
+                                           GENERIC_ADDR_SPACE const int8_t* key,
+                                           const size_t key_bytes,
+                                           const size_t entry_count) {
+  if (!entry_count) {
+    return kNoMatch;
+  }
+  const uint32_t h = MurmurHash1Impl(key, key_bytes, 0) % entry_count;
+  int64_t matching_slot = get_matching_slot<T>(hash_buff, h, key, key_bytes);
+  if (matching_slot != kNoMatch) {
+    return matching_slot;
+  }
+  uint32_t h_probe = (h + 1) % entry_count;
+  while (h_probe != h) {
+    matching_slot = get_matching_slot<T>(hash_buff, h_probe, key, key_bytes);
+    if (matching_slot != kNoMatch) {
+      return matching_slot;
+    }
+    h_probe = (h_probe + 1) % entry_count;
+  }
+  return kNoMatch;
+}
+
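+// Same linear-probing scheme over a dictionary of composite keys: returns the
+// dictionary index of the key, or -1 if it is absent.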
+template <typename T>
+FORCE_INLINE DEVICE int64_t get_composite_key_index_impl(const T* key,
+                                                         const size_t key_component_count,
+                                                         const T* composite_key_dict,
+                                                         const size_t entry_count) {
+  const uint32_t h =
+      MurmurHash1Impl(key, key_component_count * sizeof(T), 0) % entry_count;
+  uint32_t off = h * key_component_count;
+  if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
+    return h;
+  }
+  uint32_t h_probe = (h + 1) % entry_count;
+  while (h_probe != h) {
+    off = h_probe * key_component_count;
+    if (keys_are_equal(&composite_key_dict[off], key, key_component_count)) {
+      return h_probe;
+    }
+    if (composite_key_dict[off] ==
+        SUFFIX(get_invalid_key)<typename remove_addr_space<T>::type>()) {
+      return -1;
+    }
+    h_probe = (h_probe + 1) % entry_count;
+  }
+  return -1;
+}
+
 extern "C" {
 int64_t atomic_cas_int_64(GENERIC_ADDR_SPACE int64_t*, int64_t, int64_t);
 int32_t atomic_cas_int_32(GENERIC_ADDR_SPACE int32_t*, int32_t, int32_t);
diff --git a/omniscidb/QueryEngine/IRCodegen.cpp b/omniscidb/QueryEngine/IRCodegen.cpp
index 4d87f6e60..bc3798e33 100644
--- a/omniscidb/QueryEngine/IRCodegen.cpp
+++ b/omniscidb/QueryEngine/IRCodegen.cpp
@@ -825,10 +825,10 @@ std::shared_ptr<HashJoin> Executor::buildCurrentLevelHashTable(
   check_valid_join_qual(qual_bin_oper);
   JoinHashTableOrError hash_table_or_error;
   if (!current_level_hash_table) {
-    if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
-        getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
-      throw QueryMustRunOnCpu();
-    }
+    // if (co.device_type == ExecutorDeviceType::GPU && getDataMgr()->getGpuMgr() &&
+    //     getDataMgr()->getGpuMgr()->getPlatform() == GpuMgrPlatform::L0) {
+    //   throw QueryMustRunOnCpu();
+    // }
     hash_table_or_error = buildHashTableForQualifier(
         qual_bin_oper,
         query_infos,
diff --git a/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h b/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
index 040f3ba01..e2237fbfe 100644
--- a/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
+++ b/omniscidb/QueryEngine/JoinHashTable/BaselineHashTable.h
@@ -33,7 +33,7 @@ class BaselineHashTable : public HashTable {
                     const size_t hash_table_size)
       : cpu_hash_table_buff_size_(hash_table_size)
       , gpu_hash_table_buff_(nullptr)
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
       , device_id_(0)
       , buffer_provider_(nullptr)
 #endif
@@ -51,14 +51,14 @@ class BaselineHashTable : public HashTable {
                     const size_t hash_table_size,
                     const size_t device_id)
       : gpu_hash_table_buff_(nullptr)
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
       , device_id_(device_id)
       , buffer_provider_(buffer_provider)
 #endif
       , layout_(layout)
       , entry_count_(entry_count)
       , emitted_keys_count_(emitted_keys_count) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     CHECK(buffer_provider_);
     gpu_hash_table_buff_ = GpuAllocator::allocGpuAbstractBuffer(
         buffer_provider_, hash_table_size, device_id_);
@@ -68,7 +68,7 @@ class BaselineHashTable : public HashTable {
   }
 
   ~BaselineHashTable() override {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     if (gpu_hash_table_buff_) {
       CHECK(buffer_provider_);
       buffer_provider_->free(gpu_hash_table_buff_);
@@ -108,7 +108,7 @@ class BaselineHashTable : public HashTable {
 
   size_t cpu_hash_table_buff_size_;
   Data_Namespace::AbstractBuffer* gpu_hash_table_buff_;
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   const size_t device_id_;
   BufferProvider* buffer_provider_;
 #endif
diff --git a/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp b/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
index d9a2e6b9a..0dd60aea1 100644
--- a/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
+++ b/omniscidb/QueryEngine/JoinHashTable/BaselineJoinHashTable.cpp
@@ -165,7 +165,7 @@ std::string BaselineJoinHashTable::toString(const ExecutorDeviceType device_type
   auto hash_table = hash_tables_for_device_[device_id];
   CHECK(hash_table);
   auto buffer_size = hash_table->getHashTableBufferSize(device_type);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
@@ -204,7 +204,7 @@ std::set<DecodedJoinHashBufferEntry> BaselineJoinHashTable::toSet(
   auto hash_table = getHashTableForDevice(device_id);
   CHECK(hash_table);
   auto buffer_size = hash_table->getHashTableBufferSize(device_type);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
@@ -375,7 +375,7 @@ std::pair<size_t, size_t> BaselineJoinHashTable::approximateTupleCount(
     }
     return std::make_pair(hll_size(hll_result, count_distinct_desc.bitmap_sz_bits), 0);
   }
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::vector<std::vector<uint8_t>> host_hll_buffers(device_count_);
   for (auto& host_hll_buffer : host_hll_buffers) {
@@ -409,11 +409,19 @@
         nullptr);
     const auto key_handler_gpu = transfer_flat_object_to_gpu(key_handler, allocator);
+#ifdef HAVE_CUDA
     approximate_distinct_tuples_on_device(
         reinterpret_cast<uint8_t*>(device_hll_buffer),
         count_distinct_desc.bitmap_sz_bits,
         key_handler_gpu,
         columns_for_device.join_columns[0].num_elems);
+#else
+    approximate_distinct_tuples_on_l0(reinterpret_cast<uint8_t*>(device_hll_buffer),
+                                      nullptr,
+                                      count_distinct_desc.bitmap_sz_bits,
+                                      columns_for_device.join_columns[0].num_elems,
+                                      key_handler_gpu);
+#endif
 
     auto& host_hll_buffer = host_hll_buffers[device_id];
     buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&host_hll_buffer[0]),
@@ -675,7 +683,7 @@ int BaselineJoinHashTable::initHashTableForDevice(
     // but the query runs on GPU (join on dictionary encoded columns).
     // Don't transfer the buffer if there was an error since we'll bail anyway.
     if (memory_level_ == Data_Namespace::GPU_LEVEL && !err) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
       BaselineJoinHashTableBuilder builder;
 
       builder.allocateDeviceMemory(hashtable_layout,
@@ -706,7 +714,7 @@
 #endif
     }
   } else {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     BaselineJoinHashTableBuilder builder;
 
     GpuAllocator allocator(executor_->getBufferProvider(), device_id);
diff --git a/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h b/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h
index 7cf6c0668..b522201ea 100644
--- a/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h
+++ b/omniscidb/QueryEngine/JoinHashTable/Builders/BaselineHashTableBuilder.h
@@ -21,6 +21,7 @@
 #include "QueryEngine/JoinHashTable/BaselineJoinHashTable.h"
 #include "QueryEngine/JoinHashTable/Runtime/HashJoinKeyHandlers.h"
 #include "QueryEngine/JoinHashTable/Runtime/JoinHashTableGpuUtils.h"
+#include "SOME_PATH/l0_physops/hash_table/BaselineHashTable/BaselineHashTableBuilder.h"
 #include "Shared/thread_count.h"
@@ ... @@ template <class KEY_HANDLER>
     auto buffer_provider = executor->getBufferProvider();
     GpuAllocator allocator(buffer_provider, device_id);
+#ifdef HAVE_CUDA
     auto dev_err_buff = reinterpret_cast<CUdeviceptr>(allocator.alloc(sizeof(int)));
+#else
+    auto dev_err_buff = reinterpret_cast<int8_t*>(allocator.alloc(sizeof(int)));
+#endif
+
     buffer_provider->copyToDevice(reinterpret_cast<int8_t*>(dev_err_buff),
                                   reinterpret_cast<const int8_t*>(&err),
                                   sizeof(err),
@@ -431,24 +437,41 @@
     const auto key_handler_gpu = transfer_flat_object_to_gpu(*key_handler, allocator);
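+    // CUDA builds keep the *_on_device entry points; L0 builds dispatch to the
+    // *_on_l0 counterparts pulled in from the external hash_table library,
+    // which are assumed to mirror the device signatures.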
     switch (key_component_width) {
       case 4:
+#ifdef HAVE_CUDA
         init_baseline_hash_join_buff_on_device_32(gpu_hash_table_buff,
                                                   keyspace_entry_count,
                                                   key_component_count,
                                                   layout == HashType::OneToOne,
                                                   -1);
+#else
+        init_baseline_hash_join_buff_on_l0(gpu_hash_table_buff,
+                                           keyspace_entry_count,
+                                           key_component_count,
+                                           layout == HashType::OneToOne,
+                                           -1);
+#endif
         break;
       case 8:
+#ifdef HAVE_CUDA
         init_baseline_hash_join_buff_on_device_64(gpu_hash_table_buff,
                                                   keyspace_entry_count,
                                                   key_component_count,
                                                   layout == HashType::OneToOne,
                                                   -1);
+#else
+        init_baseline_hash_join_buff_on_l0(gpu_hash_table_buff,
+                                           keyspace_entry_count,
+                                           key_component_count,
+                                           layout == HashType::OneToOne,
+                                           -1);
+#endif
         break;
       default:
         UNREACHABLE();
     }
     switch (key_component_width) {
       case 4: {
+#ifdef HAVE_CUDA
         fill_baseline_hash_join_buff_on_device(
             gpu_hash_table_buff,
             keyspace_entry_count,
@@ -459,6 +482,17 @@
             reinterpret_cast<int*>(dev_err_buff),
             key_handler_gpu,
             join_columns.front().num_elems);
+#else
+        fill_baseline_hash_join_buff_on_l0(gpu_hash_table_buff,
+                                           keyspace_entry_count,
+                                           -1,
+                                           for_semi_join,
+                                           key_component_count,
+                                           layout == HashType::OneToOne,
+                                           reinterpret_cast<int*>(dev_err_buff),
+                                           key_handler_gpu,
+                                           join_columns.front().num_elems);
+#endif
         buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&err),
                                         reinterpret_cast<const int8_t*>(dev_err_buff),
                                         sizeof(err),
         break;
       }
       case 8: {
+#ifdef HAVE_CUDA
         fill_baseline_hash_join_buff_on_device(
             gpu_hash_table_buff,
             keyspace_entry_count,
@@ -476,6 +511,18 @@
             reinterpret_cast<int*>(dev_err_buff),
             key_handler_gpu,
             join_columns.front().num_elems);
+#else
+        fill_baseline_hash_join_buff_on_l0(
+            gpu_hash_table_buff,
+            keyspace_entry_count,
+            -1,
+            for_semi_join,
+            key_component_count,
+            layout == HashType::OneToOne,
+            reinterpret_cast<int*>(dev_err_buff),
+            key_handler_gpu,
+            join_columns.front().num_elems);
+#endif
         buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&err),
                                         reinterpret_cast<const int8_t*>(dev_err_buff),
                                         sizeof(err),
@@ -492,11 +539,17 @@
     const auto entry_size = key_component_count * key_component_width;
     auto one_to_many_buff = reinterpret_cast<int32_t*>(
         gpu_hash_table_buff + keyspace_entry_count * entry_size);
+#ifdef HAVE_CUDA
     init_hash_join_buff_on_device(one_to_many_buff, keyspace_entry_count, -1);
+#else
+    init_hash_join_buff_on_l0(one_to_many_buff, keyspace_entry_count, -1);
+#endif
+
     setHashLayout(layout);
     switch (key_component_width) {
       case 4: {
         const auto composite_key_dict = reinterpret_cast<int32_t*>(gpu_hash_table_buff);
+#ifdef HAVE_CUDA
         fill_one_to_many_baseline_hash_table_on_device(
             one_to_many_buff,
             composite_key_dict,
@@ -505,11 +558,19 @@
             key_component_count,
             key_handler_gpu,
             join_columns.front().num_elems);
-
+#else
+        fill_one_to_many_baseline_hash_table_on_l0(one_to_many_buff,
+                                                   composite_key_dict,
+                                                   keyspace_entry_count,
+                                                   -1,
+                                                   key_handler_gpu,
+                                                   join_columns.front().num_elems);
+#endif
         break;
       }
       case 8: {
         const auto composite_key_dict = reinterpret_cast<int64_t*>(gpu_hash_table_buff);
+#ifdef HAVE_CUDA
         fill_one_to_many_baseline_hash_table_on_device(
             one_to_many_buff,
             composite_key_dict,
@@ -518,7 +579,14 @@
             key_component_count,
             key_handler_gpu,
             join_columns.front().num_elems);
-
+#else
+        fill_one_to_many_baseline_hash_table_on_l0(one_to_many_buff,
+                                                   composite_key_dict,
+                                                   keyspace_entry_count,
+                                                   -1,
+                                                   key_handler_gpu,
+                                                   join_columns.front().num_elems);
+#endif
         break;
       }
       default:
diff --git a/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h b/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h
index 71929c0e8..652c1c988 100644
--- a/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h
+++ b/omniscidb/QueryEngine/JoinHashTable/Builders/PerfectHashTableBuilder.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "QueryEngine/JoinHashTable/PerfectHashTable.h"
+#include "SOME_PATH/l0_physops/hash_table/PerfectHashTable/PerfectHashTableBuilder.h"
 
 #include "Shared/scope.h"
 
@@ -30,7 +31,7 @@ class PerfectJoinHashTableBuilder {
                             const int device_id,
                             const int device_count,
                             const Executor* executor) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     const size_t total_count = layout == HashType::OneToOne
                                    ? hash_entry_info.getNormalizedHashEntryCount()
@@ -48,7 +49,7 @@ class PerfectJoinHashTableBuilder {
 #endif  // HAVE_CUDA
   }
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   void initHashTableOnGpu(const ChunkKey& chunk_key,
                           const JoinColumn& join_column,
                           const ExpressionRange& col_range,
@@ -69,8 +70,13 @@
       buffer_provider->free(gpu_hash_table_err_buff);
     };
     CHECK(gpu_hash_table_err_buff);
-    auto dev_err_buff =
-        reinterpret_cast<CUdeviceptr>(gpu_hash_table_err_buff->getMemoryPtr());
+#ifdef HAVE_CUDA
+    auto dev_err_buff =
+        reinterpret_cast<CUdeviceptr>(gpu_hash_table_err_buff->getMemoryPtr());
+#else
+    auto dev_err_buff =
+        reinterpret_cast<int8_t*>(gpu_hash_table_err_buff->getMemoryPtr());
+#endif
     int err{0};
     buffer_provider->copyToDevice(reinterpret_cast<int8_t*>(dev_err_buff),
                                   reinterpret_cast<const int8_t*>(&err),
                                   sizeof(err),
@@ -79,10 +85,16 @@
     CHECK(hash_table_);
     auto gpu_hash_table_buff = hash_table_->getGpuBuffer();
-
+#ifdef HAVE_CUDA
     init_hash_join_buff_on_device(reinterpret_cast<int32_t*>(gpu_hash_table_buff),
                                   hash_entry_info.getNormalizedHashEntryCount(),
                                   hash_join_invalid_val);
+#else
+    init_hash_join_buff_on_l0(reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+                              hash_entry_info.getNormalizedHashEntryCount(),
+                              hash_join_invalid_val);
+#endif
+
     if (chunk_key.empty()) {
       return;
     }
@@ -101,6 +113,7 @@
                                  get_join_column_type_kind(type)};
     auto use_bucketization = inner_col->type()->isDate();
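+    // Same CUDA/L0 split for the perfect-hash fill kernels; note the L0
+    // bucketized fill additionally takes placeholder arguments (NULL, 0)
+    // and the device error buffer explicitly.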
     if (layout == HashType::OneToOne) {
+#ifdef HAVE_CUDA
       fill_hash_join_buff_on_device_bucketized(
           reinterpret_cast<int32_t*>(gpu_hash_table_buff),
           hash_join_invalid_val,
@@ -109,22 +122,54 @@
           join_column,
           type_info,
           hash_entry_info.bucket_normalization);
+#else
+      fill_hash_join_buff_bucketized_on_l0(
+          reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+          hash_join_invalid_val,
+          for_semi_anti_join(join_type),
+          join_column,
+          type_info,
+          NULL,
+          0,
+          hash_entry_info.bucket_normalization,
+          reinterpret_cast<int*>(dev_err_buff));
+#endif
     } else {
       if (use_bucketization) {
+#ifdef HAVE_CUDA
         fill_one_to_many_hash_table_on_device_bucketized(
            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
            hash_entry_info,
            hash_join_invalid_val,
            join_column,
            type_info);
+#else
+        fill_one_to_many_hash_table_on_l0_bucketized(
+            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+            hash_entry_info,
+            hash_join_invalid_val,
+            join_column,
+            type_info);
+#endif
       } else {
+#ifdef HAVE_CUDA
         fill_one_to_many_hash_table_on_device(
            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
            hash_entry_info,
            hash_join_invalid_val,
            join_column,
            type_info);
+#else
+        fill_one_to_many_hash_table_on_l0(
+            reinterpret_cast<int32_t*>(gpu_hash_table_buff),
+            hash_entry_info,
+            hash_join_invalid_val,
+            join_column,
+            type_info);
+#endif
       }
     }
     buffer_provider->copyFromDevice(reinterpret_cast<int8_t*>(&err),
                                     reinterpret_cast<const int8_t*>(dev_err_buff),
diff --git a/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp b/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp
index c76b3c524..5501b0538 100644
--- a/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp
+++ b/omniscidb/QueryEngine/JoinHashTable/HashJoin.cpp
@@ -363,13 +363,17 @@ int64_t HashJoin::getJoinHashBuffer(const ExecutorDeviceType device_type,
   }
   CHECK(hash_tables_for_device_[device_id]);
   auto hash_table = hash_tables_for_device_[device_id].get();
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   if (device_type == ExecutorDeviceType::CPU) {
     return reinterpret_cast<int64_t>(hash_table->getCpuBuffer());
   } else {
     CHECK(hash_table);
     const auto gpu_buff = hash_table->getGpuBuffer();
+#ifdef HAVE_CUDA
     return reinterpret_cast<CUdeviceptr>(gpu_buff);
+#else
+    return reinterpret_cast<int64_t>(gpu_buff);
+#endif
   }
 #else
   CHECK(device_type == ExecutorDeviceType::CPU);
diff --git a/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp b/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
index 61f650fee..dfbfcf789 100644
--- a/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
+++ b/omniscidb/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
@@ -529,7 +529,7 @@ int PerfectJoinHashTable::initHashTableForDevice(
     return 0;
   }
 
-#ifndef HAVE_CUDA
+#if !defined(HAVE_CUDA) && !defined(HAVE_L0)
   CHECK_EQ(Data_Namespace::CPU_LEVEL, effective_memory_level);
 #endif
   int err{0};
@@ -636,7 +636,7 @@
   // Transfer the hash table on the GPU if we've only built it on CPU
   // but the query runs on GPU (join on dictionary encoded columns).
   if (memory_level_ == Data_Namespace::GPU_LEVEL) {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     auto buffer_provider = executor_->getBufferProvider();
     auto type = inner_col->type();
     CHECK(type->isString() || type->isExtDictionary());
@@ -674,7 +674,7 @@
       hash_tables_for_device_[device_id] = hash_table;
     }
   } else {
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
     PerfectJoinHashTableBuilder builder;
     CHECK_EQ(Data_Namespace::GPU_LEVEL, effective_memory_level);
     builder.allocateDeviceMemory(join_column,
@@ -895,7 +895,7 @@ std::string PerfectJoinHashTable::toString(const ExecutorDeviceType device_type,
   auto buffer = getJoinHashBuffer(device_type, device_id);
   auto buffer_size = getJoinHashBufferSize(device_type, device_id);
   auto hash_table = getHashTableForDevice(device_id);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
@@ -932,7 +932,7 @@ std::set<DecodedJoinHashBufferEntry> PerfectJoinHashTable::toSet(
   auto buffer = getJoinHashBuffer(device_type, device_id);
   auto buffer_size = getJoinHashBufferSize(device_type, device_id);
   auto hash_table = getHashTableForDevice(device_id);
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA) || defined(HAVE_L0)
   auto buffer_provider = executor_->getBufferProvider();
   std::unique_ptr<int8_t[]> buffer_copy;
   if (device_type == ExecutorDeviceType::GPU) {
diff --git a/omniscidb/Tests/IntelGPUEnablingTest.cpp b/omniscidb/Tests/IntelGPUEnablingTest.cpp
index 2ab58b202..0fad397d7 100644
--- a/omniscidb/Tests/IntelGPUEnablingTest.cpp
+++ b/omniscidb/Tests/IntelGPUEnablingTest.cpp
@@ -56,6 +56,16 @@ struct ExecuteTestBase {
     }
   }
 
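+  // Inserts each tuple into both the SQLite reference database and HDK so the
+  // two sides stay in sync for result comparison.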
insertCsvValues("test_inner_loop_join", "9,7,10"); + const std::vector tuples{ + "7,43,2", + "8,2,11", + "9,7,10", + "7,9,10", + }; + insertSqliteAndHDK("test_inner_loop_join", tuples); } static void createSmallTestsTable() { @@ -300,6 +310,13 @@ TEST_F(JoinTest, SimpleJoin) { g_dt); } +TEST_F(JoinTest, HashJoin) { + // Baseline OneToOne + c("SELECT a.x FROM test_inner_loop_join as a, small_tests as b WHERE (a.x = b.x) AND " + "(a.y = b.x) ", + g_dt); +} + class AggregationTest : public ExecuteTestBase, public ::testing::Test {}; TEST_F(AggregationTest, StandaloneCount) {