Skip to content
Draft
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
70323c9
Init: add examples
haoyanwa Feb 20, 2025
4162599
Input and output DMA.
haoyanwa Feb 21, 2025
34f0d82
Added streaming beat control signal.
haoyanwa Feb 21, 2025
951a1f6
Restartable kernel for io_parallel.
haoyanwa Feb 21, 2025
8445de7
Updated oneAPI backend testbench.
haoyanwa Feb 24, 2025
0d21e99
Updated oneAPI template: io_stream kernel template.
haoyanwa Feb 24, 2025
257385a
Remove temp files.
haoyanwa Feb 24, 2025
0b8ef13
Refactoring oneAPI backend myproject_test.
haoyanwa Feb 24, 2025
cf98216
Merge branch 'fastmachinelearning:main' into oneapi_backend/experiment
haoyanwa Feb 24, 2025
70054aa
Cosmetic change.
haoyanwa Feb 24, 2025
c307715
oneAPI backend simulation support.
haoyanwa Feb 25, 2025
454d556
Merge branch 'main' into oneapi_backend/experiment
jmitrevs Mar 6, 2025
7e028e6
pre-commit fixes
jmitrevs Mar 26, 2025
97c187d
Merge branch 'main' into oneapi_backend/experiment
jmitrevs Mar 26, 2025
00f82a3
oneAPI BSP support.
haoyanwa Apr 1, 2025
496846d
User API and documentation.
haoyanwa Apr 1, 2025
84ad787
Merge pull request #1254 from haoyanwa/oneapi_backend/experiment
jmitrevs Apr 2, 2025
120c2e4
pre-commit fixes
jmitrevs Apr 2, 2025
e2cec76
Merge branch 'main' into oneapi_backend/experiment
jmitrevs Apr 16, 2025
d869a5c
update convert_data and convert_data_back to use packets
jmitrevs Apr 17, 2025
7e2e747
consolidate convert_data and DMA_convert_data in nnet_data_movement.h
jmitrevs May 1, 2025
0b3dbeb
update all the activations
jmitrevs May 2, 2025
36881e0
migrate batchnorm to restartatabe
jmitrevs May 2, 2025
60c0f42
Merge remote-tracking branch 'upstream/main' into jm_oneAPI_experiment
jmitrevs Jun 11, 2025
44ee08f
pre-commit fix
jmitrevs Jun 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions hls4ml/backends/oneapi/oneapi_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para
'WriteTar': write_tar,
}

if 'use_bsp' in _:
config['IS_BSP'] = True

return config

def compile(self, model):
Expand Down
28 changes: 19 additions & 9 deletions hls4ml/backends/oneapi/oneapi_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,18 @@ def definition_cpp(self, name_suffix='', as_reference=False):
else:
return f'{self.type.name} {self.name}{name_suffix}'

def declare_cpp(self, pipe_min_size=0, indent=''):
lines = indent + f'class {self.pipe_id};\n'
lines += indent + (
f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
+ f'{self.type.name}, {pipe_min_size}, PipeProps>;\n'
# Updated pipe min size to be 32 for simulation.
def declare_cpp(self, pipe_min_size=32, indent=''):
# Updated to use streaming beat for restartable streaming kernel.
# Streaming beat is a wrapper type of the actual type with sideband control signals.
# Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat<DataT, eop, empty>;
streaming_beat_t = f"{self.pipe_name}BeatT"
lines = (
f"{indent}class {self.pipe_id};\n"
f"{indent}using {streaming_beat_t} = "
f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<"
f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n"
)
return lines

Expand All @@ -193,10 +200,13 @@ def definition_cpp(self, name_suffix='', as_reference=True):
return f'{self.name}{name_suffix}'

def declare_cpp(self, indent=''):
lines = indent + f'class {self.pipe_id};\n'
lines += indent + (
f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
+ f'{self.type.name}, {self.pragma[-1]}>;\n'
streaming_beat_t = f"{self.pipe_name}BeatT"
lines = (
f"{indent}class {self.pipe_id};\n"
f"{indent}using {streaming_beat_t} = "
f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
f"{indent}using {self.pipe_name} = "
f"sycl::ext::intel::experimental::pipe<{self.pipe_id}, {streaming_beat_t}, {self.pragma[-1]}>;\n"
)
return lines

Expand Down
5 changes: 3 additions & 2 deletions hls4ml/templates/oneapi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@ set(LIBRARY_NAME myproject-${LIB_STAMP})
# You can also specify a device family (E.g. "Arria10" or "Stratix10") or a
# specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
if(NOT DEFINED FPGA_DEVICE)
set(FPGA_DEVICE "Arria10")
set(FPGA_DEVICE "Agilex7")
endif()

# Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend
# compilation.
set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS})
# -Xsoptimize=latency Turns off the hyper-optimized handshake
set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency)

# Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation.
set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS})
Expand Down
111 changes: 111 additions & 0 deletions hls4ml/templates/oneapi/firmware/myproject.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,117 @@
// currently this is fixed
using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>));

// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA.
// They are connected to the first and the last layer to stream data into and out from the kernel.
using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties(
sycl::ext::intel::experimental::ready_latency<0>, sycl::ext::intel::experimental::bits_per_symbol<8>,
sycl::ext::intel::experimental::uses_valid<true>, sycl::ext::intel::experimental::first_symbol_in_high_order_bits<true>,
sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready));

namespace nnet {

#if !defined(IS_BSP)
// Definition for buffer locations for Avalon MM host.
inline constexpr unsigned kInputBufferLocation = 0;
inline constexpr unsigned kOutputBufferLocation = 1;
#endif

// Implementation of a direct memory access kernel. Move data from source, convert,
// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow.
template <class src_T, class dest_pipe> struct DMA_convert_data {
#if !defined(IS_BSP)
// When targeting a device family, we instantiate an Avalon Memory Mapped Host for
Copy link
Contributor Author

@jmitrevs jmitrevs Apr 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if all the DMA_convert_data things should be moved to a different file. In the SYCL HLS style they are effectively part of the testbench, so I think should be in a different file. In the accelerator flow, they still are different kernels, utility kernels in a way, so I think they should be separate.

// data transaction between host and the DMA kernel during emulation and simulation.
sycl::ext::oneapi::experimental::annotated_arg<
src_T *,
decltype(sycl::ext::oneapi::experimental::properties{
sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>,
sycl::ext::intel::experimental::buffer_location<kInputBufferLocation>,
sycl::ext::intel::experimental::read_write_mode_read, sycl::ext::intel::experimental::wait_request_requested})>
#else
// When targeting oneAPI BSP, we can use USM pointer to access host memory.
src_T *const
#endif
src;
size_t num_iteration;

[[intel::kernel_args_restrict]] void operator()() const {

#if defined(IS_BSP)
// Access data using host pointer.
sycl::ext::intel::host_ptr<src_T> src_ptr(src);
#else
// Host allocation is not supported when targeting an FPGA family or part number.
src_T *src_ptr(src);
#endif
// First, extract the PipeDataT from the pipe
using PipeDataType = typename nnet::ExtractPipeType<dest_pipe>::value_type;
// Then, extract the DataT from StreamingBeat
using DstDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
constexpr auto dstTypeSize = std::tuple_size<DstDataType>{};

[[intel::fpga_register]] typename nnet::ExtractPipeType<dest_pipe>::value_type packet;

// Keep sending data to the input layer and keep the kernels running.
for (size_t i = 0; i < num_iteration; i++) {
#pragma unroll
for (size_t j = 0; j < dstTypeSize; j++) {
packet.data[j] = src_ptr[i * dstTypeSize + j];
}
packet.sop = (i == 0);
// Assert end-of-packet signal after the last iteration.
// All down-stream kernels will stop seeing eop.
packet.eop = (i == (num_iteration - 1));
dest_pipe::write(packet);
}
}
};

// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and
// writes result to memory.
template <class src_pipe, class dst_T> struct DMA_convert_data_back {
#if !defined(IS_BSP)
// Without BSP, instantiate an Avalon Memory Mapped Host to write to host.
sycl::ext::oneapi::experimental::annotated_arg<
dst_T *,
decltype(sycl::ext::oneapi::experimental::properties{
sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>,
sycl::ext::intel::experimental::buffer_location<kOutputBufferLocation>,
sycl::ext::intel::experimental::read_write_mode_write, sycl::ext::intel::experimental::wait_request_requested})>
#else
// USM pointer, otherwise.
dst_T *const
#endif
dst;
size_t num_iteration;

[[intel::kernel_args_restrict]] void operator()() const {
#if defined(IS_BSP)
sycl::ext::intel::host_ptr<dst_T> dst_ptr(dst);
#else
dst_T *dst_ptr(dst);
#endif
// First, extract the PipeDataT from the pipe
using PipeDataType = typename nnet::ExtractPipeType<src_pipe>::value_type;
// Then, extract the DataT from StreamingBeat
using SrcDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
constexpr auto srcTypeSize = std::tuple_size<SrcDataType>{};

[[intel::fpga_register]] typename nnet::ExtractPipeType<src_pipe>::value_type packet;

// Drain the output pipe and write result to memory.
for (size_t i = 0; i < num_iteration; i++) {
packet = src_pipe::read();
#pragma unroll
for (size_t j = 0; j < srcTypeSize; j++) {
dst_ptr[i * srcTypeSize + j] = static_cast<dst_T>(packet.data[j].to_double());
}
}
}
};

} // namespace nnet

// Need to declare the input and output pipes

// hls-fpga-machine-learning insert inputs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,31 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void linear_stream
// *************************************************
// ReLU Activation
// *************************************************
template <class data_pipe, class res_pipe, typename CONFIG_T> void relu_stream() {
template <class data_pipe, class res_pipe, typename CONFIG_T> [[intel::use_stall_enable_clusters]] void relu_stream() {
using namespace nnet;
using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;
[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;

bool keep_going = true;
ReLUActLoop:
[[intel::initiation_interval(
1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; i++) {
auto in_data = data_pipe::read();
typename ExtractPipeType<res_pipe>::value_type out_data;
[[intel::initiation_interval(1)]] while (keep_going) {
for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<ResT>{}; i++) {
[[intel::fpga_register]] auto in_data = data_pipe::read();
ReLUPackLoop:
#pragma unroll
for (int j = 0; j < std::tuple_size<ResT>{}; j++) {
if (in_data.data[j] > 0)
out_data.data[j] = in_data.data[j];
else
out_data.data[j] = 0;
}

ReLUPackLoop:
#pragma unroll
for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
if (in_data[j] > 0)
out_data[j] = in_data[j];
else
out_data[j] = 0;
}
out_data.sop = in_data.sop;
out_data.eop = in_data.eop;
res_pipe::write(out_data);

res_pipe::write(out_data);
keep_going = !in_data.eop;
}
}
}

Expand Down
33 changes: 26 additions & 7 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,34 @@

namespace nnet {

// Note: DataPack logic removed, at least in the initial version
// Restartable streaming kernel implementation.
// Computation is carried out in a while-1 loop as long as there is valid input.
// The loop breaks when the end-of-packet signal is asserted by upstream task.
template <class data_pipe, class res_pipe, typename CONFIG_T>
void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
[[intel::use_stall_enable_clusters]] void dense_resource_stream(const typename CONFIG_T::weight_t weights,
const typename CONFIG_T::bias_t biases) {
using namespace nnet;
using DataT = typename ExtractDataType<typename ExtractPipeType<data_pipe>::value_type>::value_type;
using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;

[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res;
[[intel::fpga_register]] auto data = data_pipe::read();
dense_resource<typename ExtractPipeType<data_pipe>::value_type, typename ExtractPipeType<res_pipe>::value_type,
CONFIG_T>(data, res, weights, biases);
res_pipe::write(res);
[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type resbeat;

bool keep_going = true;
bool did_read_input;
[[intel::initiation_interval(1)]] while (keep_going) {
did_read_input = false;
[[intel::fpga_register]] auto databeat = data_pipe::read(did_read_input);

if (did_read_input) {
dense_resource<DataT, ResT, CONFIG_T>(databeat.data, resbeat.data, weights, biases);

resbeat.sop = databeat.sop;
resbeat.eop = databeat.eop;

res_pipe::write(resbeat);
keep_going = !databeat.eop;
}
}
}

} // namespace nnet
Expand Down
11 changes: 11 additions & 0 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <tuple>
#include <utility>

#include <sycl/ext/intel/prototype/pipes_ext.hpp> // Streaming Beat and pipe properties.

namespace nnet {

// Define the pipe type that we use
Expand All @@ -34,6 +36,15 @@ struct ExtractPipeType<PipeClass<PipeName, PipeDataT, kPipeMinCapacity, PipeProp
typedef PipeDataT value_type;
};

// Helper template for extracting datatype from oneAPI StreamingBeat type.
template <typename T> struct ExtractDataType { typedef T value_type; };

// Specialization on oneAPI StreamingBeat type.
template <typename DataT, bool EnableSOP, bool EnableEmpty>
struct ExtractDataType<sycl::ext::intel::experimental::StreamingBeat<DataT, EnableSOP, EnableEmpty>> {
typedef DataT value_type;
};

/*
* HLS Shift Register Implementation
* To verify a shift register is used in hardware, go to report.html > Area Analysis of System
Expand Down
Loading
Loading