Parallel conv partial fix #1380
base: main
Changes from 9 commits
Commits: f7535b7, e88b856, 7a0ee88, d077b72, f349e41, 39d416b, 7a6f1db, 343ae5c, e530e18, 43b8cee, 8143d75, 4aaf11c
@@ -101,8 +101,11 @@ def format(self, node):
             and node.get_attr('strategy').lower() == 'latency'
             and node.model.config.get_config_value('IOType') == 'io_parallel'
         )
-        if is_pointwise_parallel_latency:
-            params['conv_fn'] = f'{namespace}::pointwise_conv_{node.index}'
+        n_partitions = node.attributes['n_partitions']
+
+        if is_pointwise_parallel_latency and n_partitions == 1:
+            params['conv_fn'] = 'nnet::BatchedDenseForConv1D'
         else:
             if node.get_attr('strategy').lower() == 'latency':
                 params['conv_fn'] = 'nnet::Conv1DLatency'
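For context on where `n_partitions` enters the flow, here is a hedged sketch of the user-facing configuration. The layer name and sizes are placeholders, and the relation `n_partitions = out_width / ParallelizationFactor` is an assumption based on the usual `io_parallel` convolution behaviour, not something stated in this diff.

```python
# Typically produced by hls4ml.utils.config_from_keras_model(model, granularity='name');
# written out by hand here so the sketch is self-contained.
config = {
    'Model': {'Strategy': 'Latency', 'ReuseFactor': 1, 'Precision': 'ap_fixed<16,6>'},
    'LayerName': {
        'pointwise_conv1d': {   # placeholder layer name
            'Strategy': 'Latency',
            # Assumed: n_partitions = out_width / ParallelizationFactor, so a
            # parallelization factor equal to out_width (here 32) gives the
            # n_partitions == 1 case that now selects nnet::BatchedDenseForConv1D.
            'ParallelizationFactor': 32,
        },
    },
}
```

With `io_type='io_parallel'` and any smaller parallelization factor (`n_partitions > 1`), the template above falls back to the regular `Conv1DLatency`/`Conv1DResource` kernels.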
@@ -115,11 +118,11 @@ def format(self, node):
         conv_config = self.template.format(**params)

         mult_params = self._default_config_params(node)
-        if is_pointwise_parallel_latency:
-            mult_params['n_in'] = int(
-                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
+        if is_pointwise_parallel_latency and n_partitions == 1:
+            mult_params['n_in'] = (
+                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') // n_partitions
             )
-            mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
+            mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // n_partitions
         else:
             mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
             mult_params['n_out'] = node.get_attr('n_filt')

Review thread on the `n_partitions == 1` condition:
- Reviewer: Doesn't this restrict the whole function to a single (fully parallel) case? Also, if we ever fix the HLS code to make it work for any combination of PF/RF, will this line be the only change we need to make on the Python side? If so, maybe document it.
- Author: This does restrict it to the fully parallel case, since otherwise the performance was worse than the standard implementation. Will need to check again with the last pragma change...
- Reviewer: I thought that ...
- Author: In vivado_hls it fails, but in Vitis it just ignored the pragma.

Review thread on the `// n_partitions` divisor:
- Reviewer: So basically, this change now forces the parallelization to be based on ...
- Author: The current behavior of replacing PF by RF is an undocumented change. This only aligns it back to the standard convolution behavior (albeit still not fully working). With the last pragma change maybe it is fixed, but will need to check again.
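A quick numeric check of the new `mult_params` arithmetic, with illustrative layer sizes that are not taken from the PR:

```python
# Placeholder pointwise-conv dimensions, chosen only to evaluate the formulas above.
in_width, n_chan, n_filt, filt_width = 32, 16, 8, 1
n_partitions, reuse = 1, 4

# New behaviour: the dense block size scales with n_partitions (fully parallel here).
n_in = in_width * n_chan * filt_width // n_partitions    # 512
n_out = in_width * n_filt // n_partitions                # 256

# Old behaviour: the sizes were divided by the reuse factor instead.
old_n_in = int(in_width * n_chan * filt_width / reuse)   # 128
old_n_out = int(in_width * n_filt / reuse)               # 64
```

The old division by `reuse` is the "replacing PF by RF" behaviour mentioned in the thread above; the new code ties the dense block size back to the partitioning.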
This file was deleted.
@@ -37,7 +37,13 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
                 typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                 typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // Vitis2025.1 hangs in RTL simulation with this, though
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }
@@ -50,7 +56,12 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
     assert(CONFIG_T::filt_width == 1);

     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }
@@ -61,7 +72,7 @@ class Conv1DLatency : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        //#pragma HLS INLINE region
+        // #pragma HLS INLINE recursive
         conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };

Review thread on the commented-out pragma:
- Reviewer: This is already disabled, and now remains disabled but with a different action. Why?
- Author: Was enabled and disabled when changing things around. Though, since INLINE region doesn't exist in Vitis, the current one seems to make a bit more sense...
@@ -72,11 +83,49 @@ class Conv1DResource : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        //#pragma HLS INLINE region
+        // #pragma HLS INLINE recursive
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };

+template <class data_T, class res_T, typename CONFIG_T>
+class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+        #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+        #pragma HLS INLINE RECURSIVE
+        data_T data_tmp[CONFIG_T::n_partitions][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
+        res_T res_tmp[CONFIG_T::n_partitions][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions + ii];
+            }
+        }
+
+        #pragma HLS ALLOCATION function instances=nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T> limit=1
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
+        }
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions + ii] = res_tmp[jj][ii];
+            }
+        }
+    }
+};

 } // namespace nnet

 #endif

Review thread on the new BatchedDenseForConv1D kernel:
- Reviewer: Can we add a comment to say the purpose of this code (and also for the 1D version)?
- Author: The PR allows modification from contributors. Feel free to add some if you find it necessary.
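Regarding the request above for a comment describing the purpose of this kernel, the behaviour of `BatchedDenseForConv1D` can be summarised with a small NumPy reference model. This is a sketch: it assumes the weights form an `n_chan x n_filt` matrix and that `out_width == in_width` for a stride-1 pointwise convolution; the actual hls4ml weight ordering is not shown in this diff.

```python
import numpy as np

def batched_dense_for_conv1d(data, weights, biases, in_width, n_chan, n_filt, n_partitions):
    """Mirror of the HLS kernel above: split the flat [in_width * n_chan] input into
    n_partitions contiguous width chunks, apply the same 1x1 (pointwise) convolution
    to each chunk as a dense matrix multiply over channels, then concatenate."""
    assert in_width % n_partitions == 0
    pixels = data.reshape(in_width, n_chan)
    chunks = np.split(pixels, n_partitions, axis=0)            # data_tmp[jj] in the HLS code
    outs = [chunk @ weights + biases for chunk in chunks]      # one pointwise conv per partition
    return np.concatenate(outs, axis=0).reshape(in_width * n_filt)  # res_tmp copied back to res

# Toy shapes (hypothetical values, only to exercise the reference model):
rng = np.random.default_rng(0)
data = rng.normal(size=8 * 4)                  # in_width=8, n_chan=4
w = rng.normal(size=(4, 3))                    # assumed (n_chan, n_filt) layout
b = np.zeros(3)
out = batched_dense_for_conv1d(data, w, b, in_width=8, n_chan=4, n_filt=3, n_partitions=2)
assert out.shape == (8 * 3,)
```

Since the template change above restricts this kernel to `n_partitions == 1` for now, the partition loop currently collapses to a single `pointwise_conv_1d_latency_cl` call over the whole feature map.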
Additional review thread:
- Reviewer: I mentioned this in a comment in the conversation. I think trace, precision, and reuse_factor are regenerated no matter what, so the values you copy here get overridden (unless something has changed from before). It may be an hls4ml behavior worth revisiting and potentially revising, but I don't think this change fixes anything.
- Author: `reuse_factor` defined under the Model key is not propagated as expected otherwise. Removed the warning if the update operation is trivial.
- Reviewer: Did you see why it's not propagated properly? Shouldn't it come from a configuration in the first place?
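For reference on the propagation question, a hedged sketch of where the two reuse factors live in an hls4ml config; the layer name and values are placeholders, and the open question above is whether the Model-level value actually reaches layers whose configs are regenerated.

```python
# Usually produced by hls4ml.utils.config_from_keras_model(model, granularity='name'
# for per-layer entries, or granularity='model' for only the Model block);
# spelled out by hand here so the sketch is self-contained.
config = {
    'Model': {'ReuseFactor': 4, 'Precision': 'ap_fixed<16,6>', 'Strategy': 'Latency'},
    'LayerName': {
        'conv1d': {'ReuseFactor': 2},  # per-layer override of the Model-level default
    },
}
```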