Migrate from Vitis HLS
Example 1
This tutorial shows how to convert the vector-add example from Vitis HLS to TAPA.
Update the Includes
Replace `hls_stream.h` with `tapa.h`. No changes are needed to other HLS headers such as `ap_int.h`, etc.
#include <hls_vector.h>
-#include <hls_stream.h>
+#include <tapa.h>
#include "assert.h"
Update the Top Function
Update the top function header.
Replace the pointer parameters with `tapa::mmap<T>`. Note that `tapa::mmap<T>` is passed by value. We no longer need to write `#pragma HLS interface`.
void vadd(
- hls::vector<uint32_t, NUM_WORDS>* in1,
- hls::vector<uint32_t, NUM_WORDS>* in2,
- hls::vector<uint32_t, NUM_WORDS>* out,
+ tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in1,
+ tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in2,
+ tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> out,
int size
) {
- #pragma HLS INTERFACE m_axi port = in1 bundle = gmem0
- #pragma HLS INTERFACE m_axi port = in2 bundle = gmem1
- #pragma HLS INTERFACE m_axi port = out bundle = gmem0
Update the stream definitions.
Replace `hls::stream<DATA_TYPE>` with `tapa::stream<DATA_TYPE>`. This creates a stream with the default depth of 2, as in Vitis HLS. A different depth can be specified with `tapa::stream<DATA_TYPE, FIFO_DEPTH>`. If there are stream arrays, we should use `tapa::streams<DATA_TYPE, ARRAY_SIZE, FIFO_DEPTH>`. Refer to Example 2 for details.
- hls::stream<hls::vector<uint32_t, NUM_WORDS>> in1_stream("input_stream_1");
- hls::stream<hls::vector<uint32_t, NUM_WORDS>> in2_stream("input_stream_2");
- hls::stream<hls::vector<uint32_t, NUM_WORDS>> out_stream("output_stream");
+ tapa::stream<hls::vector<uint32_t, NUM_WORDS>> in1_stream("input_stream_1");
+ tapa::stream<hls::vector<uint32_t, NUM_WORDS>> in2_stream("input_stream_2");
+ tapa::stream<hls::vector<uint32_t, NUM_WORDS>> out_stream("output_stream");
Update the task invocations.
No need for `#pragma HLS dataflow`. Use the `tapa::task().invoke()` API. The first argument is the task function; the remaining arguments are passed to the task.
Use a chain of `.invoke()` calls to invoke all tasks. Note that we only append `;` at the very end.
- #pragma HLS dataflow
- load_input(in1, in1_stream, size);
- load_input(in2, in2_stream, size);
- compute_add(in1_stream, in2_stream, out_stream, size);
- store_result(out, out_stream, size);
+ tapa::task()
+ .invoke(load_input, in1, in1_stream, size)
+ .invoke(load_input, in2, in2_stream, size)
+ .invoke(compute_add, in1_stream, in2_stream, out_stream, size)
+ .invoke(store_result, out, out_stream, size)
+ ;
Update Task Definitions
Update stream arguments.
Replace `hls::stream<DATA_TYPE>` with `tapa::istream<DATA_TYPE>` or `tapa::ostream<DATA_TYPE>`. Note that we distinguish whether a stream argument is an input stream or an output stream.
No need to specify the stream depth here.
Don’t forget to pass streams by reference (with
&
).
void compute_add(
- hls::stream<hls::vector<uint32_t, NUM_WORDS>>& in1_stream,
- hls::stream<hls::vector<uint32_t, NUM_WORDS>>& in2_stream,
- hls::stream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
+ tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& in1_stream,
+ tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& in2_stream,
+ tapa::ostream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
int size
) {
Update external memory arguments.
Replace pointers with `tapa::mmap<DATA_TYPE>`. Note that `tapa::mmap<DATA_TYPE>` is passed by value (without `*` or `&`). The `store_result` code reads from `out_stream`, so it is actually a `tapa::istream`; likewise, `load_input` writes to `inStream`, so it is actually a `tapa::ostream`. Don’t be confused by the stream names.
void store_result(
- hls::vector<uint32_t, NUM_WORDS>* out,
- hls::stream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
+ tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> out,
+ tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
int size
) {
// ...
}
void load_input(
- hls::vector<uint32_t, NUM_WORDS>* in,
- hls::stream<hls::vector<uint32_t, NUM_WORDS>>& inStream,
+ tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in,
+ tapa::ostream<hls::vector<uint32_t, NUM_WORDS>>& inStream,
int size
) {
// ...
}
Update the stream APIs if necessary.
Most APIs of `tapa::stream` are compatible with `hls::stream`.
Final Look of Example 1
#include <hls_vector.h>
#include <tapa.h>
#include "assert.h"
#define MEMORY_DWIDTH 512
#define SIZEOF_WORD 4
#define NUM_WORDS ((MEMORY_DWIDTH) / (8 * SIZEOF_WORD))
#define DATA_SIZE 4096
// Producer task: forwards elements in[0] .. in[Size - 1] to `inStream`, one
// element per iteration of a loop pipelined with II = 1.
void load_input(tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in,
                tapa::ostream<hls::vector<uint32_t, NUM_WORDS>>& inStream,
                int Size) {
  for (int idx = 0; idx < Size; ++idx) {
#pragma HLS pipeline II = 1
    inStream.write(in[idx]);
  }
}
// Compute task: on each of `Size` pipelined iterations, reads one vector from
// `in1_stream` and one from `in2_stream` and writes their `+` result to
// `out_stream`.
void compute_add(tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& in1_stream,
                 tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& in2_stream,
                 tapa::ostream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
                 int Size) {
  for (int idx = 0; idx < Size; ++idx) {
#pragma HLS pipeline II = 1
    // Both reads stay in a single expression, as in the Vitis HLS original.
    out_stream.write(in1_stream.read() + in2_stream.read());
  }
}
// Consumer task: drains `Size` vectors from `out_stream` and stores them to
// out[0] .. out[Size - 1].
// Despite its name, `out_stream` is an *input* of this task (tapa::istream).
void store_result(tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> out,
                  tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
                  int Size) {
  for (int idx = 0; idx < Size; ++idx) {
#pragma HLS pipeline II = 1
    out[idx] = out_stream.read();
  }
}
extern "C" {
// Top-level task of the vector-add design.
// It contains only stream definitions and task invocations, which is all the
// TAPA compiler allows in a top function.
//   in1, in2 - memory-mapped inputs, passed by value as tapa::mmap
//   out      - memory-mapped output, passed by value as tapa::mmap
//   size     - forwarded unchanged to every task as its loop trip count
void vadd(tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in1,
tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in2,
tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> out, int size) {
// Streams connecting the tasks; no depth is given, so the default is used.
tapa::stream<hls::vector<uint32_t, NUM_WORDS>> in1_stream("input_stream_1");
tapa::stream<hls::vector<uint32_t, NUM_WORDS>> in2_stream("input_stream_2");
tapa::stream<hls::vector<uint32_t, NUM_WORDS>> out_stream("output_stream");
// Replaces `#pragma HLS dataflow`: each .invoke() takes the task function
// followed by its arguments; a single `;` terminates the whole chain.
tapa::task()
.invoke(load_input, in1, in1_stream, size)
.invoke(load_input, in2, in2_stream, size)
.invoke(compute_add, in1_stream, in2_stream, out_stream, size)
.invoke(store_result, out, out_stream, size);
}
}
Example 2
This tutorial covers more corner cases not mentioned in Example 1.
Dataflow in a Loop
Currently TAPA does not support the “dataflow-in-a-loop” coding style. One big idea in TAPA is that we want a strict decoupling of the communication structures from the computing units. The compiler will enforce that the top function should only include:
Stream definitions
Task invocations
If your original Vitis HLS code uses the dataflow-in-a-loop style, you may push the loop into the tasks.
In the following Vitis HLS example, the dataflow region is defined within a loop to be executed for multiple iterations. However, this is not allowed in TAPA, because the loop would become additional logic that may interfere with the computing logic and thus hinder timing closure. A common approach is to push the loop into the parallel tasks, i.e., write a copy of the loop in each task inside the dataflow region.
While this restriction may seem bothersome, it ensures good timing quality of the generated hardware. Automated transformation is possible, but that remains a future enhancement.
// before
for (int i = 0; i < size; i++) { // this loop is invalid in TAPA
#pragma HLS dataflow
load_input(...);
compute_add(...);
// ...
}
void load_input(
// ...
) {
foo(); bar();
}
// after
// for (int i = 0; i < size; i++) {
tapa::task()
.invoke(load_input, in1, in1_stream, size)
.invoke(load_input, in2, in2_stream, size)
.invoke(compute_add, in1_stream, in2_stream, out_stream, size)
.invoke(store_result, out, out_stream, size)
;
// }
void load_input(
// ...
) {
for (int i = 0; i < size; i++) { // move the loop here
foo(); bar();
}
}
Computation in the Top Function
TAPA does not support computation in the top function because we want to strictly decouple communication and computation.
If your original Vitis HLS code blends computation into the dataflow region, you can push it into the specific tasks that need it.
In this example, `size /= NUM_WORDS;` is actually invalid in a TAPA top function, although it may seem trivial. Every task that uses `size` must then apply the division itself.
// before
size /= NUM_WORDS;
#pragma HLS dataflow
load_input(in1, in1_stream, size);
load_input(in2, in2_stream, size);
compute_add(in1_stream, in2_stream, out_stream, size);
store_result(out, out_stream, size);
// after
tapa::task()
.invoke(load_input, in1, in1_stream, size)
.invoke(load_input, in2, in2_stream, size)
.invoke(compute_add, in1_stream, in2_stream, out_stream, size)
.invoke(store_result, out, out_stream, size)
;
// Producer task after migration: the division that used to sit in the top
// function now happens inside the task, before the copy loop.
void load_input(
    tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in,
    tapa::ostream<hls::vector<uint32_t, NUM_WORDS>>& inStream,
    int size
) {
  size /= NUM_WORDS; // move the computation here
  for (int idx = 0; idx < size; ++idx) {
#pragma HLS pipeline II=1
    inStream.write(in[idx]);
  }
}
Final Look of Example 2
#include <hls_vector.h>
#include <tapa.h>
#include "assert.h"
#define MEMORY_DWIDTH 512
#define SIZEOF_WORD 4
#define NUM_WORDS ((MEMORY_DWIDTH) / (8 * SIZEOF_WORD))
#define DATA_SIZE 4096
// Producer task: divides `size` by NUM_WORDS, then forwards that many
// elements of `in`, starting at index 0, to `inStream`.
// The division lives here because TAPA does not allow computation in the top
// function.
void load_input(tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in,
                tapa::ostream<hls::vector<uint32_t, NUM_WORDS>>& inStream,
                int size) {
  size /= NUM_WORDS;
  for (int idx = 0; idx < size; ++idx) {
#pragma HLS pipeline II = 1
    inStream.write(in[idx]);
  }
}
// Compute task: divides `size` by NUM_WORDS, then for that many pipelined
// iterations reads one vector from each input stream and writes their `+`
// result to `out_stream`.
void compute_add(tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& in1_stream,
                 tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& in2_stream,
                 tapa::ostream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
                 int size) {
  size /= NUM_WORDS;
  for (int idx = 0; idx < size; ++idx) {
#pragma HLS pipeline II = 1
    // Both reads stay in a single expression, as in the Vitis HLS original.
    out_stream.write(in1_stream.read() + in2_stream.read());
  }
}
// Consumer task: divides `size` by NUM_WORDS, then drains that many vectors
// from `out_stream` into `out`, starting at index 0.
// Despite its name, `out_stream` is an *input* of this task (tapa::istream).
void store_result(tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> out,
                  tapa::istream<hls::vector<uint32_t, NUM_WORDS>>& out_stream,
                  int size) {
  size /= NUM_WORDS;
  for (int idx = 0; idx < size; ++idx) {
#pragma HLS pipeline II = 1
    out[idx] = out_stream.read();
  }
}
extern "C" {
// Top-level task of the vector-add design (Example 2).
// It contains only stream definitions and task invocations; `size` is passed
// through unmodified and each task divides it by NUM_WORDS itself, because
// TAPA does not support computation in the top function.
//   in1, in2 - memory-mapped inputs, passed by value as tapa::mmap
//   out      - memory-mapped output, passed by value as tapa::mmap
//   size     - forwarded unchanged to every task
void vadd(tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in1,
tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> in2,
tapa::mmap<hls::vector<uint32_t, NUM_WORDS>> out, int size) {
// Streams connecting the tasks; no depth is given, so the default is used.
tapa::stream<hls::vector<uint32_t, NUM_WORDS>> in1_stream("input_stream_1");
tapa::stream<hls::vector<uint32_t, NUM_WORDS>> in2_stream("input_stream_2");
tapa::stream<hls::vector<uint32_t, NUM_WORDS>> out_stream("output_stream");
// Replaces `#pragma HLS dataflow`: each .invoke() takes the task function
// followed by its arguments; a single `;` terminates the whole chain.
tapa::task()
.invoke(load_input, in1, in1_stream, size)
.invoke(load_input, in2, in2_stream, size)
.invoke(compute_add, in1_stream, in2_stream, out_stream, size)
.invoke(store_result, out, out_stream, size);
}
}