[Squeeze] Introduce Squeeze and Unsqueeze hardware operators #1153

Draft: wants to merge 2 commits into base: dev
32 changes: 30 additions & 2 deletions src/finn/custom_op/fpgadataflow/__init__.py
@@ -27,6 +27,36 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The base class of all generic custom operations before specializing to either
# HLS or RTL backend
from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp

# Dictionary of HWCustomOp implementations
custom_op = dict()


# Registers a class into the custom_op dictionary
# Note: This must be defined first, before importing any custom op
# implementation to avoid "importing partially initialized module" issues.
def register_custom_op(cls):
# The class must actually implement HWCustomOp
assert issubclass(cls, HWCustomOp), f"{cls} must subclass {HWCustomOp}"
# Insert the class into the custom_op dictionary by its name
custom_op[cls.__name__] = cls # noqa: Some weird type annotation issue?
# Pass through the class unmodified
return cls


# flake8: noqa
# Disable linting from here, as all imports will be flagged E402 and maybe F401


# Import the submodule containing the Squeeze operation
# Note: This will automatically register all decorated classes into this domain
import finn.custom_op.fpgadataflow.squeeze
# Import the submodule containing the Unsqueeze operation
import finn.custom_op.fpgadataflow.unsqueeze

from finn.custom_op.fpgadataflow.addstreams import AddStreams
from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp
from finn.custom_op.fpgadataflow.concat import StreamingConcat
@@ -55,8 +85,6 @@
from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU

custom_op = dict()

# make sure new HLSCustomOp subclasses are imported here so that they get
# registered and plug in correctly into the infrastructure
custom_op["MVAU"] = MVAU
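Note: the decorator added above complements the manual custom_op["..."] assignments kept for the already existing operators; any decorated class is inserted into custom_op under its class name. A minimal sketch of how a new generic hardware operator would opt in, assuming a FINN installation (the class name MyOp is purely illustrative and not part of this PR):

from finn.custom_op.fpgadataflow import register_custom_op
from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp


# Decorating the class asserts that it subclasses HWCustomOp and stores it in
# the custom_op dictionary under the key "MyOp" (hypothetical operator name),
# so a node with that op_type in this domain resolves to this implementation
@register_custom_op
class MyOp(HWCustomOp):
    pass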
36 changes: 34 additions & 2 deletions src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -26,6 +26,40 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The base class of all HWCustomOp specializations to HLS backend implementation
from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend

# The base class of all generic custom operations before specializing to either
# HLS or RTL backend
from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp

# Dictionary of HLSBackend implementations
custom_op = dict()


# Registers a class into the custom_op dictionary
# Note: This must be defined first, before importing any custom op
# implementation to avoid "importing partially initialized module" issues.
def register_custom_op(cls):
# The class must actually implement HWCustomOp
assert issubclass(cls, HWCustomOp), f"{cls} must subclass {HWCustomOp}"
# The class must also implement the HLSBackend
assert issubclass(cls, HLSBackend), f"{cls} must subclass {HLSBackend}"
# Insert the class into the custom_op dictionary by its name
custom_op[cls.__name__] = cls # noqa: Some weird type annotation issue?
# Pass through the class unmodified
return cls


# flake8: noqa
# Disable linting from here, as all imports will be flagged E402 and maybe F401

# Import the submodule containing the specialization of the Squeeze operation
# Note: This will automatically register all decorated classes into this domain
import finn.custom_op.fpgadataflow.hls.squeeze_hls
# Import the submodule containing the specialization of the Unsqueeze operation
import finn.custom_op.fpgadataflow.hls.unsqueeze_hls

from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls
from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls
from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls
@@ -53,8 +87,6 @@
from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls

custom_op = dict()

# make sure new HLSCustomOp subclasses are imported here so that they get
# registered and plug in correctly into the infrastructure
custom_op["AddStreams_hls"] = AddStreams_hls
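Note: compared to the generic domain, this register_custom_op additionally requires the HLSBackend base, so only genuine HLS specializations end up in this dictionary. A small sketch of the rejection path, assuming a FINN installation (NotSpecialized is a hypothetical class used only for illustration):

from finn.custom_op.fpgadataflow.hls import register_custom_op
from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp


# Subclasses only the generic HWCustomOp, so the second assertion in
# register_custom_op rejects it and nothing is added to custom_op
class NotSpecialized(HWCustomOp):
    pass


try:
    register_custom_op(NotSpecialized)
except AssertionError as err:
    print(err)  # e.g. "... must subclass <class '...HLSBackend'>"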
234 changes: 234 additions & 0 deletions src/finn/custom_op/fpgadataflow/hls/squeeze_hls.py
@@ -0,0 +1,234 @@
# noqa: Duplicate: The HLS implementation is identical to the Unsqueeze
# operator, maybe these should be unified...
# fmt: off
# Disable formatter. This is deliberately formatted to stay within 80 characters
# per line. Black, however, formats some lines going beyond this.

# Numpy math and arrays
import numpy as np

# Operating system stuff, e.g. paths
import os

# QONNX wrapper to ONNX model graphs
from qonnx.core.modelwrapper import ModelWrapper

# Utility for registering HLSBackend HWCustomOp implementations into the module
# scope
from finn.custom_op.fpgadataflow.hls import register_custom_op

# Base class for specializing HW operators as implemented via HLS
from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend

# The generic HW custom operator version of the operator as a base class
from finn.custom_op.fpgadataflow.squeeze import Squeeze


# HLS Backend specialization of the squeeze operator
@register_custom_op
class Squeeze_hls(Squeeze, HLSBackend): # noqa: Class name does not follow
# CapWords convention
# Node attributes matching the HLS operator
def get_nodeattr_types(self):
# Start from parent operator class attributes
attrs = Squeeze.get_nodeattr_types(self)
# Add the HLSBackend default attributes on top
attrs.update(HLSBackend.get_nodeattr_types(self))
# Add/Specialize implementation specific attributes here...
# Return the updated attributes dictionary
return attrs

# Executes squeeze operation in C++ simulation
def _execute_node_cppsim(self, context, graph): # noqa: graph unused
# Get the node wrapped by this custom op
node = self.onnx_node
# Input data is stored in numpy files in the code generation dictionary
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
# Get the input out of the execution context
inp = context[node.input[0]] # noqa: Duplicate code prepare simulation
# Validate the shape of the input
assert list(inp.shape) == self.get_normal_input_shape(ind=0), \
f"Input shape mismatch for {node.input[0]}"
# Reshape the input into folded form
inp = inp.reshape(self.get_folded_input_shape(ind=0))
# Save the folded input to file to be used by simulation
np.save(os.path.join(code_gen_dir, "inp.npy"), inp)

# Execute the precompiled model
super().exec_precompiled_singlenode_model()

# Load the output numpy file generated by the C++ simulation
out = np.load(os.path.join(code_gen_dir, "out.npy"))
# Reshape the folded output and insert into the execution context
context[node.output[0]] = out.reshape(
self.get_normal_output_shape(ind=0)
)

# Maximum width of any ap_int used in this operator
def get_ap_int_max_w(self):
# Width of the input, there is just one input
i_bits_max = self.get_instream_width(ind=0)
# Width of the output, there is just one output
o_bits_max = self.get_outstream_width(ind=0)
# Find the biggest of the inputs/outputs
return max([i_bits_max, o_bits_max])

# Generates list of C++ includes to be placed at the top of the generated
# code
def global_includes(self):
# Currently nothing to include
self.code_gen_dict["$GLOBALS$"] = []

# Generates C++ parameters file, i.e., constant initializer inputs
def generate_params(self, model: ModelWrapper, path: str):
# Squeeze has no parameters
pass

# Generates C++ code of type alias, global constant and macro definitions
def defines(self, var):
# Insert constants and type aliases into the dictionary
self.code_gen_dict["$DEFINES$"] = [
# Input and output element datatypes
f"using InpType = {self.inp_dtype.get_hls_datatype_str()};",
f"using OutType = {self.out_dtype.get_hls_datatype_str()};",
# Width of single elements to avoid using ::width attribute which is
# not present for datatype float
f"static constexpr auto InpWidth = {self.inp_dtype.bitwidth()};",
f"static constexpr auto OutWidth = {self.out_dtype.bitwidth()};",
# Datatype of elements packed into the input stream
f"using InpPacked = ap_uint<{self.get_instream_width(ind=0)}>;",
# Datatype of elements packed into the output stream
f"using OutPacked = ap_uint<{self.get_outstream_width(ind=0)}>;",
# Input and output HLS stream datatypes
"using InpStream = hls::stream<InpPacked>;",
"using OutStream = hls::stream<OutPacked>;",
]

# Generates C++ code for reading data from .npy (numpy format) for testing
# in C++ simulation
def read_npy_data(self):
# Input data is stored in numpy files in the code generation dictionary
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
# Prepare empty stream reading to append optionals
self.code_gen_dict["$READNPYDATA$"] = []
# Generate function calls for reading the input files into the input
# streams
self.code_gen_dict["$READNPYDATA$"] += [
# Generate function call reading from file into the input stream
# Note: Inputs are always represented as numpy floats
'npy2apintstream<InpPacked, InpType, InpWidth, float>(',
f'"{code_gen_dir}/inp.npy", inp_{self.hls_sname()}, false',
');'
]

# Generates C++ code for declaring all streams involved in C++ simulation
# for testing
def strm_decl(self):
# There are always one input and one output stream
self.code_gen_dict["$STREAMDECLARATIONS$"] = [
# Note: Assumes stream type aliases to be set in defines
f"InpStream inp_{self.hls_sname()};"
f"OutStream out_{self.hls_sname()};"
]

# Generates C++ code for calling the computation part of the operator
def docompute(self):
# Number of iterations required to process the whole folded input stream
# Note: This is all but the PE (last) dimension
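# For example, a hypothetical folded output shape of (1, 4, 3) gives
# 1 * 4 = 4 iterations, each moving one PE-wide packet of 3 elements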
num_iter = np.prod(self.get_folded_output_shape()[:-1])
# Write the body of the top-level function
self.code_gen_dict["$DOCOMPUTE$"] = [
# Repeat for the number of inputs
f"for(std::size_t i = 0; i < {num_iter}; ++i) {{",
# Pipeline the steps of this loop
"#pragma HLS pipeline II=1 style=flp",
# Just read from the input and immediately write the same element to
# the output. Squeezed dimensions, i.e., those with a size of 1 do
# not contribute to the number and order of elements and thus can
# simply be ignored.
f"out_{self.hls_sname()}.write(inp_{self.hls_sname()}.read());",
f"}}" # noqa: f-string symmetry
]

# Generates C++ code for reading the output stream and converting back to
# numpy format for testing in C++ simulation
def dataoutstrm(self):
# Output data will be stored in numpy files in the code generation
# dictionary
code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
# Get the expected shape of the folded output array formatted as a C++
# vector initializer
# Note: Valid formatting relies on correct placement of curly braces
# and line breaks: Open/close all three braces on the same line of code
# to avoid '\n' to be inserted into the string
shape = f"""{{{
','.join((str(i) for i in self.get_folded_output_shape(ind=0)))
}}}"""
# Generate function call for reading from the output stream into the
# output file
self.code_gen_dict["$DATAOUTSTREAM$"] = [
# Generate function call reading from stream into the output file
# Note: Outputs are always represented as numpy floats
'apintstream2npy<OutPacked, OutType, OutWidth, float>(',
f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false',
');',
]

# Generates C++ code for saving the output of C++ simulation to a file in
# numpy format
def save_as_npy(self):
# Note: This seems to be empty in ALL HLSBackends. Probably it was used
# for something before, which is now integrated into dataoutstrm()?
self.code_gen_dict["$SAVEASCNPY$"] = []

# Generates essentially the head of the C++ function from which the IP block
# will be generated during ipgen, i.e. actual synthesis
def blackboxfunction(self):
# Insert function head describing the top level interface of the
# squeeze operator
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
# Note: Assumes stream type aliases to be set in defines
f"void {self.onnx_node.name} (",
f" InpStream &inp_{self.hls_sname()},",
f" OutStream &out_{self.hls_sname()}",
")",
]

# Generates C++ pragmas to be inserted into the main function of the C++
# simulation and the ipgen-blackboxfunction as well
def pragmas(self):
# Check whether there are already pragmas in the code generation
# dictionary
if "$PRAGMAS$" not in self.code_gen_dict:
# If not, insert an empty list to collect more pragmas
self.code_gen_dict["$PRAGMAS$"] = []

# Add HLS interface directives specifying how to create RTL ports for
# the top-level function arguments
self.code_gen_dict["$PRAGMAS$"] += [
# Connect the input and output stream with an axi stream interface
f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}",
f"#pragma HLS INTERFACE axis port=inp_{self.hls_sname()}",
# No block-level I/O protocol for the function return value
"#pragma HLS INTERFACE ap_ctrl_none port=return"
]

# Returns the names of input and output interfaces grouped by protocol
def get_verilog_top_module_intf_names(self):
# Start collecting interface names in a dictionary starting with clock
# and reset
intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa
# AXI stream input interfaces
intf_names["s_axis"] = [
(f"inp_{self.hls_sname()}", self.get_instream_width_padded(ind=0))
]
# AXI stream output interfaces
intf_names["m_axis"] = [
(f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0))
]
# No AXI-MM, AXI-Lite or protocol-less interfaces
intf_names["aximm"] = []
intf_names["axilite"] = []
intf_names["ap_none"] = []
# Return the interface name dictionary
return intf_names
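Note: the cppsim folding round trip implemented in _execute_node_cppsim above can be illustrated with plain numpy. A minimal sketch with hypothetical shapes; the real values come from get_normal_input_shape, get_folded_input_shape and get_normal_output_shape:

import numpy as np

# Hypothetical shapes: a (1, 1, 12) input with squeezable unit dimensions,
# streamed as 4 transactions of 3 elements, producing a squeezed (12,) output
normal_in, folded, normal_out = (1, 1, 12), (1, 4, 3), (12,)

inp = np.arange(np.prod(normal_in), dtype=np.float32).reshape(normal_in)
# What _execute_node_cppsim saves to inp.npy before running the simulation
sim_in = inp.reshape(folded)
# The HLS kernel only copies elements, so the folded output equals the input
sim_out = sim_in
# What is loaded back from out.npy and placed into the execution context
out = sim_out.reshape(normal_out)
assert np.array_equal(out, inp.reshape(normal_out))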