From 0f24141781f2a05df1131ecc0d6693d1990b96b3 Mon Sep 17 00:00:00 2001 From: alexcere <48130030+alexcere@users.noreply.github.com> Date: Fri, 8 Nov 2024 23:38:30 +0100 Subject: [PATCH] Start working on version with multiple inlinings --- .../cfg_block_actions/inline_function.py | 12 +- src/cfg_methods/cost_computation.py | 97 +++++++++++++ src/cfg_methods/function_inlining.py | 128 ++++++++++++++---- src/parser/cfg_block.py | 3 + src/parser/cfg_block_list.py | 37 ++++- src/parser/cfg_instruction.py | 18 +++ 6 files changed, 258 insertions(+), 37 deletions(-) create mode 100644 src/cfg_methods/cost_computation.py diff --git a/src/cfg_methods/cfg_block_actions/inline_function.py b/src/cfg_methods/cfg_block_actions/inline_function.py index 1c6b898..a61960b 100644 --- a/src/cfg_methods/cfg_block_actions/inline_function.py +++ b/src/cfg_methods/cfg_block_actions/inline_function.py @@ -18,7 +18,7 @@ class InlineFunction(BlockAction): """ def __init__(self, instr_position: int, cfg_block: CFGBlock, cfg_blocklist: CFGBlockList, - function_name: str, cfg_object: CFGObject): + cfg_function: CFGFunction): """ It receives the position in which we want to split, the corresponding block in which we are appending the corresponding block list, its block list, the function name and the block list @@ -27,10 +27,9 @@ def __init__(self, instr_position: int, cfg_block: CFGBlock, cfg_blocklist: CFGB self._instr_position: int = instr_position self._cfg_block: CFGBlock = cfg_block self._cfg_blocklist: CFGBlockList = cfg_blocklist - self._function_name: function_name_T = function_name - self._cfg_function: CFGFunction = cfg_object.functions[function_name] + self._function_name: function_name_T = cfg_function.name + self._cfg_function: CFGFunction = cfg_function self._function_blocklist: CFGBlockList = self._cfg_function.blocks - self._cfg_object: CFGObject = cfg_object self._first_sub_block: Optional[CFGObject] = None self._second_sub_block: Optional[CFGObject] = None @@ -40,7 +39,9 @@ def perform_action(self): # First we need to split the block in the function call, which is given by the instr position. # As a final check, we ensure the instruction in that position corresponds to the function name passed as # an argument - assert call_instruction.get_op_name() == self._function_name, \ + # Considering we might have duplicated the function multiple times, we just check that the original call matches + # the start of the function name + assert self._function_name.startswith(call_instruction.get_op_name()), \ f"Expected function call {self._function_name} in position {self._instr_position} but got instead" \ f"{self._cfg_block.get_instructions()}" @@ -133,7 +134,6 @@ def perform_action(self): self._function_blocklist.blocks.clear() del self._function_blocklist del self._cfg_function - self._cfg_object.functions.pop(self._function_name) @property def first_sub_block(self) -> Optional[CFGBlock]: diff --git a/src/cfg_methods/cost_computation.py b/src/cfg_methods/cost_computation.py new file mode 100644 index 0000000..d5680df --- /dev/null +++ b/src/cfg_methods/cost_computation.py @@ -0,0 +1,97 @@ +""" +Module to compute an estimation on the gas and bytes-in-size spent. Useful for determining whether a function must +be inlined or not +""" +from typing import Dict, Tuple, Set +from global_params.types import function_name_T, var_id_T, component_name_T +from parser.cfg import CFG +from parser.cfg_function import CFGFunction +from parser.cfg_block_list import CFGBlockList +from parser.cfg_block import CFGBlock +from parser.utils_parser import get_push_number_hex + +# Type of the cost we are interested on studying: gas and size costs +costs_T = Tuple[int, int] + +# Type of the dict that maps each function name to an estimation on its gas and size costs +function2costs_T = Dict[function_name_T, costs_T] + + +def compute_gas_bytes(cfg: CFG) -> Dict[component_name_T, function2costs_T]: + """ + Estimates the gas and size costs of all the function inside the CFG structure + """ + function2costs = dict() + for object_id, cfg_object in cfg.objectCFG.items(): + current_object2costs = dict() + + # We also consider the information per function + for function_name in cfg_object.functions: + compute_gas_bytes_function(function_name, cfg_object.functions, current_object2costs) + + function2costs[object_id] = current_object2costs + sub_object = cfg.get_subobject() + + if sub_object is not None: + function2costs.update(compute_gas_bytes(sub_object)) + + return function2costs + + +def compute_gas_bytes_function(function_name: function_name_T, function_dict: Dict[function_name_T, CFGFunction], + function2costs: function2costs_T) -> costs_T: + function_costs = function2costs.get(function_name, None) + if function_costs is not None: + return function_costs + + # We need to keep track of which values are introduced and consumed, as we can count how many times they must + # be duplicated. Initially, we have the elements passed as input + previously_introduced = set(function_dict[function_name].get_arguments()) + + gas_cost, size_cost = compute_gas_bytes_block_list(function_dict[function_name].blocks, function_dict, + function2costs, previously_introduced) + function2costs[function_name] = gas_cost, size_cost + return gas_cost, size_cost + + +def compute_gas_bytes_block_list(cfg_block_list: CFGBlockList, function_dict: Dict[function_name_T, CFGFunction], + function2costs: function2costs_T, previously_introduced: Set[var_id_T]) -> costs_T: + gas_cost, size_cost = 0, 0 + + for block in cfg_block_list.blocks.values(): + block_gas, block_size = compute_gas_bytes_block(block, function_dict, function2costs, previously_introduced) + gas_cost += block_gas + size_cost += block_size + return gas_cost, size_cost + + +def compute_gas_bytes_block(block: CFGBlock, function_dict: Dict[function_name_T, CFGFunction], + function2costs: function2costs_T, previously_introduced: Set[var_id_T]) -> costs_T: + gas_cost, size_cost = 0, 0 + for instruction in block.get_instructions(): + + # First we account the cost of the op name + if instruction.get_op_name() in function_dict.keys(): + gas_cost, size_cost = compute_gas_bytes_function(instruction.get_op_name(), function_dict, function2costs) + else: + gas_cost += instruction.gas_spent_op + size_cost += instruction.bytes_required + + # Then we consider that every argument must be either duplicated or pushed (for constants) + # TODO: think more carefully if we can make some assumptions + for in_value in instruction.get_in_args(): + if in_value.startswith("0x"): + # PUSH0 case + gas_cost += 2 if in_value == "0x00" else 3 + size_cost += 1 if in_value == "0x00" else (1 + get_push_number_hex(in_value)) + elif in_value in previously_introduced: + previously_introduced.remove(in_value) + else: + # Account for a DUPx + gas_cost += 3 + size_cost += 1 + + for out_value in instruction.get_out_args(): + previously_introduced.add(out_value) + + return gas_cost, size_cost diff --git a/src/cfg_methods/function_inlining.py b/src/cfg_methods/function_inlining.py index cf6b335..45ad93a 100644 --- a/src/cfg_methods/function_inlining.py +++ b/src/cfg_methods/function_inlining.py @@ -2,6 +2,7 @@ Module to perform function inlining. """ import json +from copy import deepcopy from typing import Set, Dict, Tuple, List from collections import defaultdict @@ -10,23 +11,29 @@ from global_params.types import block_id_T, component_name_T, function_name_T, block_list_id_T from parser.cfg_block import CFGBlock from parser.cfg_block_list import CFGBlockList +from parser.cfg_function import CFGFunction from parser.cfg_object import CFGObject from parser.cfg import CFG from cfg_methods.cfg_block_actions.inline_function import InlineFunction from cfg_methods.utils import union_find_search +from cfg_methods.cost_computation import function2costs_T, compute_gas_bytes # For each time a function is invoked, we store the position of the instruction (int) in the # block (blok_id_T) that appears in the block list (block_list_id) -function_call_info_T = Dict[str, List[Tuple[int, block_id_T, block_list_id_T]]] +call_info_T = Tuple[int, block_id_T, block_list_id_T] + +function2call_info_T = Dict[str, List[call_info_T]] def inline_functions(cfg: CFG) -> None: """ Inlines the functions that are invoked just in one place """ - cfg_object2modify: Dict[component_name_T, function_call_info_T] = generate_function2information(cfg) + cfg_object2modify: Dict[component_name_T, function2call_info_T] = generate_function2information(cfg) + cfg_function2costs = compute_gas_bytes(cfg) + for object_id, cfg_object in cfg.objectCFG.items(): - inline_functions_cfg_object(cfg_object, cfg_object2modify[object_id]) + inline_functions_cfg_object(cfg_object, cfg_object2modify[object_id], cfg_function2costs[object_id]) sub_object = cfg.get_subobject() if sub_object is not None: @@ -35,7 +42,7 @@ def inline_functions(cfg: CFG) -> None: # Methods to compute the invocation information -def generate_function2information(cfg: CFG) -> Dict[function_name_T, function_call_info_T]: +def generate_function2information(cfg: CFG) -> Dict[function_name_T, function2call_info_T]: """ For each cfg object, a dictionary is produced that links each function to the position, block and block list in which it is used @@ -60,7 +67,7 @@ def generate_function2information(cfg: CFG) -> Dict[function_name_T, function_ca def generate_function2blocks_block_list(cfg_block_list: CFGBlockList, function_names: Set[function_name_T], - function2blocks: function_call_info_T) -> None: + function2blocks: function2call_info_T) -> None: """ Links the function calls that appear in the block list to the exact block and the block list """ @@ -71,7 +78,8 @@ def generate_function2blocks_block_list(cfg_block_list: CFGBlockList, function_n # Methods to perform the inlining of cfg objects -def inline_functions_cfg_object(cfg_object: CFGObject, function_call_info: function_call_info_T): +def inline_functions_cfg_object(cfg_object: CFGObject, function_call_info: function2call_info_T, + function2costs: function2costs_T): # Dict that maps each initial block name in the CFG to the set of blocks in which it can be split block2current: Dict[block_id_T, List[block_id_T]] = dict() @@ -80,37 +88,49 @@ def inline_functions_cfg_object(cfg_object: CFGObject, function_call_info: funct for function_name, call_info in function_call_info.items(): + cfg_function = cfg_object.functions[function_name] # Only consider blocks for inlining that have just one invocation - if len(call_info) == 1: - instr_pos, cfg_block_name, cfg_block_list_name = call_info[0] + if len(call_info) == 1 or _must_be_inlined(function_name, call_info, function2costs, + len(cfg_function.exits)): + + for call_idx, (instr_pos, cfg_block_name, cfg_block_list_name) in enumerate(call_info): + + # First we find in which block list the function block list is stored + # As many substitutions can happen, we have to iterate recursively to find the most recent one + current_block_list_name = union_find_search(cfg_block_list_name, block_list2current) + print(current_block_list_name) + cfg_block_list = cfg_object.get_block_list(current_block_list_name) + + # Then we determine whether the function has been split + split_blocks = block2current.get(cfg_block_name, [cfg_block_name]) + + # We have to determine the corresponding index if there are multiple blocks + if len(split_blocks) > 1: + split_block_index, position_index = _determine_idx(instr_pos, split_blocks, cfg_block_list) + else: + split_block_index = 0 + position_index = instr_pos + _adjust_phi_function_idx_misalignment(cfg_block_list.blocks[split_blocks[split_block_index]]) - # First we find in which block list the function block list is stored - # As many substitutions can happen, we have to iterate recursively to find the most recent one - current_block_list_name = union_find_search(cfg_block_list_name, block_list2current) - cfg_block_list = cfg_object.get_block_list(current_block_list_name) + function_to_inline, renaming_dict = _generate_function_to_inline(cfg_function, call_idx, len(call_info)) - # Then we determine whether the function has been split - split_blocks = block2current.get(cfg_block_name, [cfg_block_name]) + inline_action = InlineFunction(position_index, cfg_block_list.blocks[split_blocks[split_block_index]], + cfg_block_list, function_to_inline) - # We have to determine the corresponding index if there are multiple blocks - if len(split_blocks) > 1: - split_block_index, position_index = _determine_idx(instr_pos, split_blocks, cfg_block_list) - else: - split_block_index = 0 - position_index = instr_pos + _adjust_phi_function_idx_misalignment(cfg_block_list.blocks[split_blocks[split_block_index]]) + inline_action.perform_action() - inline_action = InlineFunction(position_index, cfg_block_list.blocks[split_blocks[split_block_index]], - cfg_block_list, function_name, cfg_object) - inline_action.perform_action() + # Uncomment for validation + # is_correct, reason = validate_block_list_comes_from(cfg_block_list) - # Uncomment for validation - # is_correct, reason = validate_block_list_comes_from(cfg_block_list) + # Finally, we have to update the information of both the block lists and blocks + new_function_name = cfg_function.name - # Finally, we have to update the information of both the block lists and blocks - block_list2current[function_name] = current_block_list_name - block2current[cfg_block_name] = split_blocks[:split_block_index] + \ - [inline_action.first_sub_block.block_id, - inline_action.second_sub_block.block_id] + split_blocks[split_block_index+1:] + block_list2current[new_function_name] = current_block_list_name + block2current[cfg_block_name] = split_blocks[:split_block_index] + \ + [inline_action.first_sub_block.block_id, + inline_action.second_sub_block.block_id] + split_blocks[ + split_block_index + 1:] + # As we have decided to inline, we can just remove it from the list of functions + cfg_object.functions.pop(function_name) def _determine_idx(instr_idx: int, split_block_names: List[block_id_T], cfg_block_list: CFGBlockList) \ @@ -138,3 +158,51 @@ def _adjust_phi_function_idx_misalignment(block: CFGBlock) -> int: # Here we need to reassign the index considering the preceding phi functions in the block, as # we have skipped them return len([True for instr in block.get_instructions() if instr.get_op_name() == "PhiFunction"]) + + +def _must_be_inlined(function_name: function_name_T, call_info_list: List[call_info_T], function2costs: function2costs_T, + n_function_exits: int): + """ + Returns whether a function must be inlined or not, according to the call and costs info + """ + gas_cost, size_cost = function2costs[function_name] + + # "Extra costs" with no inlining: introducing two tags + 2 JUMPDEST + 1 entry jump + multiple exit jumps + no_inlining_extra_gas = (3 * 3) + 2 * 1 + 3 * (1 + n_function_exits) + + # Assuming the tags take 2 bytes + no_inlining_extra_size = (3 * 3) + 2 * 1 + (1 + n_function_exits) + + # "Extra costs" with inlining: number of bytes duplicated by number of calls + inlining_extra_size = size_cost * (len(call_info_list) - 1) + + # Decision for whether a function must be inlined or not + + # Heuristics: 20 bytes = 1 gas + # TODO: devise good heuristics for inlining + return (inlining_extra_size - no_inlining_extra_size) <= 20 * no_inlining_extra_gas + + +def _generate_function_to_inline(original_function: CFGFunction, current_call_idx: int, + n_calls: int) -> Tuple[CFGFunction, Dict[block_id_T, block_id_T]]: + """ + We must rename the blocks when inlining to avoid conflicts, as the function can be inlined multiple times in the + same function (and hence, the same blocks would appear multiple times). We also return the renaming dict + """ + # If there is just one call, we avoid renaming the blocks + if n_calls == 1: + return original_function, dict() + # If we are making multiple copies, we copy it call_idx - 1 times, as the last one should remove it + elif current_call_idx == n_calls - 1: + copied_function = original_function + else: + copied_function = deepcopy(original_function) + + # We have to modify the block list inside the copied function first + block_list = copied_function.blocks + renaming_dict = {block_name: f"{block_name}_copy_{current_call_idx}" for block_name in block_list.blocks} + block_list.rename_blocks(renaming_dict) + + copied_function.exits = [renaming_dict.get(exit_id, exit_id) for exit_id in copied_function.exits] + copied_function.name = f"{copied_function.name}_copy_{current_call_idx}" + return copied_function, renaming_dict diff --git a/src/parser/cfg_block.py b/src/parser/cfg_block.py index 721532a..7ac5ef7 100644 --- a/src/parser/cfg_block.py +++ b/src/parser/cfg_block.py @@ -132,6 +132,9 @@ def set_condition(self, cond: var_id_T) -> None: def get_block_id(self) -> str: return self.block_id + def set_block_id(self, value: var_id_T) -> None: + self.block_id = value + def get_instructions(self) -> List[CFGInstruction]: return self._instructions diff --git a/src/parser/cfg_block_list.py b/src/parser/cfg_block_list.py index 1816baf..e4eb367 100644 --- a/src/parser/cfg_block_list.py +++ b/src/parser/cfg_block_list.py @@ -56,7 +56,7 @@ def add_block(self, block: CFGBlock, is_start_block: bool = False) -> None: self._function_return_blocks.append(block_id) if block_id in self.blocks: - logging.warning("You are overwritting an existing block") + logging.warning(f"You are overwritting an existing block: {block_id}") self.graph = None self.blocks[block_id] = block @@ -81,6 +81,23 @@ def remove_block(self, block_id: block_id_T) -> None: self._function_return_blocks = [return_block for return_block in self._function_return_blocks if return_block != block_id] + def rename_blocks(self, renaming_dict: Dict[block_id_T, block_id_T]): + new_block_dict = dict() + for old_block_id, block in self.blocks.items(): + new_block_id = renaming_dict.get(old_block_id, old_block_id) + block.set_block_id(new_block_id) + new_block_dict[new_block_id] = block + + self.blocks = new_block_dict + + self._terminal_blocks = [renaming_dict.get(terminal_block, terminal_block) + for terminal_block in self._terminal_blocks] + + self._function_return_blocks = [renaming_dict.get(return_block, return_block) + for return_block in self._function_return_blocks] + + self.start_block = renaming_dict.get(self.start_block, self.start_block) + def get_blocks_dict(self): return self.blocks @@ -124,6 +141,24 @@ def to_graph(self) -> networkx.DiGraph: return graph return self.graph + def to_graph_info(self) -> networkx.DiGraph: + """ + Creates a networkx.DiGraph from the blocks information. Useful for debugging + """ + graph = networkx.DiGraph() + graph.add_nodes_from(self.blocks.keys()) + for block_id, block in self.blocks.items(): + for successor in [block.get_jump_to(), block.get_falls_to()]: + if successor is not None: + graph.add_edge(block_id, successor) + + relabel_dict = {block_name: '\n'.join([block.get_block_id(), *[instr.dot_repr() for instr in block.get_instructions()]]) + for block_name, block in self.blocks.items()} + renamed_digraph = networkx.relabel_nodes(graph, relabel_dict) + + return renamed_digraph + + def to_graph_comes_from(self) -> networkx.DiGraph: """ Creates a networkx.DiGraph from the comes_from diff --git a/src/parser/cfg_instruction.py b/src/parser/cfg_instruction.py index 0a5285b..562a152 100644 --- a/src/parser/cfg_instruction.py +++ b/src/parser/cfg_instruction.py @@ -341,6 +341,24 @@ def get_instruction_representation(self): def dot_repr(self): return self.get_instruction_representation() + @property + def gas_spent_op(self) -> int: + """ + Gas spent for performing the operation. Does not consider the cost needed to generate the args + """ + if self.op == "PhiFunction" or self.op == "FunctionReturn": + return 0 + return opcodes.get_ins_cost(self.op) + + @property + def bytes_required(self) -> int: + """ + Bytes required for performing the operation. Does not consider the cost needed to generate the args (hence, 1) + """ + if self.op == "PhiFunction" or self.op == "FunctionReturn": + return 0 + return 1 + def __repr__(self): return json.dumps(self.get_as_json())