diff --git a/Makefile b/Makefile index 91ac643..c3e6c75 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ SHELL := /bin/bash PROJECT := graphsense-lib VENV := venv -RELEASE := 'v24.02rc1' -RELEASESEM := 'v2.2rc1' +RELEASE := 'v24.02rc2' +RELEASESEM := 'v2.2rc2' all: format lint test build diff --git a/setup.cfg b/setup.cfg index 55195ed..e6070d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,9 +77,11 @@ install_requires = graphsense-bitcoin-etl==1.5.4 base58~=2.1 typeguard~=4.1 + diskcache~=5.6 grpcio + [options.packages.find] where = src exclude = diff --git a/src/graphsenselib/config/config.py b/src/graphsenselib/config/config.py index bc04245..3c3b8aa 100644 --- a/src/graphsenselib/config/config.py +++ b/src/graphsenselib/config/config.py @@ -1,3 +1,4 @@ +import logging import math import os from typing import Dict, List, Optional @@ -73,6 +74,11 @@ class KeyspaceSetupConfig(BaseModel): data_configuration: Dict[str, object] = Field(default_factory=lambda: {}) +class DeltaUpdaterConfig(BaseModel): + fs_cache: Optional[FileSink] + currency: str + + class KeyspaceConfig(BaseModel): raw_keyspace_name: str transformed_keyspace_name: str @@ -230,5 +236,15 @@ def get_slack_hooks_by_topic(self, topic: str) -> Optional[SlackTopic]: def get_keyspace_config(self, env: str, currency: str) -> KeyspaceConfig: return self.get_environment(env).get_keyspace(currency) + def get_deltaupdater_config(self, env: str, currency: str) -> DeltaUpdaterConfig: + fs_cache = ( + self.get_environment(env) + .keyspaces[currency] + .ingest_config.raw_keyspace_file_sinks.get("fs-cache") + ) + if fs_cache is None: + logging.debug(f"fs-cache not configured for {currency} in {env}") + return DeltaUpdaterConfig(fs_cache=fs_cache, currency=currency) + config = AppConfig(load=False) diff --git a/src/graphsenselib/datatypes/__init__.py b/src/graphsenselib/datatypes/__init__.py index facc525..2e677ca 100644 --- a/src/graphsenselib/datatypes/__init__.py +++ b/src/graphsenselib/datatypes/__init__.py @@ -2,3 +2,4 @@ from .address import AddressAccount, AddressUtxo from .common import * from .errors import * +from .transactionhash import TransactionHashAccount, TransactionHashUtxo diff --git a/src/graphsenselib/datatypes/address.py b/src/graphsenselib/datatypes/address.py index 340afe3..4b75e81 100644 --- a/src/graphsenselib/datatypes/address.py +++ b/src/graphsenselib/datatypes/address.py @@ -1,6 +1,6 @@ from typing import Union -from ..utils import hex_to_bytearray +from ..utils import hex_to_bytes class AddressUtxo: @@ -61,7 +61,7 @@ def __init__(self, adr: Union[str, bytearray], config): """ self.prefix_length = int(config.address_prefix_length) if type(adr) == str: - self.address_bytes = hex_to_bytearray(adr) + self.address_bytes = hex_to_bytes(adr) elif type(adr) == bytearray: self.address_bytes = adr elif type(adr) == bytes: @@ -94,3 +94,8 @@ def prefix(self) -> str: @property def bytearray(self) -> bytearray: # noqa return self.address_bytes + + +class AddressAccountTrx: + def __init__(self, adr: Union[str, bytearray], config): + raise NotImplementedError("AddressAccountTrx not implemented yet") diff --git a/src/graphsenselib/datatypes/transactionhash.py b/src/graphsenselib/datatypes/transactionhash.py new file mode 100644 index 0000000..0b04942 --- /dev/null +++ b/src/graphsenselib/datatypes/transactionhash.py @@ -0,0 +1,62 @@ +""" +Transaction Class. Functionality depending on the address type. 
+""" + +from typing import Union + +from ..utils import hex_to_bytes + + +class TransactionHashUtxo: + def __init__(self, txhash: Union[str, bytearray], config): + raise NotImplementedError("TransactionUtxo not implemented yet") + + +class TransactionHashAccount: + def __init__(self, txhash: Union[str, bytearray], config): + """Init a transaction instance. + + Args: + txhash (Union[str, bytearray]): transaction hash + config (ConfigRow): entry from the config table in the transformed keyspace + + Raises: + Exception: Description + ValueError: Description + """ + self.prefix_length = int(config.tx_prefix_length) + if type(txhash) == str: + self.tx_hash_bytes = hex_to_bytes(txhash) + elif type(txhash) == bytearray: + self.tx_hash_bytes = txhash + elif type(txhash) == bytes: + self.tx_hash_bytes = bytearray(txhash) + else: + raise Exception(f"Unknown type for txhash type: {type(txhash)}") + + # todo potentially different length for tron, also saved in bytearray though + if len(self.tx_hash_bytes) != 32: + raise ValueError( + f"Address is not the right length {len(self.tx_hash_bytes)}" + ) + + @property + def hex(self) -> str: # noqa + return self.tx_hash_bytes.hex() + + @property + def db_encoding(self) -> str: + return self.bytearray + # return f"0x{self.hex}" + + @property + def db_encoding_query(self) -> str: + return f"0x{self.hex}" + + @property + def prefix(self) -> str: + return self.hex.upper()[: self.prefix_length] + + @property + def bytearray(self) -> bytearray: # noqa + return self.tx_hash_bytes diff --git a/src/graphsenselib/db/account.py b/src/graphsenselib/db/account.py index f8c368b..6f8efb8 100644 --- a/src/graphsenselib/db/account.py +++ b/src/graphsenselib/db/account.py @@ -7,10 +7,17 @@ class TransformedDbAccount(TransformedDb): def get_highest_cluster_id(self, sanity_check=True) -> Optional[int]: return None + def get_highest_transaction_id(self): + return self._get_hightest_id( + table="transaction_ids_by_transaction_id_group", + sanity_check=True, + id_col="transaction_id", + ) + class RawDbAccount(RawDb): def get_logs_in_block(self, block: int, topic0=None, contract=None) -> Iterable: - group = block // self.get_block_bucket_size() + group = self.get_id_group(block, self.get_block_bucket_size()) if topic0 is None: data = self.select_safe( "log", where={"block_id": block, "block_id_group": group} @@ -25,13 +32,27 @@ def get_logs_in_block(self, block: int, topic0=None, contract=None) -> Iterable: return data def get_transaction_ids_in_block(self, block: int) -> Iterable: - raise Exception("Not yet implemented.") + raise NotImplementedError def get_transactions_in_block(self, block: int) -> Iterable: - raise Exception("Not yet implemented.") + result = self.select( + "transaction", + where={"block_id": block}, + ) + return result + + def get_traces_in_block(self, block: int) -> Iterable: + block_bucket_size = self.get_block_bucket_size() + group = self.get_id_group(block, block_bucket_size) + + results = self.select( + "trace", where={"block_id_group": group, "block_id": block} + ) + + return results def get_addresses_in_block(self, block: int) -> Iterable: - group = block // self.get_block_bucket_size() + group = self.get_id_group(block, self.get_block_bucket_size()) # The fetch size is needed since traces currenly contain a lot of null values # the null values create tombestones and cassandra refuses to read more than @@ -50,7 +71,7 @@ def get_addresses_in_block(self, block: int) -> Iterable: class RawDbAccountTrx(RawDbAccount): def get_addresses_in_block(self, block: 
int) -> Iterable: - group = block // self.get_block_bucket_size() + group = self.get_id_group(block, self.get_block_bucket_size()) # The fetch size is needed since traces currenly contain a lot of null values # the null values create tombestones and cassandra refuses to read more than diff --git a/src/graphsenselib/db/analytics.py b/src/graphsenselib/db/analytics.py index bab66b5..b2890fd 100644 --- a/src/graphsenselib/db/analytics.py +++ b/src/graphsenselib/db/analytics.py @@ -17,7 +17,8 @@ from ..config import keyspace_types from ..datatypes import DbChangeType -from ..utils import GenericArrayFacade, binary_search, parse_timestamp +from ..utils import GenericArrayFacade, parse_timestamp # binary_search, +from ..utils.account import get_id_group, get_id_group_with_secondary_relations from .cassandra import ( CassandraDb, build_create_stmt, @@ -27,6 +28,7 @@ build_truncate_stmt, ) +CONCURRENCY = 2000 DATE_FORMAT = "%Y-%m-%d" logger = logging.getLogger(__name__) @@ -72,10 +74,11 @@ def get_cql_statement(self, keyspace): class KeyspaceConfig: - def __init__(self, keyspace_name, db_type, address_type): + def __init__(self, keyspace_name, db_type, address_type, tx_hash_type): self._keyspace_name = keyspace_name self._db_type = db_type self._address_type = address_type + self._tx_hash_type = tx_hash_type @property def keyspace_name(self): @@ -89,6 +92,10 @@ def db_type(self): def address_type(self): return self._address_type + @property + def tx_hash_type(self): + return self._tx_hash_type + class WithinKeyspace: def select_stmt( @@ -212,10 +219,15 @@ def select_async_safe( fetch_size=fetch_size, ) - def _get_hightest_id(self, table="block", sanity_check=True) -> Optional[int]: + def _get_hightest_id( + self, table="block", sanity_check=True, id_col=None + ) -> Optional[int]: """Return last ingested address ID from a table.""" - group_id_col = f"{table}_id_group" - id_col = f"{table}_id" + + if id_col is None: + id_col = f"{table}_id" + + group_id_col = f"{id_col}_group" result = self.select(table=table, columns=[group_id_col], per_partition_limit=1) groups = [getattr(row, group_id_col) for row in result.current_rows] @@ -243,12 +255,20 @@ def _ensure_is_highest_id( ) -> bool: if query_id is None: return None + groups = self._get_bucket_divisors_by_table_name() + if table in groups: # this case handles tables with group ids. - group_id_col = f"{table}_id_group" + group_id_col = f"{id_col}_group" bucket_divisor = groups[table] - w = {group_id_col: (query_id + 1) // bucket_divisor, id_col: query_id + 1} + highest_plus_one = query_id + 1 + + id_group_highest_plus_one = self.get_id_group( + highest_plus_one, bucket_divisor + ) + + w = {group_id_col: id_group_highest_plus_one, id_col: highest_plus_one} else: # this case handles tables with no group column and increasing integer ids. 
w = {id_col: query_id + 1} @@ -270,6 +290,9 @@ def _at_most_one_result(self, result): def _get_only_row_from_table(self, table: str = "configuration"): return self._at_most_one_result(self.select(table, limit=2)) + def get_id_group(self, id_, bucket_size): + return get_id_group(id_, bucket_size) + class DbWriterMixin: @@ -356,6 +379,13 @@ def ingest( ) +def get_last_notnone(result, start, end): + for i in range(end, start, -1): + if result[i] is not None: + return i + return -1 + + class RawDb(ABC, WithinKeyspace, DbReaderMixin, DbWriterMixin): def __init__(self, keyspace_config: KeyspaceConfig, db: CassandraDb): self._keyspace_config = keyspace_config @@ -392,7 +422,9 @@ def get_item(date, index): get_item_date = partial(get_item, date) - r = binary_search(GenericArrayFacade(get_item_date), 1, lo=start, hi=hb) + # r = binary_search(GenericArrayFacade(get_item_date), 1, lo=start, hi=hb) + # todo only for testing, remove in production + r = get_last_notnone(GenericArrayFacade(get_item_date), start, hb) if r == -1: # minus one could mean two things, either @@ -432,7 +464,11 @@ def get_item(index): batch = self.get_exchange_rates_for_block_batch([index]) return 0 if has_er_value(batch) else 1 - r = binary_search(GenericArrayFacade(get_item), 1, lo=start, hi=hb) + # r = binary_search(GenericArrayFacade(get_item), 1, lo=start, hi=hb) + # todo only for testing, remove in production + r = get_last_notnone(GenericArrayFacade(get_item), start, hb) + r += 1 + # binary search should work again as soon as we have enough blocks ingested if r == -1: # minus one could mean two things, either @@ -524,7 +560,7 @@ def get_block_timestamps_batch(self, blocks: list[int]): where={"block_id_group": "?", "block_id": "?"}, limit=1, ) - parameters = [(b, [b // bucket_size, b]) for b in blocks] + parameters = [(b, [self.get_id_group(b, bucket_size), b]) for b in blocks] results = self._db.execute_batch(stmt, parameters) return { a: (parse_timestamp(row.current_rows[0].timestamp)) @@ -580,9 +616,11 @@ def __init__(self, keyspace_config: KeyspaceConfig, db: CassandraDb): self._db_config = None def _get_bucket_divisors_by_table_name(self) -> dict: + bucket_size = self.get_address_id_bucket_size() return { - "address": self.get_address_id_bucket_size(), + "address": bucket_size, "cluster": self.get_cluster_id_bucket_size(), + "transaction_ids_by_transaction_id_group": bucket_size, } def exists(self) -> bool: @@ -603,6 +641,18 @@ def get_address_id_bucket_size(self) -> Optional[int]: config = self.get_configuration() return int(config.bucket_size) if config is not None else None + def get_block_id_bucket_size(self) -> Optional[int]: + config = self.get_configuration() + return int(config.bucket_size) if config is not None else None + + def get_address_transactions_id_bucket_size(self) -> Optional[int]: + config = self.get_configuration() + return int(config.block_bucket_size_address_txs) if config is not None else None + + def get_addressrelations_ids_nbuckets(self) -> Optional[int]: + config = self.get_configuration() + return int(config.addressrelations_ids_nbuckets) if config is not None else None + def get_cluster_id_bucket_size(self) -> Optional[int]: return self.get_address_id_bucket_size() @@ -620,6 +670,9 @@ def get_highest_address_id(self, sanity_check=True) -> Optional[int]: ha = self._get_hightest_id(table="address", sanity_check=sanity_check) return max(ha or 0, du.highest_address_id) if du is not None else ha + def get_highest_transaction_id(self): + return None + @abstractmethod def 
get_highest_cluster_id(self, sanity_check=True) -> Optional[int]: raise Exception("Must be implemented in chain specific child class") @@ -680,6 +733,10 @@ def to_db_address(self, address): Address = self._keyspace_config.address_type return Address(address, self.get_configuration()) + def to_db_tx_hash(self, tx_hash): + TxHash = self._keyspace_config.tx_hash_type + return TxHash(tx_hash, self.get_configuration()) + @lru_cache(maxsize=1_000_000) def knows_address(self, address: Union[str, bytearray]) -> bool: """Checks if address is in transformed keyspace. @@ -753,7 +810,8 @@ def get_address_id_async_batch(self, addresses: List[str]): ] return zip( - addresses, self._db.execute_statements_async(bstmts, concurrency=2000) + addresses, + self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY), ) def get_address_id_async(self, address: str): @@ -779,7 +837,7 @@ def get_address_async_batch(self, address_ids: List[int]): bstmts = [ prep.bind( { - "address_id_group": addr_id // bs, + "address_id_group": self.get_id_group(addr_id, bs), "address_id": addr_id, } ) @@ -787,11 +845,12 @@ def get_address_async_batch(self, address_ids: List[int]): ] return zip( - address_ids, self._db.execute_statements_async(bstmts, concurrency=2000) + address_ids, + self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY), ) def get_address_async(self, address_id: int): - bucket = address_id // self.get_address_id_bucket_size() + bucket = self.get_id_group(address_id, self.get_address_id_bucket_size()) w = {"address_id_group": bucket, "address_id": f"{address_id}"} return self.select_async( "address", @@ -817,8 +876,9 @@ def get_address_incoming_relations_async_batch( bstmts = [ prep.bind( { - "dst_address_id_group": dst_address - // self.get_address_id_bucket_size(), + "dst_address_id_group": self.get_id_group( + dst_address, self.get_address_id_bucket_size() + ), "dst_address_id": dst_address, "src_address_id": src_address, } @@ -826,13 +886,55 @@ def get_address_incoming_relations_async_batch( for dst_address, src_address in rel_ids ] - return self._db.execute_statements_async(bstmts, concurrency=2000) + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) + + def get_address_inrelations_async_batch_account( + self, rel_ids: List[Tuple[int, int]] + ): + stmt = self.select_stmt( + "address_incoming_relations", + columns=["*"], + where={ + k: "?" 
+ for k in [ + "dst_address_id_group", + "dst_address_id_secondary_group", + "dst_address_id", + "src_address_id", + ] + }, + limit=1, + ) + prep = self._db.get_prepared_statement(stmt) + + bucketsize = self.get_address_id_bucket_size() + relations_nbuckets = self.get_addressrelations_ids_nbuckets() + + bstmts = [] + for dst_address, src_address in rel_ids: + address_group, secondary_group = get_id_group_with_secondary_relations( + dst_address, src_address, bucketsize, relations_nbuckets + ) + bstmts.append( + prep.bind( + { + "dst_address_id_group": address_group, + "dst_address_id_secondary_group": secondary_group, + "dst_address_id": dst_address, + "src_address_id": src_address, + } + ) + ) + + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) def get_address_incoming_relations_async( self, address_id: int, src_address_id: Optional[int] ): w = { - "dst_address_id_group": address_id // self.get_address_id_bucket_size(), + "dst_address_id_group": self.get_id_group( + address_id, self.get_address_id_bucket_size() + ), "dst_address_id": address_id, } if src_address_id is not None: @@ -844,6 +946,29 @@ def get_address_incoming_relations_async( limit=1, ) + def get_max_secondary_ids_async( + self, address_id_groups: List[int], tablename: str, id_group_col: str + ): + stmt = self.select_stmt( + tablename, # address_transactions_secondary_ids + columns=["*"], + where={ + k: "?" + for k in [ + id_group_col, + ] + }, + limit=1, + ) + prep = self._db.get_prepared_statement(stmt) + + bstmts = [ + prep.bind({id_group_col: address_id_group}) + for address_id_group in address_id_groups + ] + + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) + def get_address_outgoing_relations_async_batch( self, rel_ids: List[Tuple[int, int]] ): @@ -861,8 +986,9 @@ def get_address_outgoing_relations_async_batch( bstmts = [ prep.bind( { - "src_address_id_group": src_address - // self.get_address_id_bucket_size(), + "src_address_id_group": self.get_id_group( + src_address, self.get_address_id_bucket_size() + ), "src_address_id": src_address, "dst_address_id": dst_address, } @@ -870,13 +996,76 @@ def get_address_outgoing_relations_async_batch( for src_address, dst_address in rel_ids ] - return self._db.execute_statements_async(bstmts, concurrency=2000) + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) + + def get_address_outrelations_async_batch_account( + self, rel_ids: List[Tuple[int, int]] + ): + stmt = self.select_stmt( + "address_outgoing_relations", + columns=["*"], + where={ + k: "?" + for k in [ + "src_address_id_group", + "src_address_id_secondary_group", + "src_address_id", + "dst_address_id", + ] + }, + limit=1, + ) + prep = self._db.get_prepared_statement(stmt) + bucketsize = self.get_address_id_bucket_size() + relations_nbuckets = self.get_addressrelations_ids_nbuckets() + + bstmts = [] + for src_address, dst_address in rel_ids: + address_group, secondary_group = get_id_group_with_secondary_relations( + src_address, dst_address, bucketsize, relations_nbuckets + ) + bstmts.append( + prep.bind( + { + "src_address_id_group": address_group, + "src_address_id_secondary_group": secondary_group, + "src_address_id": src_address, + "dst_address_id": dst_address, + } + ) + ) + + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) + + def get_balance_async_batch_account(self, address_ids: List[id]): + stmt = self.select_stmt( + "balance", + columns=["*"], + where={k: "?" 
for k in ["address_id_group", "address_id"]}, + ) + prep = self._db.get_prepared_statement(stmt) + + bstmts = [ + prep.bind( + { + "address_id_group": self.get_id_group( + address_id, self.get_address_id_bucket_size() + ), + "address_id": address_id, + } + ) + for address_id in address_ids + ] + + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) def get_address_outgoing_relations_async( self, address_id: int, dst_address_id: Optional[int] ): w = { - "src_address_id_group": address_id // self.get_address_id_bucket_size(), + "src_address_id_group": self.get_id_group( + address_id, self.get_address_id_bucket_size() + ), "src_address_id": address_id, } if dst_address_id is not None: @@ -901,7 +1090,7 @@ def get_cluster_async_batch(self, cluster_ids: List[int]): bstmts = [ prep.bind( { - "cluster_id_group": clstr_id // bs, + "cluster_id_group": self.get_id_group(clstr_id, bs), "cluster_id": clstr_id, } ) @@ -909,11 +1098,12 @@ def get_cluster_async_batch(self, cluster_ids: List[int]): ] return zip( - cluster_ids, self._db.execute_statements_async(bstmts, concurrency=2000) + cluster_ids, + self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY), ) def get_cluster_async(self, cluster_id: int): - bucket = cluster_id // self.get_cluster_id_bucket_size() + bucket = self.get_id_group(cluster_id, self.get_cluster_id_bucket_size()) w = {"cluster_id_group": bucket, "cluster_id": f"{cluster_id}"} return self.select_async( "cluster", @@ -939,8 +1129,9 @@ def get_cluster_incoming_relations_async_batch( bstmts = [ prep.bind( { - "dst_cluster_id_group": dst_address - // self.get_address_id_bucket_size(), + "dst_cluster_id_group": self.get_id_group( + dst_address, self.get_address_id_bucket_size() + ), "dst_cluster_id": dst_address, "src_cluster_id": src_address, } @@ -948,13 +1139,15 @@ def get_cluster_incoming_relations_async_batch( for dst_address, src_address in rel_ids ] - return self._db.execute_statements_async(bstmts, concurrency=2000) + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) def get_cluster_incoming_relations_async( self, cluster_id: int, src_cluster_id: Optional[int] ): w = { - "dst_cluster_id_group": cluster_id // self.get_cluster_id_bucket_size(), + "dst_cluster_id_group": self.get_id_group( + cluster_id, self.get_cluster_id_bucket_size() + ), "dst_cluster_id": cluster_id, } if src_cluster_id is not None: @@ -983,8 +1176,9 @@ def get_cluster_outgoing_relations_async_batch( bstmts = [ prep.bind( { - "src_cluster_id_group": src_address - // self.get_address_id_bucket_size(), + "src_cluster_id_group": self.get_id_group( + src_address, self.get_address_id_bucket_size() + ), "src_cluster_id": src_address, "dst_cluster_id": dst_address, } @@ -992,13 +1186,15 @@ def get_cluster_outgoing_relations_async_batch( for src_address, dst_address in rel_ids ] - return self._db.execute_statements_async(bstmts, concurrency=2000) + return self._db.execute_statements_async(bstmts, concurrency=CONCURRENCY) def get_cluster_outgoing_relations_async( self, cluster_id: int, dst_cluster_id: Optional[int] ): w = { - "src_cluster_id_group": cluster_id // self.get_cluster_id_bucket_size(), + "src_cluster_id_group": self.get_id_group( + cluster_id, self.get_cluster_id_bucket_size() + ), "src_cluster_id": cluster_id, } if dst_cluster_id is not None: diff --git a/src/graphsenselib/db/factory.py b/src/graphsenselib/db/factory.py index e705a40..3e5ca49 100644 --- a/src/graphsenselib/db/factory.py +++ b/src/graphsenselib/db/factory.py @@ -1,7 +1,12 @@ from 
collections import namedtuple from ..config import config, schema_types -from ..datatypes import AddressAccount, AddressUtxo +from ..datatypes import ( + AddressAccount, + AddressUtxo, + TransactionHashAccount, + TransactionHashUtxo, +) from .account import RawDbAccount, RawDbAccountTrx, TransformedDbAccount from .analytics import AnalyticsDb from .analytics import KeyspaceConfig as KeyspaceConfigDB @@ -9,7 +14,8 @@ from .utxo import RawDbUtxo, TransformedDbUtxo DbTypeStrategy = namedtuple( - "DatabaseStrategy", ["raw_db_type", "transformed_db_type", "address_type"] + "DatabaseStrategy", + ["raw_db_type", "transformed_db_type", "address_type", "transaction_type"], ) @@ -18,11 +24,20 @@ def get_db_types_by_schema_type(schema_type) -> DbTypeStrategy: raise ValueError(f"{schema_type} not yet defined.") if schema_type == "utxo": - return DbTypeStrategy(RawDbUtxo, TransformedDbUtxo, AddressUtxo) + return DbTypeStrategy( + RawDbUtxo, TransformedDbUtxo, AddressUtxo, TransactionHashUtxo + ) elif schema_type == "account": - return DbTypeStrategy(RawDbAccount, TransformedDbAccount, AddressAccount) + return DbTypeStrategy( + RawDbAccount, TransformedDbAccount, AddressAccount, TransactionHashAccount + ) elif schema_type == "account_trx": - return DbTypeStrategy(RawDbAccountTrx, TransformedDbAccount, AddressAccount) + return DbTypeStrategy( + RawDbAccountTrx, + TransformedDbAccount, + AddressAccount, + TransactionHashAccount, + ) else: raise ValueError(f"{schema_type} not yet supported.") @@ -44,12 +59,16 @@ def from_name( db_types = get_db_types_by_schema_type(schema_type) return AnalyticsDb( raw=KeyspaceConfigDB( - raw_keyspace_name, db_types.raw_db_type, db_types.address_type + raw_keyspace_name, + db_types.raw_db_type, + db_types.address_type, + db_types.transaction_type, ), transformed=KeyspaceConfigDB( transformed_keyspace_name, db_types.transformed_db_type, db_types.address_type, + db_types.transaction_type, ), db=CassandraDb(cassandra_nodes), ) diff --git a/src/graphsenselib/db/utxo.py b/src/graphsenselib/db/utxo.py index 19b2132..0372158 100644 --- a/src/graphsenselib/db/utxo.py +++ b/src/graphsenselib/db/utxo.py @@ -1,6 +1,6 @@ from typing import Iterable, Optional -from ..utils import flatten, hex_to_bytearray +from ..utils import flatten, hex_to_bytes from ..utils.utxo import SlimTx, get_slim_tx_from_transaction from .analytics import RawDb, TransformedDb @@ -15,7 +15,7 @@ def get_highest_cluster_id(self, sanity_check=True) -> Optional[int]: class RawDbUtxo(RawDb): def get_transaction_ids_in_block(self, block: int) -> Iterable: block_bucket_size = self.get_block_bucket_size() - group = block // block_bucket_size + group = self.get_id_group(block, block_bucket_size) result = self.select( "block_transactions", columns=["txs"], @@ -34,7 +34,10 @@ def get_transactions_in_block(self, block: int) -> Iterable: limit=1, ) - parameters = [(tx_id, [tx_id // tx_bucket_size, tx_id]) for tx_id in tx_ids] + parameters = [ + (tx_id, [self.get_id_group(tx_id, tx_bucket_size), tx_id]) + for tx_id in tx_ids + ] results = self._db.execute_batch_async(stmt, parameters) return [tx.one() for tx_id, tx in self._db.await_batch(results)] @@ -43,7 +46,7 @@ def get_addresses_in_block(self, block: int) -> Iterable[SlimTx]: tx_bucket_size = self.get_tx_bucket_size() addresses = [] for tx_id in tx_ids: - group = tx_id // tx_bucket_size + group = self.get_id_group(tx_id, tx_bucket_size) result = self.select_one( "transaction", columns=["inputs", "outputs", "block_id", "tx_hash", "timestamp"], @@ -67,7 +70,7 @@ def 
get_tx_outputs( columns=["tx_id"], where={ "tx_prefix": f"{tx_hash[:tx_prefix_length]}", - "tx_hash": hex_to_bytearray(tx_hash), + "tx_hash": hex_to_bytes(tx_hash), }, ) if tx_id_record: @@ -75,7 +78,10 @@ def get_tx_outputs( result = self.select_one_safe( "transaction", columns=["outputs"], - where={"tx_id_group": tx_id // tx_bucket_size, "tx_id": tx_id}, + where={ + "tx_id_group": self.get_id_group(tx_id, tx_bucket_size), + "tx_id": tx_id, + }, ) res = {} @@ -95,7 +101,10 @@ def get_latest_tx_id_before_block(self, block_id: int) -> Optional[int]: block = self.select_one_safe( "block_transactions", - where={"block_id_group": last_block // bucket_size, "block_id": last_block}, + where={ + "block_id_group": self.get_id_group(last_block, bucket_size), + "block_id": last_block, + }, ) latest_tx_id = -1 diff --git a/src/graphsenselib/deltaupdate/deltaupdater.py b/src/graphsenselib/deltaupdate/deltaupdater.py index 2044407..3fbd7cf 100644 --- a/src/graphsenselib/deltaupdate/deltaupdater.py +++ b/src/graphsenselib/deltaupdate/deltaupdater.py @@ -145,6 +145,8 @@ def update_transformed( logger.info(f"Got shutdown signal stopping at block {b[-1]}") return b[-1] + updater.clear_cache() + return end_block @@ -213,11 +215,12 @@ def update( ) db.transformed.ingest("configuration", [config_defaults]) + du_config = config.get_deltaupdater_config(env, currency) update_transformed( start_block, end_block, UpdaterFactory().get_updater( - currency, + du_config, db, updater_version, write_new, @@ -229,6 +232,7 @@ def update( ), batch_size=write_batch_size, ) + elif end_block == start_block or start_block - 1 == end_block: logger.info("Nothing to do. Data is up to date.") else: diff --git a/src/graphsenselib/deltaupdate/update/abstractupdater.py b/src/graphsenselib/deltaupdate/update/abstractupdater.py index 46e2199..2bccf91 100644 --- a/src/graphsenselib/deltaupdate/update/abstractupdater.py +++ b/src/graphsenselib/deltaupdate/update/abstractupdater.py @@ -125,15 +125,22 @@ def process_batch(self, batch: Iterable[int]): def persist_updater_progress(self): pass + @abstractmethod + def clear_cache(self): + pass + class UpdateStrategy(AbstractUpdateStrategy): - def __init__(self, db, currency, forward_fill_rates=False): + def __init__( + self, db: AnalyticsDb, currency: str, forward_fill_rates: bool = False + ): super().__init__() self._db = db self._currency = currency self._batch_start_time = None self._nr_new_addresses = 0 self._nr_new_clusters = 0 + self._nr_new_transactions = 0 self._highest_address_id = db.transformed.get_highest_address_id() or 0 self._highest_cluster_id = db.transformed.get_highest_cluster_id() or 1 self.forward_fill_rates = forward_fill_rates diff --git a/src/graphsenselib/deltaupdate/update/account/__init__.py b/src/graphsenselib/deltaupdate/update/account/__init__.py new file mode 100644 index 0000000..cc5042a --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/account/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa: F401 +from .update import UpdateStrategyAccount diff --git a/src/graphsenselib/deltaupdate/update/account.py b/src/graphsenselib/deltaupdate/update/account/accountlegacy.py similarity index 96% rename from src/graphsenselib/deltaupdate/update/account.py rename to src/graphsenselib/deltaupdate/update/account/accountlegacy.py index b006114..12c0f3c 100644 --- a/src/graphsenselib/deltaupdate/update/account.py +++ b/src/graphsenselib/deltaupdate/update/account/accountlegacy.py @@ -1,6 +1,6 @@ import logging -from .abstractupdater import ( +from 
graphsenselib.deltaupdate.update.abstractupdater import ( TABLE_NAME_DIRTY, TABLE_NAME_NEW, LegacyUpdateStrategy, @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -class UpdateStrategyAccount(LegacyUpdateStrategy): +class UpdateStrategyAccountLegacy(LegacyUpdateStrategy): def __init__(self, db, currency, write_new, write_dirty, forward_fill_rates=False): super().__init__( db, currency, write_new, write_dirty, forward_fill_rates=forward_fill_rates diff --git a/src/graphsenselib/deltaupdate/update/account/createchanges.py b/src/graphsenselib/deltaupdate/update/account/createchanges.py new file mode 100644 index 0000000..46985f5 --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/account/createchanges.py @@ -0,0 +1,453 @@ +import logging +from collections import defaultdict +from datetime import datetime +from typing import Any, Callable, Dict, List, NamedTuple, Tuple + +from graphsenselib.db import DbChange +from graphsenselib.deltaupdate.update.abstractupdater import TABLE_NAME_DELTA_HISTORY +from graphsenselib.deltaupdate.update.account.modelsdelta import ( + BalanceDelta, + EntityDeltaAccount, + RawEntityTxAccount, + RelationDeltaAccount, +) +from graphsenselib.deltaupdate.update.generic import DeltaValue, Tx +from graphsenselib.utils import DataObject as MutableNamedTuple +from graphsenselib.utils.account import ( + get_id_group, + get_id_group_with_secondary_addresstransactions, + get_id_group_with_secondary_relations, +) +from graphsenselib.utils.logging import LoggerScope + +logger = logging.getLogger(__name__) + + +def prepare_txs_for_ingest( + delta: List[Tx], + id_bucket_size: int, + block_bucket_size: int, + get_transaction_prefix: Callable[[bytes], Tuple[str, str]], +) -> List[DbChange]: + changes = [] + + for update in delta: + transaction_id = update.tx_id + transaction_id_group = get_id_group(transaction_id, id_bucket_size) + transaction = update.tx_hash + + transaction_prefix = get_transaction_prefix(transaction)[1] + + data = { + "transaction_id_group": transaction_id_group, + "transaction_id": transaction_id, + "transaction": transaction, + } + + chng = DbChange.new( + table="transaction_ids_by_transaction_id_group", + data=data, + ) + changes.append(chng) + + data = { + "transaction_prefix": transaction_prefix, + "transaction": transaction, + "transaction_id": transaction_id, + } + + chng = DbChange.new( + table="transaction_ids_by_transaction_prefix", + data=data, + ) + changes.append(chng) + + changes.extend( + [ + DbChange.new( + table="block_transactions", + data={ + "block_id_group": get_id_group(tx.block_id, block_bucket_size), + "block_id": tx.block_id, + "tx_id": tx.tx_id, + }, + ) + for tx in delta + if not tx.failed + ] + ) + + return changes + + +def prepare_balances_for_ingest( + delta: List[BalanceDelta], id_bucket_size: int, addr_balances: dict +) -> List[DbChange]: + changes = [] + for balance in delta: + addr_id = balance.identifier + addr_group = get_id_group(addr_id, id_bucket_size) + balance_update = balance.left_join( + addr_balances.get(addr_id, BalanceDelta(addr_id, {})) + ) + + for assetname, dv in balance_update.asset_balances.items(): + chng = DbChange.update( + table="balance", + data={ + "address_id_group": addr_group, + "address_id": addr_id, + "currency": assetname, + "balance": dv.value, + }, + ) + + changes.append(chng) + + return changes + + +def prepare_relations_for_ingest( + delta: List[RelationDeltaAccount], + hash_to_id: Dict[str, bytes], + inrelations: dict, + outrelations: 
dict, + id_bucket_size: int, + relations_nbuckets: int, +) -> Tuple[List[DbChange], dict, dict]: + new_relations_in = defaultdict(int) + new_relations_out = defaultdict(int) + + changes = [] + + """ Merging relations deltas """ + for relations_update in delta: + outr = outrelations[ + (relations_update.src_identifier, relations_update.dst_identifier) + ].result_or_exc.one() + inr = inrelations[ + (relations_update.src_identifier, relations_update.dst_identifier) + ].result_or_exc.one() + assert (outr is None) == (inr is None) + + id_src = hash_to_id[relations_update.src_identifier] + id_dst = hash_to_id[relations_update.dst_identifier] + + src_group, src_secondary = get_id_group_with_secondary_relations( + id_src, id_dst, id_bucket_size, relations_nbuckets + ) + dst_group, dst_secondary = get_id_group_with_secondary_relations( + id_dst, id_src, id_bucket_size, relations_nbuckets + ) + + if outr is None: + """new address relation to insert""" + new_relations_out[relations_update.src_identifier] += 1 + new_relations_in[relations_update.dst_identifier] += 1 + + chng_in = DbChange.new( + table="address_incoming_relations", + data={ + "dst_address_id_group": dst_group, + "dst_address_id_secondary_group": dst_secondary, + "dst_address_id": id_dst, + "src_address_id": id_src, + "no_transactions": relations_update.no_transactions, + "value": relations_update.value, + "token_values": relations_update.token_values, + }, + ) + chng_out = DbChange.new( + table="address_outgoing_relations", + data={ + "src_address_id_group": src_group, + "src_address_id_secondary_group": src_secondary, + "src_address_id": id_src, + "dst_address_id": id_dst, + "no_transactions": relations_update.no_transactions, + "value": relations_update.value, + "token_values": relations_update.token_values, + }, + ) + + else: + """update existing address relation""" + nv = DeltaValue.from_db(outr.value).merge(relations_update.value) + + nv_token = outr.token_values + nv_token = nv_token if nv_token is not None else {} + new_token = relations_update.token_values + keys = set(nv_token.keys()).union(new_token.keys()) + for key in keys: + if key in nv_token and key in relations_update.token_values: + nv_token[key] = DeltaValue.from_db(nv_token[key]).merge( + relations_update.token_values[key] + ) + elif key in nv_token: + pass + elif key in relations_update.token_values: + nv_token[key] = relations_update.token_values[key] + + assert outr.no_transactions == inr.no_transactions + + chng_in = DbChange.update( + table="address_incoming_relations", + data={ + "dst_address_id_group": dst_group, + "dst_address_id_secondary_group": dst_secondary, + "dst_address_id": id_dst, + "src_address_id": id_src, + "no_transactions": outr.no_transactions + + relations_update.no_transactions, + # outr and inr should be the same + "value": nv, + "token_values": nv_token, + }, + ) + + chng_out = DbChange.update( + table="address_outgoing_relations", + data={ + "src_address_id_group": src_group, + "src_address_id_secondary_group": src_secondary, + "src_address_id": id_src, + "dst_address_id": id_dst, + "no_transactions": outr.no_transactions + + relations_update.no_transactions, + "value": nv, + "token_values": nv_token, + }, + ) + + changes.append(chng_in) + changes.append(chng_out) + + return changes, new_relations_in, new_relations_out + + +def prepare_entities_for_ingest( + delta: List[EntityDeltaAccount], + resolve_identifier: Dict[str, int], + bytes_to_row_address: Dict[str, Any], + new_rel_in: dict, + new_rel_out: dict, + id_bucket_size: int, + 
get_address_prefix: Callable[[str], Tuple[str, str]], +) -> Tuple[List[DbChange], int]: + changes = [] + nr_new_entities = 0 + for update in delta: + int_ident, entity = ( + resolve_identifier[update.identifier], + bytes_to_row_address[update.identifier], + ) + + group = get_id_group(int_ident, id_bucket_size) + if entity is not None: + """old Address""" + + assert getattr(entity, "address_id") == int_ident + + # recast so we can calculate without handling None all the time + new_value = EntityDeltaAccount.from_db(entity).merge(update) + # bytes to hex + bytes_ = new_value.identifier + bytes_.hex() + assert new_value.first_tx_id <= new_value.last_tx_id + + # Nr. of addresses (no_addresses) is currently not updated for clusters + # Since no merges happen there should not be a difference + generic_data = { + "no_incoming_txs": new_value.no_incoming_txs, + "no_outgoing_txs": new_value.no_outgoing_txs, + "no_incoming_txs_zero_value": new_value.no_incoming_txs_zero_value, + "no_outgoing_txs_zero_value": new_value.no_outgoing_txs_zero_value, + "first_tx_id": new_value.first_tx_id, + "last_tx_id": new_value.last_tx_id, + "total_received": new_value.total_received, + "total_spent": new_value.total_spent, + "total_tokens_received": new_value.total_tokens_received, + "total_tokens_spent": new_value.total_tokens_spent, + "in_degree": entity.in_degree + new_rel_in[update.identifier], + "out_degree": entity.out_degree + new_rel_out[update.identifier], + "in_degree_zero_value": entity.in_degree_zero_value, # todo too broad + "out_degree_zero_value": entity.out_degree_zero_value, # todo too broad + "address_id": int_ident, + "address_id_group": group, + } + + chng = DbChange.update( + table="address", + data=generic_data, + ) + + changes.append(chng) + else: + """new address""" + assert update.first_tx_id <= update.last_tx_id + nr_new_entities += 1 + + data = { + "no_incoming_txs": update.no_incoming_txs, + "no_outgoing_txs": update.no_outgoing_txs, + "no_incoming_txs_zero_value": update.no_incoming_txs_zero_value, + "no_outgoing_txs_zero_value": update.no_outgoing_txs_zero_value, + "first_tx_id": update.first_tx_id, + "last_tx_id": update.last_tx_id, + "total_received": update.total_received, + "total_spent": update.total_spent, + "total_tokens_received": update.total_tokens_received, + "total_tokens_spent": update.total_tokens_spent, + "address_id": int_ident, + "address_id_group": group, + "in_degree": new_rel_in[update.identifier], + "out_degree": new_rel_out[update.identifier], + "in_degree_zero_value": 0, + # update.no_incoming_txs_zero_value, # todo too broad + "out_degree_zero_value": 0, + # update.no_outgoing_txs_zero_value, # todo too broad + "is_contract": False, # todo + } + data["address"] = update.identifier + chng = DbChange.new(table="address", data=data) + changes.append(chng) + address, address_prefix = get_address_prefix(update.identifier) + + changes.append( + DbChange.new( + table="address_ids_by_address_prefix", + data={ + "address": address, + "address_id": int_ident, + "address_prefix": address_prefix, + }, + ) + ) + return changes, nr_new_entities + + +def prepare_entity_txs_for_ingest( + delta: List[RawEntityTxAccount], + id_bucket_size: int, + currency: str, + block_bucket_size_address_txs: int, +) -> List[DbChange]: + """ + Creating new address transaction + """ + changes = [] + for atx in delta: + ident = atx.identifier + is_token_transfer = len(atx.token_values.keys()) > 0 + for tokenname in atx.token_values.keys(): + ( + address_id_group, + address_id_secondary_group, + ) = 
get_id_group_with_secondary_addresstransactions( + ident, id_bucket_size, atx.block_id, block_bucket_size_address_txs + ) + chng = DbChange.new( + table="address_transactions", + data={ + "address_id_group": address_id_group, + "address_id_secondary_group": address_id_secondary_group, + "address_id": ident, + "currency": tokenname, + "transaction_id": atx.tx_id, + "is_outgoing": atx.is_outgoing, + "tx_reference": atx.tx_reference, + }, + ) + changes.append(chng) + + if not is_token_transfer: + ( + address_id_group, + address_id_secondary_group, + ) = get_id_group_with_secondary_addresstransactions( + ident, id_bucket_size, atx.block_id, block_bucket_size_address_txs + ) + chng = DbChange.new( + table="address_transactions", + data={ + "address_id_group": address_id_group, + "address_id_secondary_group": address_id_secondary_group, + "address_id": ident, + "currency": currency, + "transaction_id": atx.tx_id, + "is_outgoing": atx.is_outgoing, + "tx_reference": atx.tx_reference, + }, + ) + + changes.append(chng) + return changes + + +def get_bookkeeping_changes( + base_statistics: MutableNamedTuple, + current_statistics: NamedTuple, + last_block_processed: int, + nr_new_address_relations: int, + nr_new_addresses: int, + nr_new_tx: int, + highest_address_id: int, + runtime_seconds: int, + bts: Dict[int, datetime], + new_blocks: int, + patch_mode: bool, +) -> List[DbChange]: + """Creates changes for the bookkeeping tables like summary statistics after + other data has been updated. + + Args: + base_statistics (MutableNamedTuple): statistics db row, all the other + parameters are note data is updated in this process + current_statistics (NamedTuple): Current value of db statistics for comparison + last_block_processed (int): Last block processed + nr_new_address_relations (int): Delta new addresses relations in changeset + nr_new_addresses (int): Delta new addresses in changeset + nr_new_tx (int): Delta new txs in changeset + highest_address_id (int): current highest address_id + runtime_seconds (int): runtime to create the last changes in seconds + bts (Dict[int, datetime]): mapping from block to its timestamp + delta values + """ + changes = [] + with LoggerScope.debug(logger, "Creating summary_statistics updates") as lg: + lb_date = bts[last_block_processed] + stats = base_statistics + stats.no_blocks = current_statistics.no_blocks + new_blocks + stats.timestamp = int(lb_date.timestamp()) + stats.no_address_relations += nr_new_address_relations + stats.no_addresses += nr_new_addresses + stats.no_transactions += nr_new_tx + + statistics = stats.as_dict() + + if current_statistics.no_blocks != stats.no_blocks: + if not patch_mode: + assert current_statistics.no_blocks < stats.no_blocks + + changes.append(DbChange.new(table="summary_statistics", data=statistics)) + + lg.debug(f"Statistics: {statistics}") + + data_history = { + "last_synced_block": last_block_processed, + "last_synced_block_timestamp": lb_date, + "highest_address_id": highest_address_id, + "timestamp": datetime.now(), + "write_new": False, + "write_dirty": False, + "runtime_seconds": runtime_seconds, + } + changes.append(DbChange.new(table=TABLE_NAME_DELTA_HISTORY, data=data_history)) + + lg.debug(f"History: {data_history}") + + return changes diff --git a/src/graphsenselib/deltaupdate/update/account/createdeltas.py b/src/graphsenselib/deltaupdate/update/account/createdeltas.py new file mode 100644 index 0000000..0c8396c --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/account/createdeltas.py @@ -0,0 +1,689 @@ +import 
logging +from dataclasses import dataclass +from typing import Dict, List, Tuple + +import pandas as pd +from cassandra.cqlengine.columns import Integer +from cassandra.cqlengine.usertype import UserType + +from graphsenselib.deltaupdate.update.account.modelsdelta import ( + BalanceDelta, + EntityDeltaAccount, + RawEntityTxAccount, + RelationDeltaAccount, +) +from graphsenselib.deltaupdate.update.account.modelsraw import Block, Trace, Transaction +from graphsenselib.deltaupdate.update.account.tokens import TokenTransfer +from graphsenselib.deltaupdate.update.generic import DeltaScalar, DeltaValue + +logger = logging.getLogger(__name__) + +currency_to_decimals = { + "ETH": 18, + "TRX": 6, +} + + +@dataclass +class TxReference(UserType): + trace_index: Integer(required=False) + log_index: Integer(required=False) + + +def only_call_traces(traces: List) -> List: + return [trace for trace in traces if trace.call_type == "call"] + + +def get_prices( + value, decimals, block_rates, usd_equivalent, coin_equivalent +) -> List[int]: + euro_per_eth = block_rates[0] + dollar_per_eth = block_rates[1] + dollar_per_euro = dollar_per_eth / euro_per_eth + + if usd_equivalent == 1: + dollar_value = value / 10**decimals + elif coin_equivalent == 1: + dollar_value = value / 10**decimals * dollar_per_eth + else: + raise Exception( + "Unknown price type. only native coin and dollar equivalent supported atm" + ) + + euro_value = dollar_value / dollar_per_euro + return [euro_value, dollar_value] + + +def get_prices_coin(value, currency, block_rates): + coin_decimals = currency_to_decimals[currency] + return get_prices(value, coin_decimals, block_rates, 0, 1) + + +def get_entitytx_from_tokentransfer( + tokentransfer: TokenTransfer, is_outgoing, rates, hash_to_id, address_hash_to_id +) -> RawEntityTxAccount: + tx_id = hash_to_id[tokentransfer.tx_hash] + + address_hash = ( + tokentransfer.from_address if is_outgoing else tokentransfer.to_address + ) + address_id = address_hash_to_id[address_hash] + + dv = DeltaValue( + tokentransfer.value, + get_prices( + tokentransfer.value, + tokentransfer.decimals, + rates[tokentransfer.block_id], + tokentransfer.usd_equivalent, + tokentransfer.coin_equivalent, + ), + ) + + token_values = {tokentransfer.asset: dv} + + tx_reference = { + "trace_index": None, + "log_index": tokentransfer.log_index, + } + tx_reference = TxReference(**tx_reference) + + reta = RawEntityTxAccount( + identifier=address_id, + is_outgoing=is_outgoing, + tx_id=tx_id, + tx_reference=tx_reference, + value=0, + token_values=token_values, + block_id=tokentransfer.block_id, + ) + return reta + + +def get_entitytx_from_transaction( + tx: Transaction, is_outgoing, hash_to_id, address_hash_to_id +) -> RawEntityTxAccount: + tx_id = hash_to_id[tx.tx_hash] + address_hash = tx.from_address if is_outgoing else tx.to_address + + address_id = address_hash_to_id[address_hash] + + tx_reference = { + "trace_index": None, + "log_index": None, + } + tx_reference = TxReference(**tx_reference) + + reta = RawEntityTxAccount( + identifier=address_id, + is_outgoing=is_outgoing, + tx_id=tx_id, + tx_reference=tx_reference, + value=tx.value, + token_values={}, + block_id=tx.block_id, + ) + return reta + + +def balance_updates_traces_txs( + relation_updates: List[RelationDeltaAccount], + address_hash_to_id: Dict[bytes, int], + currency: str, +) -> List[BalanceDelta]: + excludedCallTypes = [ + "delegatecall", + "callcode", + "staticcall", + ] + filtered_updates = [ + update for update in relation_updates if update.type not in 
excludedCallTypes + ] + + return [ + BalanceDelta( + address_hash_to_id[update.src_identifier], + {currency: DeltaScalar(-update.value.value)}, + ) + for update in filtered_updates + ] + [ + BalanceDelta( + address_hash_to_id[update.dst_identifier], + {currency: DeltaScalar(update.value.value)}, + ) + for update in filtered_updates + if update.dst_identifier is not None + ] + + +def balance_updates_tokens( + relation_updates: List[RelationDeltaAccount], address_hash_to_id: Dict[bytes, int] +) -> List[BalanceDelta]: + updates = [] + for update in relation_updates: + for token, value in update.token_values.items(): + updates.append( + BalanceDelta( + address_hash_to_id[update.src_identifier], + {token: DeltaScalar(-value.value)}, + ) + ) + updates.append( + BalanceDelta( + address_hash_to_id[update.dst_identifier], + {token: DeltaScalar(value.value)}, + ) + ) + + return updates + + +def get_entitytx_from_trace( + trace: Trace, is_outgoing: bool, hash_to_id: dict, address_hash_to_id: dict +) -> RawEntityTxAccount: + tx_id = hash_to_id[trace.tx_hash] + address_hash = trace.from_address if is_outgoing else trace.to_address + address_id = address_hash_to_id[address_hash] + + tx_reference = { + "trace_index": trace.trace_index, + "log_index": None, + } + tx_reference = TxReference(**tx_reference) + + reta = RawEntityTxAccount( + identifier=address_id, + is_outgoing=is_outgoing, + tx_id=tx_id, + tx_reference=tx_reference, + value=trace.value, + token_values={}, # we dont support TRC10 right now + block_id=trace.block_id, + ) + return reta + + +def get_entitydelta_from_trace( + trace: Trace, + is_outgoing: bool, + rates: Dict[int, Tuple[float, float]], + hash_to_id: dict, + currency: str, +) -> EntityDeltaAccount: + identifier = trace.from_address if is_outgoing else trace.to_address + total_received_value = 0 if is_outgoing else trace.value + total_spent_value = trace.value if is_outgoing else 0 + total_received = DeltaValue( + total_received_value, + get_prices_coin(total_received_value, currency, rates[trace.block_id]), + ) + total_spent = DeltaValue( + total_spent_value, + get_prices_coin(total_spent_value, currency, rates[trace.block_id]), + ) + total_tokens_received = ( + {} + ) # for now we dont support TRC10, so an empty dict is fine + total_tokens_spent = {} # for now we dont support TRC10, so an empty dict is fine + if trace.tx_hash is None: + first_tx_id = -1 + last_tx_id = -1 + no_incoming_txs = 0 # spark logic + else: + first_tx_id = hash_to_id[trace.tx_hash] + last_tx_id = hash_to_id[trace.tx_hash] + no_incoming_txs = int(not is_outgoing) + + no_outgoing_txs = int(is_outgoing) + no_zerovalue = int((trace.value == 0)) # and trace.call_type == "call") + no_incoming_txs_zero_value = 0 if is_outgoing else no_zerovalue + no_outgoing_txs_zero_value = no_zerovalue if is_outgoing else 0 + + eda = EntityDeltaAccount( + identifier=identifier, + total_received=total_received, + total_spent=total_spent, + total_tokens_received=total_tokens_received, + total_tokens_spent=total_tokens_spent, + first_tx_id=first_tx_id, + last_tx_id=last_tx_id, + no_incoming_txs=no_incoming_txs, + no_outgoing_txs=no_outgoing_txs, + no_incoming_txs_zero_value=no_incoming_txs_zero_value, + no_outgoing_txs_zero_value=no_outgoing_txs_zero_value, + ) + return eda + + +def get_entitydelta_from_tokentransfer( + tokentransfer: TokenTransfer, + is_outgoing: bool, + rates: Dict[int, Tuple[float, float]], + hash_to_id: dict, +) -> EntityDeltaAccount: + identifier = tokentransfer.from_address if is_outgoing else 
tokentransfer.to_address + + fiat_values = get_prices( + tokentransfer.value, + tokentransfer.decimals, + rates[tokentransfer.block_id], + tokentransfer.usd_equivalent, + tokentransfer.coin_equivalent, + ) + dv = DeltaValue(tokentransfer.value, fiat_values) + + total_received = DeltaValue(0, [0, 0]) + total_spent = DeltaValue(0, [0, 0]) + total_tokens_received = {tokentransfer.asset: dv} if not is_outgoing else {} + total_tokens_spent = {tokentransfer.asset: dv} if is_outgoing else {} + first_tx_id = hash_to_id[tokentransfer.tx_hash] + last_tx_id = hash_to_id[tokentransfer.tx_hash] + no_incoming_txs = int(not is_outgoing) + no_outgoing_txs = int(is_outgoing) + no_incoming_txs_zero_value = 0 + no_outgoing_txs_zero_value = 0 + + eda = EntityDeltaAccount( + identifier=identifier, + total_received=total_received, + total_spent=total_spent, + total_tokens_received=total_tokens_received, + total_tokens_spent=total_tokens_spent, + first_tx_id=first_tx_id, + last_tx_id=last_tx_id, + no_incoming_txs=no_incoming_txs, + no_outgoing_txs=no_outgoing_txs, + no_incoming_txs_zero_value=no_incoming_txs_zero_value, + no_outgoing_txs_zero_value=no_outgoing_txs_zero_value, + ) + return eda + + +def get_entitydelta_from_transaction( + tx: Transaction, + is_outgoing: bool, + rates: Dict[int, Tuple[float, float]], + hash_to_id: Dict[str, int], + currency: str, +) -> EntityDeltaAccount: + identifier = tx.from_address if is_outgoing else tx.to_address + + total_received_value = 0 if is_outgoing else tx.value + total_spent_value = tx.value if is_outgoing else 0 + + total_received = DeltaValue( + total_received_value, + get_prices_coin(total_received_value, currency, rates[tx.block_id]), + ) + total_spent = DeltaValue( + total_spent_value, + get_prices_coin(total_spent_value, currency, rates[tx.block_id]), + ) + total_tokens_received = {} + total_tokens_spent = {} + first_tx_id = hash_to_id[tx.tx_hash] + last_tx_id = hash_to_id[tx.tx_hash] + + no_incoming_txs = int(not is_outgoing) + no_outgoing_txs = int(is_outgoing) + no_incoming_txs_zero_value = 0 if is_outgoing else int(tx.value == 0) + no_outgoing_txs_zero_value = int(tx.value == 0) if is_outgoing else 0 + + eda = EntityDeltaAccount( + identifier=identifier, + total_received=total_received, + total_spent=total_spent, + total_tokens_received=total_tokens_received, + total_tokens_spent=total_tokens_spent, + first_tx_id=first_tx_id, + last_tx_id=last_tx_id, + no_incoming_txs=no_incoming_txs, + no_outgoing_txs=no_outgoing_txs, + no_incoming_txs_zero_value=no_incoming_txs_zero_value, + no_outgoing_txs_zero_value=no_outgoing_txs_zero_value, + ) + return eda + + +def relationdelta_from_trace( + trace: Trace, rates: Dict[int, Tuple[float, float]], currency: str +) -> RelationDeltaAccount: + fadr, tadr = trace.from_address, trace.to_address + value = DeltaValue( + trace.value, get_prices_coin(trace.value, currency, rates[trace.block_id]) + ) + token_values = {} + + no_transactions = 1 + return RelationDeltaAccount( + src_identifier=fadr, + dst_identifier=tadr, + no_transactions=no_transactions, + value=value, + token_values=token_values, + type=trace.call_type, + ) + + +def relationdelta_from_transaction( + tx: Transaction, rates: Dict[int, Tuple[float, float]], currency: str +) -> RelationDeltaAccount: + iadr, oadr = tx.from_address, tx.to_address + value = DeltaValue( + tx.value, get_prices_coin(tx.value, currency, rates[tx.block_id]) + ) + token_values = {} + no_transactions = 1 + + return RelationDeltaAccount( + src_identifier=iadr, + dst_identifier=oadr, + 
no_transactions=no_transactions, + value=value, + token_values=token_values, + type="tx", + ) + + +def relationdelta_from_tokentransfer( + tokentransfer: TokenTransfer, rates: Dict[int, Tuple[float, float]] +) -> RelationDeltaAccount: + iadr, oadr = tokentransfer.from_address, tokentransfer.to_address + value = tokentransfer.value + dollar_value, euro_value = get_prices( + tokentransfer.value, + tokentransfer.decimals, + rates[tokentransfer.block_id], + tokentransfer.usd_equivalent, + tokentransfer.coin_equivalent, + ) + value = DeltaValue(value, [dollar_value, euro_value]) + + token_values = {tokentransfer.asset: value} + no_transactions = 1 + return RelationDeltaAccount( + src_identifier=iadr, + dst_identifier=oadr, + no_transactions=no_transactions, + value=DeltaValue(0, [0, 0]), + token_values=token_values, + type="token", + ) + + +def get_sorted_unique_addresses( + traces_s: List[Trace], + reward_traces: List[Trace], + token_transfers: List[TokenTransfer], + transactions: List[Transaction], +) -> pd.Series: + addresses_sorting_df_to_tokens = [ + { + "address": obj.to_address, + "block_id": obj.block_id, + "is_log": True, + "index": obj.log_index, + "is_from_address": False, + } + for obj in token_transfers + ] + + addresses_sorting_df_from_tokens = [ + { + "address": obj.from_address, + "block_id": obj.block_id, + "is_log": True, + "index": obj.log_index, + "is_from_address": True, + } + for obj in token_transfers + ] + + addresses_sorting_df_to_traces = [ + { + "address": obj.to_address, + "block_id": obj.block_id, + "is_log": False, + "index": obj.trace_index, + "is_from_address": False, + } + for obj in traces_s + reward_traces + ] + + addresses_sorting_df_from_traces = [ + { + "address": obj.from_address, + "block_id": obj.block_id, + "is_log": False, + "index": obj.trace_index, + "is_from_address": True, + } + for obj in traces_s + ] + + addresses_sorting_df_from_txs = [ + { + "address": obj.from_address, + "block_id": obj.block_id, + "is_log": False, + # this is a hack to imitate spark; we assume there a max 1M tx per block + "index": obj.transaction_index - 1_000_000, + "is_from_address": True, + } + for obj in transactions + if obj.from_address is not None + ] + addresses_sorting_df_to_txs = [ + { + "address": obj.to_address, + "block_id": obj.block_id, + "is_log": False, + # this is a hack to imitate spark; we assume there a max 1M tx per block + "index": obj.transaction_index - 1_000_000, + "is_from_address": False, + } + for obj in transactions + ] + + addresses_sorting_df_data = ( + addresses_sorting_df_from_traces + + addresses_sorting_df_to_traces + + addresses_sorting_df_from_txs + + addresses_sorting_df_to_txs + + addresses_sorting_df_from_tokens + + addresses_sorting_df_to_tokens + ) + + addresses_sorting_df = pd.DataFrame(addresses_sorting_df_data) + + addresses_sorting_df.sort_values( + inplace=True, by=["block_id", "is_log", "index", "is_from_address"] + ) + df_sorted_unique = addresses_sorting_df.drop_duplicates( + keep="first", subset=["address"] + ) + addresses = df_sorted_unique["address"] + return addresses + + +def get_entity_transaction_updates_trace_token( + traces_s_filtered: List[Trace], + token_transfers: List[TokenTransfer], + hash_to_id: dict, + address_hash_to_id: dict, + rates: dict, +) -> List[RawEntityTxAccount]: + trace_outgoing = [ + get_entitytx_from_trace(trace, True, hash_to_id, address_hash_to_id) + for trace in traces_s_filtered + ] + + trace_incoming = [ + get_entitytx_from_trace(trace, False, hash_to_id, address_hash_to_id) + for trace in 
traces_s_filtered + ] + + token_outgoing = [ + get_entitytx_from_tokentransfer(tt, True, rates, hash_to_id, address_hash_to_id) + for tt in token_transfers + ] + token_incoming = [ + get_entitytx_from_tokentransfer( + tt, False, rates, hash_to_id, address_hash_to_id + ) + for tt in token_transfers + ] + + entity_transactions_traces_tokens = ( + trace_outgoing + trace_incoming + token_outgoing + token_incoming + ) + return entity_transactions_traces_tokens + + +def get_entity_updates_trace_token( + traces_s_filtered: List[Trace], + token_transfers: List[TokenTransfer], + reward_traces: List[Trace], + hash_to_id: dict, + currency: str, + rates: dict, +): + trace_outgoing = [ + get_entitydelta_from_trace(trace, True, rates, hash_to_id, currency) + for trace in traces_s_filtered + ] + + trace_incoming = [ + get_entitydelta_from_trace(trace, False, rates, hash_to_id, currency) + for trace in traces_s_filtered + reward_traces + ] + + token_outgoing = [ + get_entitydelta_from_tokentransfer(tt, True, rates, hash_to_id) + for tt in token_transfers + ] + + token_incoming = [ + get_entitydelta_from_tokentransfer(tt, False, rates, hash_to_id) + for tt in token_transfers + ] + + entity_deltas_traces_tokens = ( + trace_outgoing + trace_incoming + token_outgoing + token_incoming + ) + return entity_deltas_traces_tokens + + +def get_entity_transactions_updates_tx( + transactions: List[Transaction], + hash_to_id: Dict[str, int], + address_hash_to_id: Dict[bytes, int], +) -> List[RawEntityTxAccount]: + outgoing = [ + get_entitytx_from_transaction(tx, True, hash_to_id, address_hash_to_id) + for tx in transactions + if tx.from_address is not None + ] + incoming = [ + get_entitytx_from_transaction(tx, False, hash_to_id, address_hash_to_id) + for tx in transactions + if tx.to_address is not None + ] + entity_transactions_tx = outgoing + incoming + return entity_transactions_tx + + +def get_entity_updates_tx( + transactions: List[Transaction], + hash_to_id: Dict[str, int], + currency: str, + rates: Dict[int, Tuple[float, float]], +) -> List[EntityDeltaAccount]: + outgoing = [ + get_entitydelta_from_transaction(tx, True, rates, hash_to_id, currency) + for tx in transactions + if tx.from_address is not None + ] + incoming = [ + get_entitydelta_from_transaction(tx, False, rates, hash_to_id, currency) + for tx in transactions + if tx.to_address is not None + ] + + entity_deltas_tx = outgoing + incoming + + return entity_deltas_tx + + +def get_balance_deltas( + relation_updates_trace: List[RelationDeltaAccount], + relation_updates_tx: List[RelationDeltaAccount], + relation_updates_tokens: List[RelationDeltaAccount], + reward_traces: List[Trace], + transactions: List[Transaction], + blocks: List[Block], + address_hash_to_id: Dict[bytes, int], + currency: str, +) -> List[BalanceDelta]: + credits_debits_tokens_eth = [] + credits_debits_tokens_eth += balance_updates_traces_txs( + relation_updates_trace + relation_updates_tx, address_hash_to_id, currency + ) + credits_debits_tokens_eth += balance_updates_tokens( + relation_updates_tokens, address_hash_to_id + ) + + miner_rewards = [ + BalanceDelta(address_hash_to_id[t.to_address], {currency: DeltaScalar(t.value)}) + for t in reward_traces + ] + + if currency == "TRX": + txFeeDebits = [] + burntFees = [] + txFeeCredits = [ + BalanceDelta( + address_hash_to_id[tx.from_address], + {currency: DeltaScalar(-tx.fee)}, + ) + for tx in transactions + if tx.from_address in address_hash_to_id + ] + elif currency == "ETH": + block_to_miner_id = { + block.block_id: 
address_hash_to_id[block.miner] for block in blocks + } + txFeeDebits = [ + BalanceDelta( + block_to_miner_id[tx.block_id], + {currency: DeltaScalar(tx.receipt_gas_used * tx.gas_price)}, + ) + for tx in transactions + ] + + burntFees = [ + BalanceDelta( + block_to_miner_id[b.block_id], + {currency: DeltaScalar(-b.base_fee_per_gas * b.gas_used)}, + ) + for b in blocks + ] + txFeeCredits = [ + BalanceDelta( + address_hash_to_id[tx.from_address], + {currency: DeltaScalar(-tx.receipt_gas_used * tx.gas_price)}, + ) + for tx in transactions + if tx.from_address in address_hash_to_id + ] + else: + raise ValueError(f"Unknown currency {currency}") + + balance_updates = ( + credits_debits_tokens_eth + + txFeeDebits + + txFeeCredits + + burntFees + + miner_rewards + ) + return balance_updates diff --git a/src/graphsenselib/deltaupdate/update/account/modelsdelta.py b/src/graphsenselib/deltaupdate/update/account/modelsdelta.py new file mode 100644 index 0000000..8f36fb6 --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/account/modelsdelta.py @@ -0,0 +1,252 @@ +from __future__ import annotations + +from dataclasses import dataclass +from functools import reduce +from typing import List + +from cassandra.cqlengine.usertype import UserType + +from graphsenselib.deltaupdate.update.generic import ( + DeltaScalar, + DeltaUpdate, + DeltaValue, + merge_asset_dicts, + minusone_respecting_function, +) +from graphsenselib.utils import group_by, groupby_property + + +@dataclass +class EntityDeltaAccount(DeltaUpdate): + identifier: str + total_received: DeltaValue + total_spent: DeltaValue + total_tokens_received: dict[str, DeltaValue] + total_tokens_spent: dict[str, DeltaValue] + first_tx_id: int + last_tx_id: int + no_incoming_txs: int + no_outgoing_txs: int + no_incoming_txs_zero_value: int + no_outgoing_txs_zero_value: int + + @classmethod + def from_db(Cls, db_row): + identifier = db_row.address + + # empty mapping is None in cassandra + # python cassandra driver saves {} as None, so we dont + # need to worry about empty dicts not + # fitting into cassandra later + if db_row.total_tokens_spent is None: + total_tokens_spent = {} + else: + total_tokens_spent = { + k: DeltaValue.from_db(v) for k, v in db_row.total_tokens_spent.items() + } + + if db_row.total_tokens_received is None: + total_tokens_received = {} + else: + total_tokens_received = { + k: DeltaValue.from_db(v) + for k, v in db_row.total_tokens_received.items() + } + + return Cls( + identifier=identifier, + total_received=DeltaValue.from_db(db_row.total_received), + total_spent=DeltaValue.from_db(db_row.total_spent), + total_tokens_received=total_tokens_received, + total_tokens_spent=total_tokens_spent, + first_tx_id=db_row.first_tx_id, + last_tx_id=db_row.last_tx_id, + no_incoming_txs=db_row.no_incoming_txs, + no_outgoing_txs=db_row.no_outgoing_txs, + no_incoming_txs_zero_value=db_row.no_incoming_txs_zero_value, + no_outgoing_txs_zero_value=db_row.no_outgoing_txs_zero_value, + ) + + def merge(self, other_delta): + assert self.identifier == other_delta.identifier + + # self and other total_tokens_received + # may not have the same keys, fix the following: + total_tokens_received = merge_asset_dicts( + self.total_tokens_received, other_delta.total_tokens_received + ) + + total_tokens_spent = merge_asset_dicts( + self.total_tokens_spent, other_delta.total_tokens_spent + ) + + return EntityDeltaAccount( + identifier=self.identifier, + total_received=self.total_received.merge(other_delta.total_received), + 
total_spent=self.total_spent.merge(other_delta.total_spent), + total_tokens_received=total_tokens_received, + total_tokens_spent=total_tokens_spent, + first_tx_id=minusone_respecting_function( + self.first_tx_id, other_delta.first_tx_id, min + ), + last_tx_id=minusone_respecting_function( + self.last_tx_id, other_delta.last_tx_id, max + ), + no_incoming_txs=self.no_incoming_txs + other_delta.no_incoming_txs, + no_outgoing_txs=self.no_outgoing_txs + other_delta.no_outgoing_txs, + no_incoming_txs_zero_value=self.no_incoming_txs_zero_value + + other_delta.no_incoming_txs_zero_value, + no_outgoing_txs_zero_value=self.no_outgoing_txs_zero_value + + other_delta.no_outgoing_txs_zero_value, + ) + + +@dataclass +class RawEntityTxAccount: + identifier: str + is_outgoing: bool + tx_id: int + tx_reference: UserType + block_id: int + value: int + token_values: dict[str, int] + + +@dataclass +class RelationDeltaAccount(DeltaUpdate): + src_identifier: bytes + dst_identifier: bytes + no_transactions: int + value: DeltaValue + token_values: dict[str, DeltaValue] + type: str # noqa + + @classmethod + def from_db(Cls, db_row): + return Cls( + src_identifier=getattr(db_row, "src_address"), + dst_identifier=getattr(db_row, "dst_address"), + no_transactions=db_row.no_transactions, + value=DeltaValue.from_db(db_row.value), + token_values={ + k: DeltaValue.from_db(v) for k, v in db_row.token_values.items() + }, + type="from_db", + ) + + def merge(self, other_delta): + assert self.src_identifier == other_delta.src_identifier + assert self.dst_identifier == other_delta.dst_identifier + + token_values = merge_asset_dicts(self.token_values, other_delta.token_values) + + return RelationDeltaAccount( + src_identifier=self.src_identifier, + dst_identifier=self.dst_identifier, + value=self.value.merge(other_delta.value), + token_values=token_values, + no_transactions=self.no_transactions + other_delta.no_transactions, + type="merged", + ) + + +@dataclass +class BalanceDelta(DeltaUpdate): + identifier: int + asset_balances: dict[str, DeltaScalar] + + @classmethod + def from_db(Cls, identifier, db_row_list): + if len(db_row_list) == 0: + return Cls( + identifier=identifier, + asset_balances={}, + ) + + asset_balances = {x.currency: DeltaScalar(x.balance) for x in db_row_list} + return Cls( + identifier=identifier, + asset_balances=asset_balances, + ) + + def merge(self, other_delta): + assert self.identifier == other_delta.identifier + + asset_balances = { + k: self.asset_balances.get(k, DeltaScalar(0)).merge( + other_delta.asset_balances.get(k, DeltaScalar(0)) + ) + for k in set(self.asset_balances.keys()) + | set(other_delta.asset_balances.keys()) + } + return BalanceDelta( + identifier=self.identifier, + asset_balances=asset_balances, + ) + + def left_join(self, other_delta): + assert self.identifier == other_delta.identifier + asset_balances = { + k: self.asset_balances.get(k, DeltaScalar(0)).merge( + other_delta.asset_balances.get(k, DeltaScalar(0)) + ) + for k in self.asset_balances.keys() + } + return BalanceDelta( + identifier=self.identifier, + asset_balances=asset_balances, + ) + + +@dataclass +class DbDeltaAccount: + entity_updates: List[EntityDeltaAccount] + new_entity_txs: List[RawEntityTxAccount] + relation_updates: List[RelationDeltaAccount] + balance_updates: List[BalanceDelta] + + def concat(self, other): + return DbDeltaAccount( + entity_updates=self.entity_updates + other.entity_updates, + new_entity_txs=self.new_entity_txs + other.new_entity_txs, + relation_updates=self.relation_updates + 
other.relation_updates, + balance_updates=self.balance_updates + other.balance_updates, + ) + + @staticmethod + def merge(change_sets: List[DbDeltaAccount]) -> "DbDeltaAccount": + return reduce(lambda x, y: x.concat(y), change_sets).compress() + + def compress(self): + grouped = groupby_property( + self.entity_updates, "identifier", sort_by="first_tx_id" + ) + entity_updates_merged = { + k: reduce(lambda x, y: x.merge(y), v) for k, v in grouped.items() + } + assert len(entity_updates_merged.keys()) == len( + set(entity_updates_merged.keys()) + ) + + grouped = group_by( + self.relation_updates, lambda x: (x.src_identifier, x.dst_identifier) + ) + relations_updates_merged = { + (src, dst): reduce(lambda x, y: x.merge(y), v) + for (src, dst), v in grouped.items() + } + + grouped = group_by(self.balance_updates, lambda x: x.identifier) + balance_updates_merged = { + k: reduce(lambda x, y: x.merge(y), v) for k, v in grouped.items() + } + + return DbDeltaAccount( + entity_updates=sorted( + entity_updates_merged.values(), + key=lambda x: (x.first_tx_id, x.last_tx_id), + ), + new_entity_txs=self.new_entity_txs, + relation_updates=list(relations_updates_merged.values()), + balance_updates=list(balance_updates_merged.values()), + ) diff --git a/src/graphsenselib/deltaupdate/update/account/modelsraw.py b/src/graphsenselib/deltaupdate/update/account/modelsraw.py new file mode 100644 index 0000000..04a4f55 --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/account/modelsraw.py @@ -0,0 +1,124 @@ +from typing import Optional + +from pydantic import BaseModel + + +class BlockchainAdapter: + datamodel = None + name_remapping = {} + field_processing = {} + dataclass_name = "" + + def dict_to_dataclass(self, data_dict): + return self.datamodel.parse_obj(data_dict) + + def process_fields(self, data_object): + # Check if the object is an instance of a dataclass + for field_name, field_processor in self.field_processing.items(): + setattr( + data_object, + field_name, + field_processor(getattr(data_object, field_name)), + ) + return data_object + + def rename_dict(self, data_dict): + for old_name, new_name in self.name_remapping.items(): + if old_name in data_dict: + data_dict[new_name] = data_dict.pop(old_name) + return data_dict + + def dict_to_renamed_dataclass(self, data_dict): + dc = self.datamodel + renamed_dict = self.rename_dict(data_dict) + data_req = {k: v for k, v in renamed_dict.items() if k in dc.__annotations__} + return dc(**data_req) + + def dicts_to_dataclasses(self, data_dicts): + return [self.dict_to_dataclass(data_dict) for data_dict in data_dicts] + + def dicts_to_renamed_dataclasses(self, data_dicts): + return [self.dict_to_renamed_dataclass(data_dict) for data_dict in data_dicts] + + def process_fields_in_list(self, data_list): + return [self.process_fields(data_object) for data_object in data_list] + + +class Trace(BaseModel): + block_id: int + tx_hash: Optional[bytes] + trace_index: int + from_address: Optional[bytes] + to_address: Optional[bytes] + value: int + call_type: Optional[str] + status: int + + +class Transaction(BaseModel): + transaction_index: int + tx_hash: bytes + from_address: Optional[bytes] + to_address: Optional[bytes] + value: int + gas_price: int + transaction_type: int + receipt_gas_used: int + receipt_status: int + block_id: int + + +class TronTransaction(Transaction): + fee: Optional[int] + + +class Log(BaseModel): + block_id: int + tx_hash: bytes + log_index: int + address: bytes + topics: list + data: bytes + + +class Block(BaseModel): + block_id: int + 
miner: bytes + base_fee_per_gas: int + gas_used: int + + +class AccountTraceAdapter(BlockchainAdapter): + datamodel = Trace + + +class AccountTransactionAdapter(BlockchainAdapter): + datamodel = Transaction + + +class TrxTransactionAdapter(BlockchainAdapter): + datamodel = TronTransaction + + +class AccountLogAdapter(BlockchainAdapter): + datamodel = Log + + +class TrxTraceAdapter(AccountTraceAdapter): + def __init__(self): + self.name_remapping = { + "caller_address": "from_address", + "transferto_address": "to_address", + "rejected": "status", + "note": "call_type", + "call_value": "value", + } + self.field_processing = {"status": lambda x: int(not x)} # cast boolean to int + + +class EthTraceAdapter(AccountTraceAdapter): + pass + + +class AccountBlockAdapter(BlockchainAdapter): + datamodel = Block diff --git a/src/graphsenselib/deltaupdate/update/account/tokens.py b/src/graphsenselib/deltaupdate/update/account/tokens.py new file mode 100644 index 0000000..9b1416d --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/account/tokens.py @@ -0,0 +1,106 @@ +from dataclasses import dataclass + +from eth_abi import decode_single +from eth_utils import function_abi_to_4byte_selector, to_hex +from web3 import Web3 + +from graphsenselib.deltaupdate.update.resources.supported_tokens_eth import ( + SUPPORTED_TOKENS as eth_tokens, +) +from graphsenselib.deltaupdate.update.resources.supported_tokens_trx import ( + SUPPORTED_TOKENS as trx_tokens, +) + + +@dataclass +class TokenTransfer: + from_address: bytes + to_address: bytes + value: int + asset: str + decimals: int + coin_equivalent: int + usd_equivalent: float + block_id: int + tx_hash: bytes + log_index: int + + +class ERC20Decoder: + def __init__(self, network="eth"): + self.w3 = Web3() + + self.token_transfer_event_abi = { + "anonymous": False, + "inputs": [ + {"indexed": True, "name": "from", "type": "address"}, + {"indexed": True, "name": "to", "type": "address"}, + {"indexed": False, "name": "value", "type": "uint256"}, + ], + "name": "Transfer", + "type": "event", + } + + self.token_transfer_event_selector = self.get_event_selector( + self.token_transfer_event_abi + ) + self.network = network + + # todo this should be in a config file + if self.network == "eth": + self.supported_tokens = eth_tokens + elif self.network == "trx": + self.supported_tokens = trx_tokens + else: + raise Exception("Unsupported network") + + def get_event_selector(self, event_abi): + return to_hex(function_abi_to_4byte_selector(event_abi)) + + def log_to_transfer(self, log): + if "0x" + log.address.hex() in self.supported_tokens["address"].values: + return self.decode_transfer(log) + + def decode_transfer(self, log): + if "0x" + log.topics[0].hex()[:8] == self.token_transfer_event_selector: + if "0x" + log.address.hex() not in self.supported_tokens["address"].values: + raise Exception( + "Unsupported token, use the log_to_transfer function instead" + ) + + try: + sender = bytes.fromhex( + self.w3.toChecksumAddress(decode_single("address", log.topics[1]))[ + 2: + ] + ) + recipient = bytes.fromhex( + self.w3.toChecksumAddress(decode_single("address", log.topics[2]))[ + 2: + ] + ) + value = decode_single("uint256", log.data) + mask = self.supported_tokens["address"] == "0x" + log.address.hex() + asset = self.supported_tokens[mask]["asset"].values[0] + coin_equivalent = self.supported_tokens[mask]["coin_equivalent"].values[ + 0 + ] + usd_equivalent = self.supported_tokens[mask]["usd_equivalent"].values[0] + decimals = 
self.supported_tokens[mask]["decimals"].values[0] + + return TokenTransfer( + from_address=sender, + to_address=recipient, + value=value, + asset=asset, + decimals=decimals, + coin_equivalent=coin_equivalent, + usd_equivalent=usd_equivalent, + block_id=log.block_id, + tx_hash=log.tx_hash, + log_index=log.log_index, + ) + except Exception: + return None # cant be decoded + else: + return None # not a transfer event diff --git a/src/graphsenselib/deltaupdate/update/account/update.py b/src/graphsenselib/deltaupdate/update/account/update.py new file mode 100644 index 0000000..3236f34 --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/account/update.py @@ -0,0 +1,741 @@ +import logging +import time +from typing import Dict, List, Tuple + +import pandas as pd +from diskcache import Cache + +from graphsenselib.config.config import DeltaUpdaterConfig +from graphsenselib.db import DbChange +from graphsenselib.deltaupdate.update.abstractupdater import ( + TABLE_NAME_DELTA_HISTORY, + UpdateStrategy, +) +from graphsenselib.deltaupdate.update.account.createchanges import ( + get_bookkeeping_changes, + prepare_balances_for_ingest, + prepare_entities_for_ingest, + prepare_entity_txs_for_ingest, + prepare_relations_for_ingest, + prepare_txs_for_ingest, +) +from graphsenselib.deltaupdate.update.account.createdeltas import ( + get_balance_deltas, + get_entity_transaction_updates_trace_token, + get_entity_transactions_updates_tx, + get_entity_updates_trace_token, + get_entity_updates_tx, + get_sorted_unique_addresses, + only_call_traces, + relationdelta_from_tokentransfer, + relationdelta_from_trace, + relationdelta_from_transaction, +) +from graphsenselib.deltaupdate.update.account.modelsdelta import ( + BalanceDelta, + DbDeltaAccount, + RawEntityTxAccount, + RelationDeltaAccount, +) +from graphsenselib.deltaupdate.update.account.modelsraw import ( + AccountBlockAdapter, + AccountLogAdapter, + AccountTransactionAdapter, + Block, + EthTraceAdapter, + Log, + Trace, + Transaction, + TrxTraceAdapter, + TrxTransactionAdapter, +) +from graphsenselib.deltaupdate.update.account.tokens import ERC20Decoder +from graphsenselib.deltaupdate.update.generic import ApplicationStrategy, Tx +from graphsenselib.deltaupdate.update.utxo.update import apply_changes +from graphsenselib.schema.schema import GraphsenseSchemas +from graphsenselib.utils import DataObject as MutableNamedTuple +from graphsenselib.utils import no_nones +from graphsenselib.utils.account import ( + get_id_group_with_secondary_addresstransactions, + get_id_group_with_secondary_relations, +) +from graphsenselib.utils.cache import TableBasedCache +from graphsenselib.utils.errorhandling import CrashRecoverer +from graphsenselib.utils.logging import LoggerScope + +logger = logging.getLogger(__name__) + + +COINBASE_PSEUDO_ADDRESS = None +PSEUDO_ADDRESS_AND_IDS = {COINBASE_PSEUDO_ADDRESS: -1} + +DEFAULT_SUMMARY_STATISTICS = MutableNamedTuple( + **{ + "id": 0, + "timestamp": 0, + "timestamp_transform": 0, + "no_blocks": 0, + "no_blocks_transform": 0, + "no_transactions": 0, + "no_addresses": 0, + "no_address_relations": 0, + "no_clusters": 0, + "no_cluster_relations": 0, + } +) + + +class UpdateStrategyAccount(UpdateStrategy): + def __init__( + self, + db, + du_config: DeltaUpdaterConfig, + pedantic: bool, + application_strategy: ApplicationStrategy = ApplicationStrategy.TX, + patch_mode: bool = False, + forward_fill_rates: bool = False, + ): + super().__init__(db, du_config.currency, forward_fill_rates=forward_fill_rates) + self.du_config = du_config + 
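# Illustration only, not part of this patch: a minimal, dependency-free sketch of what
# ERC20Decoder.decode_transfer above extracts from a Transfer log. It assumes the Log
# fields prepared during ingest (topics as 32-byte values, data holding the ABI-encoded
# amount); the actual decoder uses eth_abi/web3 plus the curated token lists instead.
# The helper names (TRANSFER_TOPIC, sketch_decode_transfer) are illustrative.

# keccak256("Transfer(address,address,uint256)"), the standard ERC-20/TRC-20 Transfer topic
TRANSFER_TOPIC = bytes.fromhex(
    "ddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef"
)


def sketch_decode_transfer(topics, data):
    """Return (sender, recipient, value) for a Transfer log, or None otherwise."""
    if len(topics) < 3 or bytes(topics[0]) != TRANSFER_TOPIC:
        return None  # not a Transfer event
    sender = bytes(topics[1])[-20:]  # indexed addresses are right-aligned in 32-byte topics
    recipient = bytes(topics[2])[-20:]
    value = int.from_bytes(data[-32:], "big")  # uint256 amount encoded in the data field
    return sender, recipient, value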
crash_file = ( + "/tmp/account_deltaupdate_" + f"{self._db.raw.get_keyspace()}_{self._db.transformed.get_keyspace()}" + "_crashreport.err" + ) + stats_value = self._db.transformed.get_summary_statistics() + """ Make statistics row mutable""" + self._statistics = ( + MutableNamedTuple(**stats_value._asdict()) + if stats_value is not None + else DEFAULT_SUMMARY_STATISTICS + ) + # get ingest config + self._pedantic = pedantic + self._patch_mode = patch_mode + self.changes = None + self.application_strategy = application_strategy + logger.info(f"Updater running in {application_strategy} mode.") + self.crash_recoverer = CrashRecoverer(crash_file) + + def consume_transaction_id_composite(self, block_id, transaction_index): + return (block_id << 32) + transaction_index + + def clear_cache(self): + cache = Cache(self.du_config.fs_cache.directory) + cache.clear() + + def persist_updater_progress(self): + if self.changes is not None: + atomic = ApplicationStrategy.TX == self.application_strategy + apply_changes( + self._db, self.changes, self._pedantic, try_atomic_writes=atomic + ) + self.changes = None + self._time_last_batch = time.time() - self._batch_start_time + + def prepare_database(self): + with LoggerScope.debug(logger, "Preparing database"): + if self._db.transformed.has_delta_updater_v1_tables(): + raise Exception( + "Tables of the delta-updater v1 detected. " + "Please delete new_addresses, dirty_address, " + "delta_updater_state and delta_updater_history " + "before using delta updater v2." + ) + GraphsenseSchemas().ensure_table_exists_by_name( + self._db.transformed, + TABLE_NAME_DELTA_HISTORY, + truncate=False, + ) + + def get_block_data(self, cache, block): + txs = cache.get(("transaction", block), []) + traces = cache.get(("trace", block), []) + logs = cache.get(("log", block), []) + blocks = cache.get(("block", block), []) + return txs, traces, logs, blocks + + def get_fee_data(self, cache, txs): + return cache.get(("fee", txs), [{"fee": None}])[0]["fee"] + + def process_batch_impl_hook(self, batch): + rates = {} + transactions = [] + traces = [] + logs = [] + blocks = [] + bts = {} + """ + Read transaction and exchange rates data + """ + with LoggerScope.debug(logger, "Checking recovery state.") as lg: + if self.crash_recoverer.is_in_recovery_mode(): + """ + If we are in recovery mode we start one block earlier to catch up the delta; + otherwise the update would start at what's in the db + 1. + In case of an error in between blocks this would mean skipping to + the next block. + """ + mb = max(0, min(batch) - 1) + lg.warning( + "Delta update is in crash recovery mode. Crash hint is " + f"{self.crash_recoverer.get_recovery_hint()} in " + f"{self.crash_recoverer.get_recovery_hint_filename()} " + f"restarting at block {mb}." + ) + batch = [mb] + batch + + with LoggerScope.debug(logger, "Reading transaction and rates data") as log: + missing_rates_in_block = False + cache = TableBasedCache(Cache(self.du_config.fs_cache.directory)) + for block in batch: + txs_new, traces_new, logs_new, blocks_new = self.get_block_data( + cache, block + ) + transactions.extend(txs_new) + traces.extend(traces_new) + logs.extend(logs_new) + blocks.extend(blocks_new) + + fiat_values = self._db.transformed.get_exchange_rates_by_block( + block + ).fiat_values + if fiat_values is None: + missing_rates_in_block = True + fiat_values = [0, 0] + rates[block] = fiat_values + bts[block] = self._db.raw.get_block_timestamp(block) + + if missing_rates_in_block: + log.warning("Block Range has missing exchange rates. 
Using Zero.") + + if self.application_strategy == ApplicationStrategy.BATCH: + if self.crash_recoverer.is_in_recovery_mode(): + raise Exception("Batch mode is not allowed in recovery mode.") + + if self.currency == "trx": + trace_adapter = TrxTraceAdapter() + transaction_adapter = TrxTransactionAdapter() + for tx in transactions: + tx["fee"] = self.get_fee_data(cache, tx["tx_hash"]) + + elif self.currency == "eth": + trace_adapter = EthTraceAdapter() + transaction_adapter = AccountTransactionAdapter() + + # convert dictionaries to dataclasses and unify naming + log_adapter = AccountLogAdapter() + block_adapter = AccountBlockAdapter() + traces = trace_adapter.dicts_to_renamed_dataclasses(traces) + traces = trace_adapter.process_fields_in_list(traces) + transactions = transaction_adapter.dicts_to_dataclasses(transactions) + logs = log_adapter.dicts_to_dataclasses(logs) + blocks = block_adapter.dicts_to_dataclasses(blocks) + + changes = [] + if 0 in [len(blocks), len(transactions)]: + logger.debug("No blocks to process. Might not be in the cache.") + return # no blocks to process + + (tx_changes, nr_new_addresses, nr_new_address_relations) = self.get_changes( + transactions, traces, logs, blocks, rates + ) + + changes.extend(tx_changes) + last_block_processed = batch[-1] + + if self.currency == "trx": + nr_new_tx = len([tx for tx in transactions if tx.receipt_status == 1]) + else: + nr_new_tx = len(transactions) + + runtime_seconds = int(time.time() - self.batch_start_time) + + bookkeeping_changes = get_bookkeeping_changes( + self._statistics, + self._db.transformed.get_summary_statistics(), + last_block_processed, + nr_new_address_relations, + nr_new_addresses, + nr_new_tx, + self.highest_address_id, + runtime_seconds, + bts, + len(batch), + patch_mode=self._patch_mode, + ) + + changes.extend(bookkeeping_changes) + + # Store changes to be written + # They are applied at the end of the batch in + # persist_updater_progress + self.changes = changes + + else: + raise ValueError( + f"Unknown application strategy {self.application_strategy}" + ) + + def get_changes( + self, + transactions: List[Transaction], + traces: List[Trace], + logs: List[Log], + blocks: List[Block], + rates: Dict[int, List], + ) -> Tuple[List[DbChange], int, int]: + currency = self.currency.upper() + id_bucket_size = self._db.transformed.get_address_id_bucket_size() + block_bucket_size = self._db.transformed.get_block_id_bucket_size() + block_bucket_size_address_txs = ( + self._db.transformed.get_address_transactions_id_bucket_size() + ) + relations_nbuckets = self._db.transformed.get_addressrelations_ids_nbuckets() + tdb = self._db.transformed + + def get_next_address_ids_with_aliases(address: str): + return ( + self.consume_address_id() + if address not in PSEUDO_ADDRESS_AND_IDS + else PSEUDO_ADDRESS_AND_IDS[address] + ) + + def get_tx_prefix(tx_hash): + tx_hash = tdb.to_db_tx_hash(tx_hash) + return (tx_hash.db_encoding, tx_hash.prefix) + + def get_address_prefix(address_str): + address = tdb.to_db_address(address_str) + return (address.db_encoding, address.prefix) + + if currency == "TRX": + transactions = [tx for tx in transactions if tx.to_address is not None] + transactions = [tx for tx in transactions if tx.receipt_status == 1] + + elif currency == "ETH": + pass + else: + raise ValueError(f"Unknown currency {currency}") + + hash_to_id = { + tx.tx_hash: self.consume_transaction_id_composite( + tx.block_id, tx.transaction_index + ) + for tx in transactions + } + + tx_hashes = [tx.tx_hash for tx in transactions] + 
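# Worked example, illustration only: the composite transaction id produced by
# consume_transaction_id_composite above packs the block id into the high bits and the
# transaction index into the low 32 bits, assuming the index always fits into 32 bits.
# The helper name split_composite_tx_id is illustrative, not part of the codebase.


def split_composite_tx_id(tx_id: int):
    """Inverse of (block_id << 32) + transaction_index."""
    return tx_id >> 32, tx_id & 0xFFFFFFFF


tx_id = (17_000_000 << 32) + 5  # block 17,000,000, transaction index 5
assert split_composite_tx_id(tx_id) == (17_000_000, 5)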
reward_traces = [t for t in traces if t.tx_hash is None] + # traces without reward traces: + traces = [trace for trace in traces if trace.tx_hash is not None] + # calculate successful traces: + traces_s = [trace for trace in traces if trace.status == 1] + + hash_to_tx = dict(zip(tx_hashes, transactions)) + + with LoggerScope.debug(logger, "Decode logs to token transfers"): + tokendecoder = ERC20Decoder(self.currency) + token_transfers = no_nones( + [tokendecoder.log_to_transfer(log) for log in logs] + ) + + with LoggerScope.debug(logger, "Compute unique addresses in correct order"): + if currency == "TRX": + transactions_for_addresses = transactions + elif currency == "ETH": + transactions_for_addresses = [] + else: + raise ValueError(f"Unknown currency {currency}") + + addresses = get_sorted_unique_addresses( + traces_s, reward_traces, token_transfers, transactions_for_addresses + ) + len_addr = len(addresses) + + with LoggerScope.debug( + logger, f"Checking existence for {len_addr} addresses" + ) as _: + addr_ids = dict(tdb.get_address_id_async_batch(list(addresses))) + + with LoggerScope.debug(logger, "Reading addresses to be updated"): + existing_addr_ids = no_nones( + [address_id.result_or_exc.one() for adr, address_id in addr_ids.items()] + ) + + global addresses_resolved + addresses_resolved = dict( + tdb.get_address_async_batch( + [adr.address_id for adr in existing_addr_ids] + ) + ) + + def get_resolved_address(addr_id_exc): + addr_id = addr_id_exc.result_or_exc.one() + return ( + (None, None) + if addr_id is None + else ( + addr_id.address_id, + addresses_resolved[ + addr_id.address_id + ].result_or_exc.one(), # noqa + ) + ) + + addresses_to_id__rows = { + adr: get_resolved_address(address_id) + for adr, address_id in addr_ids.items() + } + + del addresses_resolved + + bytes_to_row_address = { + address: row[1] for address, row in addresses_to_id__rows.items() + } + for addr in addresses: + addr_id, address = addresses_to_id__rows[addr] + if addr_id is None: + new_addr_id = get_next_address_ids_with_aliases(addr) + addresses_to_id__rows[addr] = (new_addr_id, None) + + address_hash_to_id = { + address: id_row[0] for address, id_row in addresses_to_id__rows.items() + } + + with LoggerScope.debug(logger, "Get transactions to insert into the database"): + txs_to_insert = [] + + for tx_hash in tx_hashes: + tx_id = hash_to_id[tx_hash] + tx_index = hash_to_tx[tx_hash].transaction_index + block_id = hash_to_tx[tx_hash].block_id + failed = hash_to_tx[tx_hash].receipt_status == 0 + txs_to_insert.append( + Tx( + block_id=block_id, + tx_id=tx_id, + tx_hash=tx_hash, + tx_index=tx_index, + failed=failed, + ) + ) + + with LoggerScope.debug( + logger, "Get entity transaction updates - traces and tokens" + ): + entity_transactions = [] + entity_deltas = [] + + if currency == "TRX": + traces_s_filtered = only_call_traces(traces_s) # successful and call + elif currency == "ETH": + traces_s_filtered = traces_s + else: + raise ValueError(f"Unknown currency {currency}") + + entity_transactions += get_entity_transaction_updates_trace_token( + traces_s_filtered, + token_transfers, + hash_to_id, + address_hash_to_id, + rates, + ) + + with LoggerScope.debug(logger, "Get entity updates - traces and tokens"): + entity_deltas += get_entity_updates_trace_token( + traces_s_filtered, + token_transfers, + reward_traces, + hash_to_id, + currency, + rates, + ) + + with LoggerScope.debug(logger, "Get relation updates - traces and tokens"): + relation_updates_trace = [ + relationdelta_from_trace(trace, rates, 
currency) + for trace in traces_s_filtered + ] + relation_updates_tokens = [ + relationdelta_from_tokentransfer(tt, rates) for tt in token_transfers + ] + relation_updates = relation_updates_trace + relation_updates_tokens + + with LoggerScope.debug( + logger, + "Get entity and entity transaction " + "updates from tranasactions (only tron)", + ): + # in eth we disregard the eth values because they are already in the traces + # in tron only traces that are not the initial transaction have values, + # so we still need to add the value from the transaction + if currency == "TRX": + entity_transactions_tx = get_entity_transactions_updates_tx( + transactions, hash_to_id, address_hash_to_id + ) + + entity_deltas_tx = get_entity_updates_tx( + transactions, + hash_to_id, + currency, + rates, + ) + + entity_deltas += entity_deltas_tx + + relation_updates_tx = [ + relationdelta_from_transaction(tx, rates, currency) + for tx in transactions + if tx.from_address is not None + ] + + entity_transactions += entity_transactions_tx + relation_updates += relation_updates_tx + + elif currency == "ETH": + relation_updates_tx = [] + else: + raise ValueError(f"Unknown currency {currency}") + + with LoggerScope.debug(logger, "Get balance updates"): + """Get balance updates""" + balance_updates = get_balance_deltas( + relation_updates_trace, + relation_updates_tx, + relation_updates_tokens, + reward_traces, + transactions, + blocks, + address_hash_to_id, + currency, + ) + + with LoggerScope.debug(logger, "Create dbdelta and compress"): + """Combine all updates except the pure inserts into a delta object""" + dbdelta = DbDeltaAccount( + entity_deltas, entity_transactions, relation_updates, balance_updates + ) + """ Group and merge deltas before merge with db deltas """ + dbdelta = dbdelta.compress() + + with LoggerScope.debug(logger, "Query data from database"): + # Query outrelations + rel_to_query = [ + ( + addresses_to_id__rows[update.src_identifier][0], + addresses_to_id__rows[update.dst_identifier][0], + ) + for update in dbdelta.relation_updates + ] + addr_outrelations_q = tdb.get_address_outrelations_async_batch_account( + rel_to_query + ) + addr_outrelations = { + (update.src_identifier, update.dst_identifier): qr + for update, qr in zip(dbdelta.relation_updates, addr_outrelations_q) + } + + # Query inrelations + rel_to_query = [ + ( + addresses_to_id__rows[update.dst_identifier][0], + addresses_to_id__rows[update.src_identifier][0], + ) + for update in dbdelta.relation_updates + ] + addr_inrelations_q = tdb.get_address_inrelations_async_batch_account( + rel_to_query + ) + addr_inrelations = { + (update.src_identifier, update.dst_identifier): qr + for update, qr in zip(dbdelta.relation_updates, addr_inrelations_q) + } + + # Query balances of addresses + address_ids = [addresses_to_id__rows[address][0] for address in addresses] + addr_balances_q = ( + tdb.get_balance_async_batch_account( # could probably query less + address_ids + ) + ) + addr_balances = { + addr_id: BalanceDelta.from_db(addr_id, qr.result_or_exc.all()) + for addr_id, qr in zip(address_ids, addr_balances_q) + } + + with LoggerScope.debug(logger, "Prepare changes"): + changes = [] + + """ Inserts of new transactions """ + changes += prepare_txs_for_ingest( + txs_to_insert, + id_bucket_size, + block_bucket_size, + get_tx_prefix, + ) + + """ Merging max secondary ID """ + changes += self.prepare_and_query_max_secondary_id( + dbdelta.relation_updates, + dbdelta.new_entity_txs, + id_bucket_size, + address_hash_to_id, + ) + + """ Merging entity 
transactions """ + changes += prepare_entity_txs_for_ingest( + dbdelta.new_entity_txs, + id_bucket_size, + currency, + block_bucket_size_address_txs, + ) + + """ Merging balances""" + changes += prepare_balances_for_ingest( + dbdelta.balance_updates, id_bucket_size, addr_balances + ) + + """ Merging relations deltas """ + ( + changes_relations, + new_rels_in, + new_rels_out, + ) = prepare_relations_for_ingest( + dbdelta.relation_updates, + address_hash_to_id, + addr_inrelations, + addr_outrelations, + id_bucket_size, + relations_nbuckets, + ) + changes += changes_relations + + """ Merging entity deltas """ + entity_changes, nr_new_entities = prepare_entities_for_ingest( + dbdelta.entity_updates, + address_hash_to_id, + bytes_to_row_address, + new_rels_in, + new_rels_out, + id_bucket_size, + get_address_prefix, + ) + changes += entity_changes + + assert sum(new_rels_in.values()) == sum(new_rels_out.values()) + nr_new_rels = sum(new_rels_in.values()) + nr_new_entities_created = nr_new_entities + + return ( + changes, + nr_new_entities_created, + nr_new_rels, + ) + + def prepare_and_query_max_secondary_id( + self, + relation_updates: List[RelationDeltaAccount], + new_entity_txs: List[RawEntityTxAccount], + id_bucket_size: int, + address_hash_to_id: Dict[bytes, int], + ): + relations_nbuckets = self._db.transformed.get_addressrelations_ids_nbuckets() + + def max_secondary_dict_from_db(df, id_group_col, grp_col): + # query max secondary ids from database + unique_address_id_groups = list(df[grp_col]) + max_secondary_atx = self._db.transformed.get_max_secondary_ids_async( + unique_address_id_groups, tablename, id_group_col + ) + max_secondary_atx = [qr.result_or_exc.one() for qr in max_secondary_atx] + # use placeholder -1 if there is nothing in the database yet. 
+ # Will be consumed by max + # and not 0, or otherwise the logic will say it shouldnt update + max_secondary_atx = [ + res[1] if res is not None else -1 for res in max_secondary_atx + ] + max_secondary_atx = dict(zip(unique_address_id_groups, max_secondary_atx)) + return max_secondary_atx + + def get_max_secondary_changes(data, tablename, grp_col, sec_col): + max_col = "max_secondary_id" + df = pd.DataFrame(data, columns=[grp_col, sec_col]) + df = df.groupby(grp_col).max() + df = df.reset_index() + df = df.rename(columns={sec_col: max_col}) + max_secondary_atx = max_secondary_dict_from_db(df, grp_col, grp_col) + df[max_col + "old"] = df[grp_col].map(max_secondary_atx) + df[max_col] = df[[max_col, max_col + "old"]].max(axis=1) + # convert to Db changes + + changes = [ + DbChange.update( + table=tablename, + data={ + grp_col: row[grp_col], + "max_secondary_id": row[max_col], + }, + ) + for _, row in df.iterrows() + if row[max_col] != row[max_col + "old"] + ] + return changes + + block_bucket_size_address_txs = ( + self._db.transformed.get_address_transactions_id_bucket_size() + ) + """ secondary group id for address transactions and address in/out relations""" + tablename = "address_transactions_secondary_ids" + grp_col, sec_col = "address_id_group", "address_id_secondary_group" + secondary_group_data = [ + get_id_group_with_secondary_addresstransactions( + tx.identifier, + id_bucket_size, + tx.block_id, + block_bucket_size_address_txs, + ) + for tx in new_entity_txs + ] + changes_secondary_atx = get_max_secondary_changes( + secondary_group_data, tablename, grp_col, sec_col + ) + + tablename = "address_outgoing_relations_secondary_ids" + grp_col, sec_col = "src_address_id_group", "src_address_id_secondary_group" + + secondary_group_data = [ + get_id_group_with_secondary_relations( + address_hash_to_id[tx.src_identifier], + address_hash_to_id[tx.dst_identifier], + id_bucket_size, + relations_nbuckets, + ) + for tx in relation_updates + ] + changes_secondary_aor = get_max_secondary_changes( + secondary_group_data, tablename, grp_col, sec_col + ) + + tablename = "address_incoming_relations_secondary_ids" + grp_col, sec_col = "dst_address_id_group", "dst_address_id_secondary_group" + + secondary_group_data = [ + get_id_group_with_secondary_relations( + address_hash_to_id[tx.dst_identifier], + address_hash_to_id[tx.src_identifier], + id_bucket_size, + relations_nbuckets, + ) + for tx in relation_updates + ] + + changes_secondary_air = get_max_secondary_changes( + secondary_group_data, tablename, grp_col, sec_col + ) + + changes = [] + changes += changes_secondary_atx + changes += changes_secondary_aor + changes += changes_secondary_air + return changes diff --git a/src/graphsenselib/deltaupdate/update/factory.py b/src/graphsenselib/deltaupdate/update/factory.py index cb32b75..384ddf0 100644 --- a/src/graphsenselib/deltaupdate/update/factory.py +++ b/src/graphsenselib/deltaupdate/update/factory.py @@ -1,16 +1,21 @@ +from graphsenselib.deltaupdate.update.account.accountlegacy import ( + UpdateStrategyAccountLegacy, +) +from graphsenselib.deltaupdate.update.generic import ApplicationStrategy +from graphsenselib.deltaupdate.update.utxo.utxolegacy import UpdateStrategyUtxoLegacy + from ...config import currency_to_schema_type +from ...config.config import DeltaUpdaterConfig from ...db import AnalyticsDb from .abstractupdater import AbstractUpdateStrategy from .account import UpdateStrategyAccount -from .generic import ApplicationStrategy from .utxo import UpdateStrategyUtxo -from .utxolegacy import 
UpdateStrategyUtxoLegacy class UpdaterFactory: def get_updater( self, - currency: str, + du_config: DeltaUpdaterConfig, db: AnalyticsDb, version: int, write_new: bool, @@ -20,6 +25,7 @@ def get_updater( patch_mode: bool, forward_fill_rates: bool = False, ) -> AbstractUpdateStrategy: + currency = du_config.currency schema_type = currency_to_schema_type[currency] if schema_type == "utxo" and version == 1: return UpdateStrategyUtxoLegacy(db, currency, write_new, write_dirty) @@ -36,12 +42,22 @@ def get_updater( forward_fill_rates=forward_fill_rates, ) if (schema_type == "account" or schema_type == "account_trx") and version == 1: - return UpdateStrategyAccount( + return UpdateStrategyAccountLegacy( db, currency, write_new, write_dirty, forward_fill_rates=forward_fill_rates, ) + if (schema_type == "account" or schema_type == "account_trx") and version == 2: + app_strat = ApplicationStrategy.BATCH + return UpdateStrategyAccount( + db, + du_config, + pedantic, + app_strat, + patch_mode, + forward_fill_rates=forward_fill_rates, + ) else: raise Exception(f"Unsupported schema type {schema_type} or {version}") diff --git a/src/graphsenselib/deltaupdate/update/generic.py b/src/graphsenselib/deltaupdate/update/generic.py index f2c3a7c..7fd1861 100644 --- a/src/graphsenselib/deltaupdate/update/generic.py +++ b/src/graphsenselib/deltaupdate/update/generic.py @@ -11,6 +11,7 @@ from ...datatypes import EntityType from ...db import DbChange from ...utils import group_by, groupby_property +from ...utils.account import get_id_group class ApplicationStrategy(Enum): @@ -27,6 +28,14 @@ def merge(self, other_delta): pass +@dataclass +class DeltaScalar(DeltaUpdate): + value: int + + def merge(self, other): + return DeltaScalar(self.value + other.value) + + @dataclass class DeltaValue(DeltaUpdate): value: int @@ -37,6 +46,8 @@ def from_db(Cls, db_row): return Cls(value=db_row.value, fiat_values=list(db_row.fiat_values)) def merge(self, other): + if other is None: + return self assert self.fiat_values is not None and other.fiat_values is not None assert len(self.fiat_values) == len(other.fiat_values) return DeltaValue( @@ -45,9 +56,25 @@ def merge(self, other): ) +def merge_asset_dicts(d1, d2): # probably better to wrap in class and define __add__ + d = {} + for k in set(d1.keys()) | set(d2.keys()): + d[k] = d1.get(k, DeltaValue(0, [0, 0])).merge(d2.get(k, DeltaValue(0, [0, 0]))) + return d + + +@dataclass +class Tx: + block_id: int + tx_id: int + tx_hash: bytes + tx_index: int + failed: bool + + @dataclass class EntityDelta(DeltaUpdate): - """The identifier is either an address of cluster identifier""" + """The identifier is either an address or cluster identifier""" identifier: Union[str, int] total_received: DeltaValue @@ -60,11 +87,11 @@ class EntityDelta(DeltaUpdate): @classmethod def from_db(Cls, db_row, mode: EntityType): if mode == EntityType.CLUSTER: - idetifier = db_row.cluster_id + identifier = db_row.cluster_id elif mode == EntityType.ADDRESS: - idetifier = db_row.address + identifier = db_row.address return Cls( - identifier=idetifier, + identifier=identifier, total_received=DeltaValue.from_db(db_row.total_received), total_spent=DeltaValue.from_db(db_row.total_spent), first_tx_id=db_row.first_tx_id, @@ -86,6 +113,20 @@ def merge(self, other_delta): ) +def minusone_respecting_function(x, y, f): + """ + -1 is a placeholder for first and last tx id in reward traces + which dont have a tx_id + """ + if x == -1 and y == -1: + return -1 + if x == -1: + return y + if y == -1: + return x + return f(x, y) + + 
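# Usage sketch, illustration only, for the merge helpers added above: merge_asset_dicts
# takes the union of two per-asset dicts and merges the DeltaValues of shared assets,
# while minusone_respecting_function treats -1 (the placeholder used for reward traces
# that carry no tx id) as "no value yet" when combining first/last tx ids.
# The variable names below are illustrative; the imports mirror those in modelsdelta.py.

from graphsenselib.deltaupdate.update.generic import (
    DeltaValue,
    merge_asset_dicts,
    minusone_respecting_function,
)

a = {"USDT": DeltaValue(5, [5, 4])}
b = {"USDT": DeltaValue(1, [1, 1]), "WETH": DeltaValue(2, [0, 0])}
merged = merge_asset_dicts(a, b)
assert merged["USDT"].value == 6  # values of shared assets are summed
assert merged["WETH"].value == 2  # assets present on only one side are kept

assert minusone_respecting_function(-1, 7, min) == 7  # -1 never wins
assert minusone_respecting_function(3, 7, min) == 3
assert minusone_respecting_function(-1, -1, max) == -1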
@dataclass class RawEntityTx: identifier: Union[str, int] @@ -201,7 +242,7 @@ def prepare_txs_for_ingest( chng = DbChange.new( table=f"{mode}_transactions", data={ - f"{mode}_id_group": ident // id_bucket_size, + f"{mode}_id_group": get_id_group(ident, id_bucket_size), f"{mode}_id": ident, "tx_id": atx.tx_id, "is_outgoing": atx.is_outgoing, @@ -232,7 +273,7 @@ def prepare_entities_for_ingest( resolve_identifier(update.identifier), resolve_entity(update.identifier), ) - group = int_ident // id_bucket_size + group = get_id_group(int_ident, id_bucket_size) if entity is not None: """old Address/cluster""" @@ -347,8 +388,8 @@ def prepare_relations_for_ingest( id_src = resolve_identifier(relations_update.src_identifier) id_dst = resolve_identifier(relations_update.dst_identifier) - src_group = id_src // id_bucket_size - dst_group = id_dst // id_bucket_size + src_group = get_id_group(id_src, id_bucket_size) + dst_group = get_id_group(id_dst, id_bucket_size) if outr is None: """new address/cluster relation to insert""" diff --git a/src/graphsenselib/deltaupdate/update/resources/__init__.py b/src/graphsenselib/deltaupdate/update/resources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/graphsenselib/deltaupdate/update/resources/supported_tokens_eth.py b/src/graphsenselib/deltaupdate/update/resources/supported_tokens_eth.py new file mode 100644 index 0000000..90fd809 --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/resources/supported_tokens_eth.py @@ -0,0 +1,12 @@ +from io import StringIO + +import pandas as pd + +data = """ +asset,assettype,decimals,address,coin_equivalent,usd_equivalent +USDT,ERC20,6,0xdac17f958d2ee523a2206206994597c13d831ec7,0,1 +USDC,ERC20,6,0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48,0,1 +WETH,ERC20,18,0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2,1,0 +""" + +SUPPORTED_TOKENS = pd.read_csv(StringIO(data)) diff --git a/src/graphsenselib/deltaupdate/update/resources/supported_tokens_trx.py b/src/graphsenselib/deltaupdate/update/resources/supported_tokens_trx.py new file mode 100644 index 0000000..9e4b121 --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/resources/supported_tokens_trx.py @@ -0,0 +1,10 @@ +from io import StringIO + +import pandas as pd + +data = """asset,assettype,decimals,address,coin_equivalent,usd_equivalent +USDT,TRC20,6,0xa614f803b6fd780986a42c78ec9c7f77e6ded13c,0,1 +USDC,TRC20,6,0x3487b63d30b5b2c87fb7ffa8bcfade38eaac1abe,0,1 +WTRX,TRC20,6,0x891cdb91d149f23b1a45d9c5ca78a88d0cb44c18,1,0 +""" +SUPPORTED_TOKENS = pd.read_csv(StringIO(data)) diff --git a/src/graphsenselib/deltaupdate/update/utxo/__init__.py b/src/graphsenselib/deltaupdate/update/utxo/__init__.py new file mode 100644 index 0000000..e91306b --- /dev/null +++ b/src/graphsenselib/deltaupdate/update/utxo/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa: F401 +from .update import UpdateStrategyUtxo diff --git a/src/graphsenselib/deltaupdate/update/utxo.py b/src/graphsenselib/deltaupdate/update/utxo/update.py similarity index 98% rename from src/graphsenselib/deltaupdate/update/utxo.py rename to src/graphsenselib/deltaupdate/update/utxo/update.py index 46790dd..1794b5e 100644 --- a/src/graphsenselib/deltaupdate/update/utxo.py +++ b/src/graphsenselib/deltaupdate/update/utxo/update.py @@ -5,24 +5,13 @@ from cassandra import InvalidRequest -from ...datatypes import DbChangeType, EntityType -from ...db import AnalyticsDb, DbChange -from ...rates import convert_to_fiat -from ...utils import DataObject as MutableNamedTuple -from ...utils import group_by, no_nones -from 
...utils.errorhandling import CrashRecoverer -from ...utils.logging import LoggerScope -from ...utils.utxo import ( - get_regflow, - get_total_input_sum, - get_unique_addresses_from_transaction, - get_unique_addresses_from_transactions, - get_unique_ordered_input_addresses_from_transactions, - get_unique_ordered_output_addresses_from_transactions, - regularize_inoutputs, +from graphsenselib.datatypes import DbChangeType, EntityType +from graphsenselib.db import AnalyticsDb, DbChange +from graphsenselib.deltaupdate.update.abstractupdater import ( + TABLE_NAME_DELTA_HISTORY, + UpdateStrategy, ) -from .abstractupdater import TABLE_NAME_DELTA_HISTORY, UpdateStrategy -from .generic import ( +from graphsenselib.deltaupdate.update.generic import ( ApplicationStrategy, DbDelta, DeltaValue, @@ -33,6 +22,20 @@ prepare_relations_for_ingest, prepare_txs_for_ingest, ) +from graphsenselib.rates import convert_to_fiat +from graphsenselib.utils import DataObject as MutableNamedTuple +from graphsenselib.utils import group_by, no_nones +from graphsenselib.utils.errorhandling import CrashRecoverer +from graphsenselib.utils.logging import LoggerScope +from graphsenselib.utils.utxo import ( + get_regflow, + get_total_input_sum, + get_unique_addresses_from_transaction, + get_unique_addresses_from_transactions, + get_unique_ordered_input_addresses_from_transactions, + get_unique_ordered_output_addresses_from_transactions, + regularize_inoutputs, +) logger = logging.getLogger(__name__) @@ -1194,6 +1197,9 @@ def __init__( logger.info(f"Updater running in {application_strategy} mode.") self.crash_recoverer = CrashRecoverer(crash_file) + def clear_cache(self): + pass + def persist_updater_progress(self): if self.changes is not None: atomic = ApplicationStrategy.TX == self.application_strategy diff --git a/src/graphsenselib/deltaupdate/update/utxolegacy.py b/src/graphsenselib/deltaupdate/update/utxo/utxolegacy.py similarity index 98% rename from src/graphsenselib/deltaupdate/update/utxolegacy.py rename to src/graphsenselib/deltaupdate/update/utxo/utxolegacy.py index 469d95a..e18a35e 100644 --- a/src/graphsenselib/deltaupdate/update/utxolegacy.py +++ b/src/graphsenselib/deltaupdate/update/utxo/utxolegacy.py @@ -1,6 +1,6 @@ import logging -from .abstractupdater import ( +from graphsenselib.deltaupdate.update.abstractupdater import ( TABLE_NAME_DIRTY, TABLE_NAME_NEW, LegacyUpdateStrategy, diff --git a/src/graphsenselib/ingest/account.py b/src/graphsenselib/ingest/account.py index f1b4b81..2105431 100644 --- a/src/graphsenselib/ingest/account.py +++ b/src/graphsenselib/ingest/account.py @@ -9,6 +9,7 @@ from typing import Dict, Iterable, List, Optional, Tuple import grpc +from diskcache import Cache from ethereumetl.jobs.export_blocks_job import ExportBlocksJob from ethereumetl.jobs.export_receipts_job import ExportReceiptsJob from ethereumetl.jobs.export_traces_job import ( @@ -31,10 +32,12 @@ batch, check_timestamp, first_or_default, - hex_to_bytearray, + hex_to_bytes, parse_timestamp, remove_prefix, ) +from ..utils.account import get_id_group +from ..utils.cache import TableBasedCache from ..utils.logging import configure_logging, suppress_log_level from ..utils.signals import graceful_ctlc_shutdown from ..utils.tron import evm_to_bytes, strip_tron_prefix @@ -89,6 +92,7 @@ def export_item(self, item) -> None: item_type = item.get("type", None) if item_type is None: raise ValueError(f"type key is not found in item {item}") + self.items[item_type].append(item) def close(self) -> None: @@ -103,6 +107,11 @@ class 
AccountStreamerAdapter: """Standard Ethereum API style streaming adapter to export blocks, transactions, receipts, logs and traces.""" + # TODO: the adapter setup is sub optimal, + # currently we create a Exporter*Job per call/batch + # which in turn spins up a new thread pool for every call + # which is not efficient and unnecessary overhead + def __init__( self, batch_web3_provider: ThreadLocalProxy, @@ -228,6 +237,7 @@ def export_traces( grpc_endpoint=self.grpc_endpoint, max_workers=self.max_workers, ) + return job.run() # traces = exporter.get_items("trace") # return traces @@ -340,7 +350,7 @@ def prepare_logs_inplace(items: Iterable, block_bucket_size: int): # rename/add columns item["tx_hash"] = item.pop("transaction_hash") item["block_id"] = item.pop("block_number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) # Used for partitioning in parquet files # ignored otherwise @@ -357,7 +367,7 @@ def prepare_logs_inplace(items: Iterable, block_bucket_size: int): # key columns in cassandra and can not be filtered item["topic0"] = tpcs[0] if len(tpcs) > 0 else "0x" - item["topics"] = [hex_to_bytearray(t) for t in tpcs] + item["topics"] = [hex_to_bytes(t) for t in tpcs] # if topics contain duplicates if ( @@ -376,7 +386,7 @@ def prepare_logs_inplace(items: Iterable, block_bucket_size: int): item.pop("transaction_hash") for elem in blob_colums: - item[elem] = hex_to_bytearray(item[elem]) + item[elem] = hex_to_bytes(item[elem]) def ingest_logs( @@ -411,7 +421,7 @@ def prepare_blocks_inplace_eth( item.pop("type") # rename/add columns item["block_id"] = item.pop("number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) item["block_hash"] = item.pop("hash") # Used for partitioning in parquet files @@ -420,7 +430,7 @@ def prepare_blocks_inplace_eth( # convert hex strings to byte arrays (blob in Cassandra) for elem in blob_colums: - item[elem] = hex_to_bytearray(item[elem]) + item[elem] = hex_to_bytes(item[elem]) def prepare_blocks_inplace_trx(items, block_bucket_size): @@ -451,7 +461,7 @@ def prepare_transactions_inplace_eth( hash_slice = slice(2, 2 + tx_hash_prefix_len) item["tx_hash_prefix"] = item["tx_hash"][hash_slice] item["block_id"] = item.pop("block_number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) # Used for partitioning in parquet files # ignored otherwise @@ -459,7 +469,7 @@ def prepare_transactions_inplace_eth( # convert hex strings to byte arrays (blob in Cassandra) for elem in blob_colums: - item[elem] = hex_to_bytearray(item[elem]) + item[elem] = hex_to_bytes(item[elem]) def prepare_transactions_inplace_trx( @@ -481,7 +491,7 @@ def prepare_traces_inplace_eth(items: Iterable, block_bucket_size: int): # rename/add columns item["tx_hash"] = item.pop("transaction_hash") item["block_id"] = item.pop("block_number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) # Used for partitioning in parquet files # ignored otherwise @@ -494,7 +504,7 @@ def prepare_traces_inplace_eth(items: Iterable, block_bucket_size: int): ) # convert hex strings to byte arrays (blob in Cassandra) for elem in blob_colums: - item[elem] = hex_to_bytearray(item[elem]) + item[elem] = hex_to_bytes(item[elem]) def 
prepare_traces_inplace_trx(items: Iterable, block_bucket_size: int): @@ -503,7 +513,7 @@ def prepare_traces_inplace_trx(items: Iterable, block_bucket_size: int): # rename/add columns item["tx_hash"] = item.pop("transaction_hash") item["block_id"] = item.pop("block_number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) item["transferto_address"] = item.pop("transferTo_address") # Used for partitioning in parquet files @@ -997,7 +1007,7 @@ def ingest_async( ) # make sure that only supported sinks are selected. - if not all((x in ["cassandra", "parquet"]) for x in sink_config.keys()): + if not all((x in ["cassandra", "parquet", "fs-cache"]) for x in sink_config.keys()): raise BadUserInputError( "Unsupported sink selected, supported: cassandra," f" parquet; got {list(sink_config.keys())}" @@ -1089,6 +1099,12 @@ def initializer_worker(thrd_ctx, db, sink_config, strategy, loglevel): new_db_conn = db.clone() new_db_conn.open() thrd_ctx.db = new_db_conn + + if "fs-cache" in sink_config: + odirectory = sink_config["fs-cache"]["output_directory"] + sink_config["fs-cache"]["cache"] = TableBasedCache( + Cache(odirectory, eviction_policy="none") + ) thrd_ctx.adapter = strategy.get_source_adapter() thrd_ctx.strategy = strategy thrd_ctx.sink_config = sink_config @@ -1096,7 +1112,6 @@ def initializer_worker(thrd_ctx, db, sink_config, strategy, loglevel): thrd_ctx.BLOCK_BUCKET_SIZE = BLOCK_BUCKET_SIZE def process_task(thrd_ctx, task, data): - # print(task, data) return task.run(thrd_ctx, data) def submit_tasks(ex, thrd_ctx, tasks, data=None): @@ -1120,7 +1135,7 @@ def submit_tasks(ex, thrd_ctx, tasks, data=None): transform_strategy, logger.getEffectiveLevel(), ), - max_workers=15, # we write at most 4 tables in parallel + max_workers=4, # we write at most 4 tables in parallel ) as ex: time1 = datetime.now() count = 0 diff --git a/src/graphsenselib/ingest/cli.py b/src/graphsenselib/ingest/cli.py index 816c775..af731ba 100644 --- a/src/graphsenselib/ingest/cli.py +++ b/src/graphsenselib/ingest/cli.py @@ -1,5 +1,6 @@ import logging import sys +from typing import Dict import click @@ -7,13 +8,41 @@ from ..config import config, currency_to_schema_type from ..db import DbFactory from ..schema import GraphsenseSchemas +from ..utils import subkey_get from .common import INGEST_SINKS from .factory import IngestFactory -from .parquet import SCHEMA_MAPPING +from .parquet import SCHEMA_MAPPING as PARQUET_SCHEMA_MAPPING logger = logging.getLogger(__name__) +def create_sink_config(sink: str, network: str, ks_config: Dict): + schema_type = currency_to_schema_type[network] + sink_config = ks_config.ingest_config.dict().get("raw_keyspace_file_sinks", None) + if (sink == "parquet" and schema_type.startswith("account")) or sink == "fs-cache": + file_sink_dir = subkey_get(sink_config, f"{sink}.directory".split(".")) + if file_sink_dir is None: + logger.warning( + f"No {sink} file output directory " + f"({sink}.directory) is configured for {network}. " + "Ignoring sink." 
+ ) + return None + + sc = {"output_directory": file_sink_dir} + + if sink == "parquet": + sc["schema"] = PARQUET_SCHEMA_MAPPING[schema_type] + if sink == "fs-cache": + sc["ignore_tables"] = ["trc10", "configuration"] + if network == "trx": + sc["key_by"] = {"fee": "tx_hash", "default": "block_id"} + + return sc + else: + return {} + + @click.group() def ingest_cli(): pass @@ -126,9 +155,6 @@ def ingest( """ ks_config = config.get_keyspace_config(env, currency) sources = ks_config.ingest_config.all_node_references - parquet_file_sink_config = ks_config.ingest_config.raw_keyspace_file_sinks.get( - "parquet", None - ) if ( ( @@ -144,34 +170,19 @@ def ingest( ) sys.exit(11) - parquet_file_sink = ( - parquet_file_sink_config.directory - if parquet_file_sink_config is not None - else None - ) - if create_schema: GraphsenseSchemas().create_keyspace_if_not_exist( env, currency, keyspace_type="raw" ) - def create_sink_config(sink, currency): - schema_type = currency_to_schema_type[currency] - return ( - { - "output_directory": parquet_file_sink, - "schema": SCHEMA_MAPPING[schema_type], - } - if sink == "parquet" and schema_type == "account" - else {} - ) + sink_configs = [(k, create_sink_config(k, currency, ks_config)) for k in sinks] with DbFactory().from_config(env, currency) as db: IngestFactory().from_config(env, currency, version).ingest( db=db, currency=currency, sources=sources, - sink_config={k: create_sink_config(k, currency) for k in sinks}, + sink_config={k: v for k, v in sink_configs if v is not None}, user_start_block=start_block, user_end_block=end_block, batch_size=batch_size, diff --git a/src/graphsenselib/ingest/common.py b/src/graphsenselib/ingest/common.py index 01ec28a..3f3f3e5 100644 --- a/src/graphsenselib/ingest/common.py +++ b/src/graphsenselib/ingest/common.py @@ -4,7 +4,7 @@ from ..db import AnalyticsDb from .parquet import write_parquet -INGEST_SINKS = ["parquet", "cassandra"] +INGEST_SINKS = ["parquet", "cassandra", "fs-cache"] CASSANDRA_INGEST_DEFAULT_CONCURRENCY = 100 @@ -37,6 +37,18 @@ def write_to_sinks( "in the keyspace config." ) write_parquet(path, table_name, parameters, schema_table) + elif sink == "fs-cache": + c = config.get("cache", None) + kc = config.get("key_by", {"default": "block_id"}) + key = kc.get(table_name, None) or kc["default"] + ignore_tables = config.get("ignore_tables", []) + if table_name in ignore_tables: + return + + if c is None: + raise Exception("Cache not set. 
Error.") + + c.put_items_keyed_by(table_name, parameters, key=key) else: logger.warning(f"Encountered unknown sink type {sink}, ignoring.") diff --git a/src/graphsenselib/ingest/csv.py b/src/graphsenselib/ingest/csv.py index 7cc2bc5..4542abc 100644 --- a/src/graphsenselib/ingest/csv.py +++ b/src/graphsenselib/ingest/csv.py @@ -2,6 +2,8 @@ from csv import DictWriter from typing import Iterable +from ..utils.account import get_id_group + BLOCK_HEADER = [ "parent_hash", "nonce", @@ -100,7 +102,7 @@ def format_blocks_csv( item.pop("withdrawals_root") # rename/add columns item["block_id"] = item.pop("number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) item["block_hash"] = item.pop("hash") return items @@ -141,7 +143,7 @@ def format_traces_csv( # rename/add columns item["tx_hash"] = item.pop("transaction_hash") item["block_id"] = item.pop("block_number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) item["trace_address"] = ( "|".join(map(str, item["trace_address"])) if item["trace_address"] is not None @@ -163,7 +165,7 @@ def format_logs_csv( # rename/add columns item["tx_hash"] = item.pop("transaction_hash") item["block_id"] = item.pop("block_number") - item["block_id_group"] = item["block_id"] // block_bucket_size + item["block_id_group"] = get_id_group(item["block_id"], block_bucket_size) tpcs = item["topics"] diff --git a/src/graphsenselib/ingest/utxo.py b/src/graphsenselib/ingest/utxo.py index 707a7ff..5289dfe 100644 --- a/src/graphsenselib/ingest/utxo.py +++ b/src/graphsenselib/ingest/utxo.py @@ -18,7 +18,8 @@ from ..config import GRAPHSENSE_DEFAULT_DATETIME_FORMAT, get_approx_reorg_backoff_blocks from ..db import AnalyticsDb -from ..utils import bytes_to_hex, flatten, hex_to_bytearray, parse_timestamp, strip_0x +from ..utils import bytes_to_hex, flatten, hex_to_bytes, parse_timestamp, strip_0x +from ..utils.account import get_id_group from ..utils.bch import bch_address_to_legacy from ..utils.logging import suppress_log_level from ..utils.signals import graceful_ctlc_shutdown @@ -258,7 +259,7 @@ def ingest_block_transactions( for block, block_txs in mapping.items(): items.append( { - "block_id_group": block // block_bucket_size, + "block_id_group": get_id_group(block, block_bucket_size), "block_id": block, "txs": [tx_stats(x) for x in block_txs], } @@ -308,11 +309,11 @@ def prepare_blocks_inplace(blocks: Iterable, block_bucket_size: int) -> None: block.pop(i) for elem in blob_columns: - block[elem] = hex_to_bytearray( + block[elem] = hex_to_bytes( block[elem] ) # convert hex strings to byte arrays (blob in Cassandra) - block["block_id_group"] = block["number"] // block_bucket_size + block["block_id_group"] = get_id_group(block["number"], block_bucket_size) block["block_id"] = block.pop("number") block["block_hash"] = block.pop("hash") @@ -556,7 +557,7 @@ def prepare_transactions_inplace( tx["tx_prefix"] = tx["hash"][:tx_hash_prefix_len] for elem in blob_columns: - tx[elem] = hex_to_bytearray( + tx[elem] = hex_to_bytes( tx[elem] ) # convert hex strings to byte arrays (blob in Cassandra) @@ -574,17 +575,17 @@ def prepare_transactions_inplace( tx["total_output"] = tx.pop("output_value") tx["tx_hash"] = tx.pop("hash") - tx["tx_id_group"] = next_tx_id // tx_bucket_size + tx["tx_id_group"] = get_id_group(next_tx_id, tx_bucket_size) tx["tx_id"] = next_tx_id next_tx_id += 1 def 
get_tx_refs(spending_tx_hash: str, raw_inputs: Iterable, tx_hash_prefix_len: int): tx_refs = [] - spending_tx_hash = hex_to_bytearray(spending_tx_hash) + spending_tx_hash = hex_to_bytes(spending_tx_hash) for inp in raw_inputs: spending_input_index = inp["index"] - spent_tx_hash = hex_to_bytearray(inp["spent_transaction_hash"]) + spent_tx_hash = hex_to_bytes(inp["spent_transaction_hash"]) spent_output_index = inp["spent_output_index"] if spending_tx_hash is not None and spent_tx_hash is not None: # in zcash refs can be None in case of shielded txs. diff --git a/src/graphsenselib/schema/resources/transformed_account_schema.sql b/src/graphsenselib/schema/resources/transformed_account_schema.sql index 4848b4e..3ac479b 100644 --- a/src/graphsenselib/schema/resources/transformed_account_schema.sql +++ b/src/graphsenselib/schema/resources/transformed_account_schema.sql @@ -147,6 +147,8 @@ CREATE TABLE balance ( CREATE TABLE configuration ( keyspace_name text PRIMARY KEY, bucket_size int, + block_bucket_size_address_txs int, + addressrelations_ids_nbuckets int, address_prefix_length int, tx_prefix_length int, fiat_currencies list diff --git a/src/graphsenselib/schema/schema.py b/src/graphsenselib/schema/schema.py index b6824b5..0d0459d 100644 --- a/src/graphsenselib/schema/schema.py +++ b/src/graphsenselib/schema/schema.py @@ -9,7 +9,11 @@ from ..config import config, currency_to_schema_type, keyspace_types from ..datatypes import BadUserInputError from ..db import DbFactory -from ..db.cassandra import build_create_stmt, normalize_cql_statement +from ..db.cassandra import ( + build_create_stmt, + build_truncate_stmt, + normalize_cql_statement, +) from ..utils import flatten, split_list_on_condition from ..utils.parsing import ( anything, @@ -404,3 +408,32 @@ def _validate_against_db_intermal(self, db, schemas): ) return report + + def get_table_columns_from_file(self, keyspace_type: str, table: str): + schemas = self.get_by_schema_type(keyspace_type)[0][1].statements_str + potential_schemas = [s for s in schemas if table in s] + assert len(potential_schemas) == 1 + schema_str = potential_schemas[0] + columns = schema_str.split("(")[1].split(")")[0].split(",")[:-1] + columns = [c.strip() for c in columns] + pk_columns = schema_str.split("primary key (")[1].split(")")[0].split(",") + return columns, pk_columns + + def ensure_table_exists_by_name( + self, db_transformed, table_name: str, truncate: bool = False + ): + keyspace = db_transformed.get_keyspace() + columns, pk_columns = self.get_table_columns_from_file(keyspace, table_name) + db_transformed._db.execute( + build_create_stmt( + columns, + pk_columns, + table_name, + fail_if_exists=False, + keyspace=keyspace, + ) + ) + if truncate: + db_transformed._db.execute( + build_truncate_stmt(table_name, keyspace=keyspace) + ) diff --git a/src/graphsenselib/utils/account.py b/src/graphsenselib/utils/account.py new file mode 100644 index 0000000..ee8a9bc --- /dev/null +++ b/src/graphsenselib/utils/account.py @@ -0,0 +1,170 @@ +from dataclasses import dataclass +from math import floor +from typing import Iterable, Set + +from ..datatypes import FlowDirection +from .generic import flatten + + +@dataclass +class SlimTx: + address: str + block_id: int + timestamp: int + tx_hash: str + direction: FlowDirection + + +def get_total_input_sum(input_list: list) -> int: + """Simple sum of all input + + Args: + input_list (list): list of inputs + + Returns: + int: sum of input + """ + if input_list is None: + return 0 + return sum([inp.value for inp in input_list]) 
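Note on the recurring change above: every occurrence of block_id // bucket_size is replaced by get_id_group, which is defined further down in this new utils/account.py module. The motivation is that TRON transaction ids can produce group ids that no longer fit the signed 32-bit group columns, so the value is wrapped into int32 range. The following sketch uses made-up values and is not part of the patch; it only illustrates the intended behaviour:

from graphsenselib.utils.account import get_id_group

# For ordinary ids the result equals plain floor division.
assert get_id_group(50_000_011, 1_000) == 50_000_011 // 1_000 == 50_000

# For very large (e.g. TRON) transaction ids the naive group id would overflow
# a signed 32-bit column, so it is wrapped into the int32 range instead.
big_tx_id = (2**31 + 5) * 1_000
assert get_id_group(big_tx_id, 1_000) == ((2**31 + 5) + 2**31) % 2**32 - 2**31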
+ + +def get_regflow(regin: dict, regout: dict, address: str) -> int: + """Calculates the net in/out flow of an address. + + Args: + regin (dict): regularized inputs + regout (dict): regularized outputs + address (str): address of interest + + Returns: + int: Negative if net outflow, positive if net inflow + """ + return regout.get(address, 0) - regin.get(address, 0) + + +def get_unique_addresses_from_trace(trace) -> Iterable[str]: + return {adr.address for adr in get_slim_tx_from_trace(trace)} + + +def get_slim_tx_from_trace(trace) -> Iterable[SlimTx]: + # Only take first address from address array + # this is equivalent to the spark job, but is ignoring multisig + + sending_addresses = [(trace.from_address, FlowDirection.OUT)] + receiving_addresses = [(trace.to_address, FlowDirection.IN)] + + addresses = sending_addresses + receiving_addresses + + return [ + SlimTx( + addr, + trace.block_id, + 0, # todo #trace.timestamp, + trace.tx_hash, + direction, + ) + for addr, direction in addresses + ] + + +def get_slim_tx_from_traces(traces) -> Iterable[SlimTx]: + return flatten([get_slim_tx_from_trace(tx) for tx in traces]) + + +def get_unique_addresses_from_traces(traces) -> Set[str]: + return {adr.address for adr in get_slim_tx_from_traces(traces)} + + +def get_unique_ordered_receiver_addresses_from_traces( + traces, +) -> Iterable[str]: + """Returns all unique output addresses in the order they appear in the txs. + This is useful to assign address ids where order should matter. + + Args: + traces (TYPE): Iterable of dbtraces + + Returns: + Iterable[str]: order preserving Iterable + """ + """ + Construction see + https://stackoverflow.com/questions/1653970/does-python-have-an-ordered-set + """ + return list( + dict.fromkeys( + [ + tx.address + for tx in get_slim_tx_from_traces(traces) + if tx.direction == FlowDirection.OUT + ] + ) + ) + + +def get_unique_ordered_addresses( + address_containing_objects, + mode: str, +) -> Iterable[str]: + """Returns all unique addresses, selected by mode, in the order they appear. + This is useful to assign address ids where order should matter. + + Args: + address_containing_objects (TYPE): objects with from_address/to_address + mode (str): one of "sender", "receiver" or "both" + + Returns: + Iterable[str]: order preserving Iterable + """ + """ + Construction see + https://stackoverflow.com/questions/1653970/does-python-have-an-ordered-set + """ + if mode == "sender": + list_to_prepare = [x.to_address for x in address_containing_objects] + elif mode == "receiver": + list_to_prepare = [x.from_address for x in address_containing_objects] + elif mode == "both": + list_to = [x.to_address for x in address_containing_objects] + list_from = [x.from_address for x in address_containing_objects] + list_to_prepare = list_to + list_from + else: + raise Exception(f"Unknown mode {mode}") + return list(dict.fromkeys(list_to_prepare[::-1]))[::-1] + + +def calculate_id_group_with_overflow(tx_id: int, bucket_size: int): + gid = int(floor(float(tx_id) / bucket_size)) + + if gid.bit_length() >= 31: + # downcast to 32bit integer + # gid = ctypes.c_int32(gid).value + gid = (gid + 2**31) % 2**32 - 2**31 + return gid + + +def get_id_group(id_, bucket_size): + gid = floor(int(id_) / bucket_size) + if gid.bit_length() > 31: + # tron tx_ids are long while the group column is int, + # so we also need to handle overflows in this case. + # additionally, spark does not calculate ids on an int basis but + # based on floats, which can lead to rounding errors.
+ gid = calculate_id_group_with_overflow(id_, bucket_size) + return gid + + +def get_id_group_with_secondary_addresstransactions( + iid, bucket_size, block_id, block_bucket_size_address_txs +): + address_id_group = get_id_group(iid, bucket_size) + address_id_secondary_group = block_id // block_bucket_size_address_txs + return address_id_group, address_id_secondary_group + + +def get_id_group_with_secondary_relations( + iid, id_for_secondary, bucket_size, relations_nbuckets +): + address_id_group = get_id_group(iid, bucket_size) + address_id_secondary_group = id_for_secondary % relations_nbuckets + return address_id_group, address_id_secondary_group diff --git a/src/graphsenselib/utils/accountmodel.py b/src/graphsenselib/utils/accountmodel.py index 4d4f769..e908531 100644 --- a/src/graphsenselib/utils/accountmodel.py +++ b/src/graphsenselib/utils/accountmodel.py @@ -38,6 +38,6 @@ def to_int(string: Union[str, int]) -> int: @typechecked -def hex_to_bytearray(hex_str: Optional[str]) -> Optional[bytearray]: +def hex_to_bytes(hex_str: Optional[str]) -> Optional[bytes]: """Convert hexstring (starting with 0x) to bytearray.""" - return bytearray.fromhex(strip_0x(hex_str)) if hex_str is not None else None + return bytes.fromhex(strip_0x(hex_str)) if hex_str is not None else None diff --git a/src/graphsenselib/utils/cache.py b/src/graphsenselib/utils/cache.py new file mode 100644 index 0000000..410c25a --- /dev/null +++ b/src/graphsenselib/utils/cache.py @@ -0,0 +1,65 @@ +from typing import Dict, Tuple, Union + +from . import group_by + +DEFAULT_KEY_ENCODERS = { + bytes: lambda x: x.hex(), + int: lambda x: str(x), + str: lambda x: x, +} + + +class TableBasedCache: + def __init__( + self, + internal_key_value_cache: Dict, + table_delimiter: str = "|", + key_encoder=DEFAULT_KEY_ENCODERS, + ): + self.internal_key_value_cache = internal_key_value_cache + self.delim = table_delimiter + self.key_encoder = key_encoder + + def get_key(self, table: str, key: Union[bytes, int, str]): + assert self.delim not in table + kt = type(key) + if kt not in self.key_encoder: + raise ValueError(f"Don't know how to encode key of type {kt}") + ekey = self.key_encoder[kt](key) + return f"{ekey}{self.delim}{table}" + + def __delitem__(self, kv: Tuple[str, Union[bytes, int, str]]): + table, key = kv + self.delete_item(table, key) + + def delete_item(self, table: str, key: Union[bytes, int, str]): + self.internal_key_value_cache.delete(self.get_key(table, key), retry=True) + + def put_item(self, table: str, key: Union[bytes, int, str], item): + self.internal_key_value_cache[self.get_key(table, key)] = item + + def put_items(self, table: str, items: Tuple[str, object]): + for k, v in items: + self.put_item(table, k, v) + + def put_items_keyed_by(self, table: str, items: Tuple[str, object], key: str): + by_key = group_by(items, key=lambda x: x[key]) + + self.put_items(table, by_key.items()) + + def __setitem__(self, kv: Tuple[str, Union[bytes, int, str]], data): + table, key = kv + self.put_item(table, key, data) + + def __getitem__(self, kv: Tuple[str, Union[bytes, int, str]]): + table, key = kv + return self.get_item(table, key) + + def get(self, kv: Tuple[str, Union[bytes, int, str]], default=None): + try: + return self[kv] + except KeyError: + return default + + def get_item(self, table: str, key: str): + return self.internal_key_value_cache[self.get_key(table, key)] diff --git a/src/graphsenselib/utils/errors.py b/src/graphsenselib/utils/errors.py new file mode 100644 index 0000000..2df3f91 --- /dev/null +++ 
b/src/graphsenselib/utils/errors.py @@ -0,0 +1,32 @@ +class UserFacingExceptions(Exception): + """Hierarchy of exceptions that end up being communicated + to the end user, but do not produce error logs""" + + +class NotFoundException(UserFacingExceptions): + """this exception should be used if some + item is not found e.g. the database.""" + + +class NetworkNotFoundException(NotFoundException): + def __init__(self, network): + super().__init__(f"Network {network} not supported") + + +class BlockNotFoundException(NotFoundException): + def __init__(self, network, height): + super().__init__(f"Block {height} not found in network {network}") + + +class TransactionNotFoundException(NotFoundException): + def __init__(self, network, tx_hash, token_id=None): + msg = ( + ( + f"Token transaction {tx_hash}:{token_id} " + f"in network {network} not " + "found" + ) + if token_id + else f"Transaction {tx_hash} not found in network {network}" + ) + super().__init__(msg) diff --git a/tests/deltaupdate/resources/account/eth/blocks.pkl b/tests/deltaupdate/resources/account/eth/blocks.pkl new file mode 100644 index 0000000..0008f97 Binary files /dev/null and b/tests/deltaupdate/resources/account/eth/blocks.pkl differ diff --git a/tests/deltaupdate/resources/account/eth/logs.pkl b/tests/deltaupdate/resources/account/eth/logs.pkl new file mode 100644 index 0000000..a432312 Binary files /dev/null and b/tests/deltaupdate/resources/account/eth/logs.pkl differ diff --git a/tests/deltaupdate/resources/account/eth/traces.pkl b/tests/deltaupdate/resources/account/eth/traces.pkl new file mode 100644 index 0000000..6499b89 Binary files /dev/null and b/tests/deltaupdate/resources/account/eth/traces.pkl differ diff --git a/tests/deltaupdate/resources/account/eth/transactions.pkl b/tests/deltaupdate/resources/account/eth/transactions.pkl new file mode 100644 index 0000000..7eb055f Binary files /dev/null and b/tests/deltaupdate/resources/account/eth/transactions.pkl differ diff --git a/tests/deltaupdate/resources/account/trx/blocks.pkl b/tests/deltaupdate/resources/account/trx/blocks.pkl new file mode 100644 index 0000000..b26718e Binary files /dev/null and b/tests/deltaupdate/resources/account/trx/blocks.pkl differ diff --git a/tests/deltaupdate/resources/account/trx/logs.pkl b/tests/deltaupdate/resources/account/trx/logs.pkl new file mode 100644 index 0000000..c69cd16 Binary files /dev/null and b/tests/deltaupdate/resources/account/trx/logs.pkl differ diff --git a/tests/deltaupdate/resources/account/trx/traces.pkl b/tests/deltaupdate/resources/account/trx/traces.pkl new file mode 100644 index 0000000..8645926 Binary files /dev/null and b/tests/deltaupdate/resources/account/trx/traces.pkl differ diff --git a/tests/deltaupdate/resources/account/trx/transactions.pkl b/tests/deltaupdate/resources/account/trx/transactions.pkl new file mode 100644 index 0000000..4079b9f Binary files /dev/null and b/tests/deltaupdate/resources/account/trx/transactions.pkl differ diff --git a/tests/deltaupdate/test_accountupdate.py b/tests/deltaupdate/test_accountupdate.py new file mode 100644 index 0000000..7abc178 --- /dev/null +++ b/tests/deltaupdate/test_accountupdate.py @@ -0,0 +1,323 @@ +import pickle + +from graphsenselib.deltaupdate.update.account.createdeltas import ( + get_sorted_unique_addresses, +) +from graphsenselib.deltaupdate.update.account.modelsraw import ( + AccountBlockAdapter, + AccountLogAdapter, + AccountTransactionAdapter, + EthTraceAdapter, + TrxTraceAdapter, + TrxTransactionAdapter, +) +from 
graphsenselib.deltaupdate.update.account.tokens import ERC20Decoder, TokenTransfer + +currencies = ["trx", "eth"] +folder = "tests/deltaupdate/resources/account" +filetypes = ["transactions", "traces", "logs", "blocks"] + +tx_schema = { + "transaction_index": int, + "tx_hash": bytes, + "from_address": bytes, + "to_address": bytes, + "value": int, + "gas_price": int, + "transaction_type": int, + "receipt_gas_used": int, + "receipt_status": int, + "block_id": int, +} + +log_schema = { + "block_id": int, + "tx_hash": bytes, + "log_index": int, + "address": bytes, + "topics": list, + "data": bytes, +} +block_schema = { + "block_id": int, + "miner": bytes, + "base_fee_per_gas": int, + "gas_used": int, +} + +trace_schema = { + "block_id": int, + "tx_hash": bytes, + "trace_index": bytes, + "from_address": bytes, + "to_address": bytes, + "value": int, + "call_type": str, + "status": int, +} + + +def load_data(): + # load txs, traces, logs, and blocks + # read the jsons + data = { + currency: { + filetype: pickle.load(open(f"{folder}/{currency}/{filetype}.pkl", "rb")) + for filetype in filetypes + } + for currency in currencies + } + return data + + +def load_reference_data(): + lengths = { + "trx": { + "transactions": 293, + "traces": 2, + "logs": 76, + "blocks": 1, + }, + "eth": { + "transactions": 117, + "traces": 671, + "logs": 300, + "blocks": 1, + }, + } + + blocks = {"trx": 50000011, "eth": 18000011} + + return lengths, blocks + + +def test_adapters_regression(): + data = load_data() + lengths_ref, blocks_ref = load_reference_data() + for currency in currencies: + data_currency = data[currency] + transactions, traces, logs, blocks = ( + data_currency["transactions"], + data_currency["traces"], + data_currency["logs"], + data_currency["blocks"], + ) + + if currency == "trx": + trace_adapter = TrxTraceAdapter() + transaction_adapter = TrxTransactionAdapter() + + elif currency == "eth": + trace_adapter = EthTraceAdapter() + transaction_adapter = AccountTransactionAdapter() + + # convert dictionaries to dataclasses and unify naming + log_adapter = AccountLogAdapter() + block_adapter = AccountBlockAdapter() + traces = trace_adapter.dicts_to_renamed_dataclasses(traces) + traces = trace_adapter.process_fields_in_list(traces) + transactions = transaction_adapter.dicts_to_dataclasses(transactions) + logs = log_adapter.dicts_to_dataclasses(logs) + blocks = block_adapter.dicts_to_dataclasses(blocks) + + length_ref = lengths_ref[currency] + assert len(transactions) == length_ref["transactions"] + assert len(traces) == length_ref["traces"] + assert len(logs) == length_ref["logs"] + assert len(blocks) == length_ref["blocks"] + + assert traces[0].block_id == blocks_ref[currency] + + # check that the files have the correct schema + for transaction in transactions: + assert isinstance(transaction, transaction_adapter.datamodel) + assert isinstance(transaction.tx_hash, bytes) + assert ( + isinstance(transaction.from_address, bytes) + or transaction.from_address is None + ) + assert isinstance(transaction.to_address, bytes) + assert isinstance(transaction.value, int) + assert isinstance(transaction.gas_price, int) + assert isinstance(transaction.transaction_type, int) + assert isinstance(transaction.receipt_gas_used, int) + assert isinstance(transaction.receipt_status, int) + assert isinstance(transaction.block_id, int) + + for trace in traces: + assert isinstance(trace, trace_adapter.datamodel) + assert isinstance(trace.tx_hash, bytes) or trace.tx_hash is None + assert isinstance(trace.from_address, bytes) or 
trace.from_address is None + assert isinstance(trace.to_address, bytes) + assert isinstance(trace.value, int) + assert isinstance(trace.call_type, str) or trace.call_type is None + assert isinstance(trace.status, int) + assert isinstance(trace.block_id, int) + + for log in logs: + assert isinstance(log, log_adapter.datamodel) + assert isinstance(log.block_id, int) + assert isinstance(log.tx_hash, bytes) + assert isinstance(log.log_index, int) + assert isinstance(log.address, bytes) + assert isinstance(log.topics, list) + assert isinstance(log.data, bytes) + + for block in blocks: + assert isinstance(block, block_adapter.datamodel) + assert isinstance(block.block_id, int) + assert isinstance(block.miner, bytes) + assert isinstance(block.base_fee_per_gas, int) + assert isinstance(block.gas_used, int) + + +def test_tokens_detected(): + tokendecoder = ERC20Decoder("eth") + data = load_data() + eth_data = data["eth"] + logs = eth_data["logs"] + log_adapter = AccountLogAdapter() + logs = log_adapter.dicts_to_dataclasses(logs) + token_transfers = [tokendecoder.log_to_transfer(log) for log in logs] + token_transfers = [x for x in token_transfers if x is not None] + + assert len(token_transfers) == 51 + + +def test_token_decoding(): + adapter = AccountLogAdapter() + # todo move to tests + example_log = { + "log_index": 0, + "transaction_index": 1, + "block_hash": b"\x00\x00\x00\x00\x02\xfa\xf0\xe5A\xeab\x1d\xed\xc7%\x00\x074^" + b"\x10\xaa5\xe7\xbd\xb7\xa9\x1c\xee\x99\x0f96", + "address": b"\xa6\x14\xf8\x03\xb6\xfdx\t\x86\xa4,x\xec\x9c\x7fw\xe6\xde\xd1<", + "data": b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xba\x81@", + "topics": [ + b"\xdd\xf2R\xad\x1b\xe2\xc8\x9bi\xc2\xb0h\xfc7\x8d\xaa\x95+\xa7\xf1c" + b"\xc4\xa1\x16(\xf5ZM\xf5#\xb3\xef", + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xb3\xa8"da\xf0\xe6' + b"\xa9\xa1\x06?\xeb\xea\x88\xc6\xf6\xa5\xa0\x85~", + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0LT\xf6\xb6\xa2" + b'\x9a\xf0ZT\x95\x8cIt\xc3\x83\xb4\xd9"\xac', + ], + "tx_hash": b"\xe0}\xe10\xe5\xc2\xb1\xde\x13\xcd\x88\xee!\xfa\x1e\xca]e\xba\xbb" + b"\xecVG\xd3\x1c\xb7\x90\x1f\xc92wo", + "block_id": 50000101, + "block_id_group": 50000, + "partition": 500, + "topic0": b"\xdd\xf2R\xad\x1b\xe2\xc8\x9bi\xc2\xb0h\xfc7\x8d\xaa\x95+\xa7\xf1c" + b"\xc4\xa1\x16(\xf5ZM\xf5#\xb3\xef", + } + + example_log = adapter.dict_to_dataclass(example_log) + + decoder = ERC20Decoder("eth") + decoded_transfer = decoder.log_to_transfer(example_log) + assert decoded_transfer is None + + example_log = { + "log_index": 0, + "transaction_index": 1, + "block_hash": b"\x00\x00\x00\x00\x02\xfa\xf0\xe5A\xeab\x1d\xed\xc7%\x00\x074^" + b"\x10\xaa5\xe7\xbd\xb7\xa9\x1c\xee\x99\x0f96", + "address": bytes.fromhex("dac17f958d2ee523a2206206994597c13d831ec7"), + "data": b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xba\x81@", + "topics": [ + b"\xdd\xf2R\xad\x1b\xe2\xc8\x9bi\xc2\xb0h" + b"\xfc7\x8d\xaa\x95+\xa7\xf1c\xc4\xa1" + b"\x16(\xf5ZM\xf5#\xb3\xef", + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xb3\xa8"da\xf0\xe6\xa9' + b"\xa1\x06?\xeb\xea\x88\xc6\xf6\xa5\xa0\x85~", + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0LT\xf6\xb6\xa2\x9a" + b'\xf0ZT\x95\x8cIt\xc3\x83\xb4\xd9"\xac', + ], + "tx_hash": b"\xe0}\xe10\xe5\xc2\xb1\xde\x13\xcd\x88\xee!\xfa\x1e\xca]e\xba\xbb" + b"\xecVG\xd3\x1c\xb7\x90\x1f\xc92wo", + "block_id": 50000101, + 
"block_id_group": 50000, + "partition": 500, + "topic0": b"\xdd\xf2R\xad\x1b\xe2\xc8\x9bi\xc2\xb0h\xfc7\x8d\xaa\x95+\xa7\xf1c" + b"\xc4\xa1\x16(\xf5ZM\xf5#\xb3\xef", + } + + example_log = adapter.dict_to_dataclass(example_log) + decoder = ERC20Decoder("eth") + decoded_transfer = decoder.log_to_transfer(example_log) + check = TokenTransfer( + from_address=bytes.fromhex("B3a8226461F0e6A9a1063fEBeA88C6f6A5a0857E"), + to_address=bytes.fromhex("F04C54F6b6A29aF05A54958c4974C383B4D922ac"), + value=29000000, + asset="USDT", + coin_equivalent=0, + usd_equivalent=1, + block_id=50000101, + tx_hash=b"\xe0}\xe10\xe5\xc2\xb1\xde\x13\xcd\x88\xee!\xfa\x1e\xca]e\xba\xbb" + b"\xecVG\xd3\x1c\xb7\x90\x1f\xc92wo", + log_index=0, + decimals=6, + ) + + assert decoded_transfer == check + + decoder = ERC20Decoder("trx") + decoded_transfer = decoder.log_to_transfer(example_log) + assert decoded_transfer is None + + +def test_address_sorting(): + class SortableAssetTransfer: + def __init__( + self, + from_address=None, + to_address=None, + block_id=None, + log_index=None, + trace_index=None, + transaction_index=None, + ): + self.from_address = from_address + self.to_address = to_address + self.block_id = block_id + self.log_index = log_index + self.trace_index = trace_index + self.transaction_index = transaction_index + + traces_s = [ + SortableAssetTransfer( + from_address="0x1", to_address="0x2", block_id=1, trace_index=2 + ), + SortableAssetTransfer( + from_address="0x3", to_address="0x4", block_id=2, trace_index=1 + ), + ] + reward_traces = [SortableAssetTransfer(to_address="0x5", block_id=3, trace_index=0)] + token_transfers = [ + SortableAssetTransfer( + from_address="0x2", to_address="0x3", block_id=1, log_index=1 + ), + SortableAssetTransfer( + from_address="0x4", to_address="0x1", block_id=2, log_index=2 + ), + ] + transactions = [ + SortableAssetTransfer( + from_address="0x1", to_address="0x0", block_id=1, transaction_index=1000001 + ), + SortableAssetTransfer( + from_address="0x2", to_address="0x4", block_id=2, transaction_index=1000002 + ), + ] + + expected_addresses = ["0x0", "0x1", "0x2", "0x3", "0x4", "0x5"] + + result_addresses = list( + get_sorted_unique_addresses( + traces_s, reward_traces, token_transfers, transactions + ) + ) + + assert result_addresses == expected_addresses diff --git a/tests/deltaupdate/test_createchanges.py b/tests/deltaupdate/test_createchanges.py new file mode 100644 index 0000000..c609a76 --- /dev/null +++ b/tests/deltaupdate/test_createchanges.py @@ -0,0 +1,162 @@ +# flake8: noqa + +import logging +import unittest +from collections import defaultdict +from datetime import datetime +from typing import Any, Callable, Dict, List, NamedTuple, Tuple +from unittest.mock import MagicMock, patch + +import graphsenselib +from graphsenselib.db import DbChange +from graphsenselib.deltaupdate.update.abstractupdater import TABLE_NAME_DELTA_HISTORY +from graphsenselib.deltaupdate.update.account.createchanges import ( + prepare_balances_for_ingest, + prepare_entities_for_ingest, + prepare_entity_txs_for_ingest, + prepare_relations_for_ingest, + prepare_txs_for_ingest, +) +from graphsenselib.deltaupdate.update.account.modelsdelta import ( + BalanceDelta, + EntityDeltaAccount, + RawEntityTxAccount, + RelationDeltaAccount, +) +from graphsenselib.deltaupdate.update.generic import DeltaScalar, Tx +from graphsenselib.utils import DataObject as MutableNamedTuple +from graphsenselib.utils.account import ( + get_id_group, + get_id_group_with_secondary_addresstransactions, + 
get_id_group_with_secondary_relations, +) +from graphsenselib.utils.logging import LoggerScope + + +class TestPrepareTxsForIngest(unittest.TestCase): + def setUp(self): + self.mock_delta = [ + Tx(tx_id=1, tx_hash=b"0x1234567", block_id=123, failed=False, tx_index=0), + Tx( + tx_id=2, tx_hash=b"0x1222234", block_id=234, failed=True, tx_index=1 + ), # This should be skipped in block_transactions + # Add more mock Tx objects as needed + ] + self.id_bucket_size = 10 + self.block_bucket_size = 5 + self.mock_get_transaction_prefix = MagicMock(return_value=("prefix", "1")) + + def test_prepare_txs_for_ingest(self): + # Mock the external function behaviors + self.mock_get_transaction_prefix.return_value = ("prefix", "1") + + changes = prepare_txs_for_ingest( + self.mock_delta, + self.id_bucket_size, + self.block_bucket_size, + self.mock_get_transaction_prefix, + ) + print(changes) + + n_changes_transaction_ids_by_transaction_id_group = len( + [x for x in changes if x.table == "transaction_ids_by_transaction_id_group"] + ) + n_changes_block_transactions = len( + [x for x in changes if x.table == "block_transactions"] + ) + + self.assertEqual(n_changes_transaction_ids_by_transaction_id_group, 2) + self.assertEqual(n_changes_block_transactions, 1) + + +class TestPrepareBalancesForIngest(unittest.TestCase): + def setUp(self): + # Mock BalanceDelta objects and addr_balances + self.mock_delta = [ + BalanceDelta(identifier=123, asset_balances={"USDT": DeltaScalar(1)}), + BalanceDelta( + identifier=234, + asset_balances={"USDT": DeltaScalar(2), "ETH": DeltaScalar(1)}, + ), + # BalanceDelta(identifier=123, asset_balances={'USDT': DeltaScalar(2)}), cant be twice because it is compressed before + BalanceDelta(identifier=456, asset_balances={"ETH": DeltaScalar(2)}), + ] + self.addr_balances = { + 234: BalanceDelta(identifier=234, asset_balances={"USDT": DeltaScalar(2)}), + 123: BalanceDelta( + identifier=123, + asset_balances={"USDT": DeltaScalar(2), "ETH": DeltaScalar(1)}, + ), + } + + self.addr_balances_ref = { + 234: {"USDT": 4, "ETH": 1}, + 123: {"USDT": 3}, + 456: {"ETH": 2}, + } + + self.id_bucket_size = 100 + + def test_prepare_balances_for_ingest(self): + changes = prepare_balances_for_ingest( + self.mock_delta, self.id_bucket_size, self.addr_balances + ) + + print(changes) + # Check the total number of DbChange instances created + self.assertEqual( + len(changes), 4, "Incorrect number of DbChange instances created" + ) + + # transform DbChange to dict and compare with expected + change_dict = defaultdict(dict) + + for change in changes: + data = change.data + change_dict[data["address_id"]][data["currency"]] = data["balance"] + + for k, v in self.addr_balances_ref.items(): + for currency, balance in v.items(): + self.assertEqual(change_dict[k][currency], balance) + + +class TestPrepareEntityTxsForIngest(unittest.TestCase): + def setUp(self): + # Setup your mock data here + self.mock_delta = [ + RawEntityTxAccount( + identifier=222, + tx_id=123, + block_id=100, + is_outgoing=True, + tx_reference="ref1", # should be UserType but its fine + token_values={"tokenA": 100, "tokenB": 200}, + value=0, # A RawEntityTxAccount can only have either value or token_values the way the functions are written. 
Could write a unit test for that too + ), + RawEntityTxAccount( + identifier=111, + tx_id=234, + block_id=200, + is_outgoing=False, + tx_reference="ref2", + token_values={}, # This should test the non-token transfer scenario + value=1, + ), + ] + self.id_bucket_size = 10 + self.currency = "currency1" + self.block_bucket_size_address_txs = 5 + + self.expected_changes_count = 3 + + def test_prepare_entity_txs_for_ingest(self): + changes = prepare_entity_txs_for_ingest( + self.mock_delta, + self.id_bucket_size, + self.currency, + self.block_bucket_size_address_txs, + ) + + print(changes) + + self.assertEqual(len(changes), self.expected_changes_count) diff --git a/tests/deltaupdate/test_utxoupdate.py b/tests/deltaupdate/test_utxoupdate.py index 365e543..f332a52 100644 --- a/tests/deltaupdate/test_utxoupdate.py +++ b/tests/deltaupdate/test_utxoupdate.py @@ -1,5 +1,5 @@ from graphsenselib.deltaupdate.update.generic import DbDelta -from graphsenselib.deltaupdate.update.utxo import dbdelta_from_utxo_transaction +from graphsenselib.deltaupdate.update.utxo.update import dbdelta_from_utxo_transaction from graphsenselib.utils import group_by, groupby_property from graphsenselib.utils.utxo import get_unique_addresses_from_transaction diff --git a/tests/utils/test_tablecache.py b/tests/utils/test_tablecache.py new file mode 100644 index 0000000..cfa2854 --- /dev/null +++ b/tests/utils/test_tablecache.py @@ -0,0 +1,46 @@ +import pytest +from diskcache import Cache + +from graphsenselib.utils.cache import TableBasedCache + + +def test_cache(): + c = TableBasedCache(Cache()) + + c[("abc", 1)] = 1 + c[("abc", 2)] = 2 + + assert c[("abc", 1)] == 1 + assert c[("abc", 2)] == 2 + + c.put_items("abc", [(1, 3), (2, 4)]) + + assert c[("abc", 1)] == 3 + assert c[("abc", 2)] == 4 + + data = [ + {"a": 1, "b": 2}, + {"a": 3, "b": 2}, + {"a": 1, "b": 3}, + ] + + c.put_items_keyed_by("abc", data, key="a") + + assert len(c.get_item("abc", 1)) == 2 + assert c.get_item("abc", 1) == [{"a": 1, "b": 2}, {"a": 1, "b": 3}] + assert len(c.get_item("abc", 3)) == 1 + assert c.get_item("abc", 2) == 4 + + c.delete_item("abc", 1) + with pytest.raises(KeyError): + assert len(c.get_item("abc", 1)) == 2 + + assert c.get_item("abc", 2) == 4 + + del c[("abc", 2)] + + with pytest.raises(KeyError): + c.get_item("abc", 2) + + assert c.get(("abc", 2), None) is None + assert c.get(("abc", 2), []) == []
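As a closing orientation for the new fs-cache sink, a small usage sketch of the diskcache-backed TableBasedCache introduced in this patch; the cache directory below is a hypothetical placeholder and the snippet itself is not part of the patch:

from diskcache import Cache

from graphsenselib.utils.cache import TableBasedCache

# Same construction the ingest worker uses for the fs-cache sink; the
# directory is only an illustrative placeholder.
cache = TableBasedCache(Cache("/tmp/gs-fs-cache", eviction_policy="none"))

# The ingest side stores rows grouped by a key column (block_id by default),
# so a single (table, key) lookup returns all rows of a table for one block.
rows = [
    {"block_id": 18_000_011, "tx_hash": b"\x01"},
    {"block_id": 18_000_011, "tx_hash": b"\x02"},
]
cache.put_items_keyed_by("transaction", rows, key="block_id")

assert len(cache.get(("transaction", 18_000_011), [])) == 2
assert cache.get(("transaction", 99), []) == []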