From 59026f9a1d74ff5effed42911a4ad2268120503d Mon Sep 17 00:00:00 2001 From: Camilo Soto Montoya Date: Sat, 8 May 2021 13:18:37 -0500 Subject: [PATCH] shared class updates --- README.md | 4 +- src/pycoornet/shared.py | 193 +++++++++++++++++++--------------------- src/pycoornet/utils.py | 4 - tests/test_pycoornet.py | 14 ++- 4 files changed, 107 insertions(+), 108 deletions(-) diff --git a/README.md b/README.md index 6cc1baa..4e99376 100644 --- a/README.md +++ b/README.md @@ -40,8 +40,8 @@ def main(): # Init CrowdTangle with api key crowd_tangle = CrowdTangle("abc123def345") ct_df = crowd_tangle.get_shares(urls=links_df, url_column='clean_url', date_column='date',clean_urls=True, platforms='facebook', sleep_time=30) - shared = Shared() - crowtangle_shares_df, shares_graph, q = shared.coord_shares(ct_df, clean_urls=True) + shared = Shared(ct_df) + crowtangle_shares_df, shares_graph, q = shared.coord_shares(clean_urls=True) #Build Gephi File for node in shares_graph.nodes(data=True): diff --git a/src/pycoornet/shared.py b/src/pycoornet/shared.py index adf4bc8..8abb62e 100644 --- a/src/pycoornet/shared.py +++ b/src/pycoornet/shared.py @@ -9,14 +9,24 @@ logger = logging.getLogger(__name__) + + class Shared: - """Shared class docsgring - """ - # def __init__(self): - # pass + """Coordinated Link Sharing Behavior (CLSB) detector. - def __estimate_coord_interval(self, crowtangle_shares_df, q=0.1, p=0.5, clean_urls=False, keep_ourl_only=False): + Args: + crowdtangle_shares_df (pandas.DataFrame): the pandas dataframe of link posts resulting from the function + CrowdTangle shares + Returns: + 2-element tuple containing + - **graph** (networkx.Graph): A graph (highly_connected_g) with networks of coordinated entities whose edges also contain a t_coord_share attribute (vector) reporting the timestamps of every time the edge was detected as coordinated sharing. + - **q** (float): Percentile edge weight number of repeatedly coordinated link shares to keep. 
+ """ + def __init__(self, crowdtangle_shares_df): + self.__crowdtangle_shares_df = crowdtangle_shares_df + + def estimate_coord_interval(self, q=0.1, p=0.5, clean_urls=False, keep_ourl_only=False): """Estimates a threshold in seconds that defines a coordinated link share. While it is common that multiple (pages/groups/account) entities share the same link, some tend to perform these actions in an unusually short period of time. Unusual is thus defined here as a function of the median co-share time difference. More specifically, the function ranks all @@ -24,7 +34,6 @@ def __estimate_coord_interval(self, crowtangle_shares_df, q=0.1, p=0.5, clean_ur The value returned is the median time in seconds spent by these URLs to cumulate the p\% (default 0.1) of their total shares. Args: - crowtangle_shares_df (pandas.Dataframe): the dataframe of link posts q (float, optional): controls the quantile of quickest URLs to be filtered. Defaults to 0.1. @@ -37,9 +46,8 @@ def __estimate_coord_interval(self, crowtangle_shares_df, q=0.1, p=0.5, clean_ur Returns: 2-element tuple containing - - **summary** (pandas.DataFrame): summary statistics of q\% quickest second share performing URLs. - - **time** (integer): time in seconds corresponding to the median time spent by these URLs to cumulate the % of their total shares. + - **coordination interval** (integer): time in seconds corresponding to the median time spent by these URLs to cumulate the % of their total shares. 
""" if 0 1 urls_df = urls_df[urls_df['ct_shares'] >1] - # filter the crowtangle_shares_df that join with urls_df - crowtangle_shares_df = crowtangle_shares_df[crowtangle_shares_df.set_index('expanded').index.isin(urls_df.set_index('URL').index)] + # filter the crowdtangle_shares_df that join with urls_df + crowdtangle_shares_df = crowdtangle_shares_df[crowdtangle_shares_df.set_index('expanded').index.isin(urls_df.set_index('URL').index)] #metrics creation - crowtangle_shares_df['date'] = crowtangle_shares_df['date'].astype('datetime64[ns]') - ranked_shares_df = crowtangle_shares_df[['expanded', 'date']] - shares_gb = crowtangle_shares_df.groupby('expanded') + crowdtangle_shares_df['date'] = crowdtangle_shares_df['date'].astype('datetime64[ns]') + ranked_shares_df = crowdtangle_shares_df[['expanded', 'date']] + shares_gb = crowdtangle_shares_df.groupby('expanded') ranked_shares_df['ct_shares_count']=shares_gb['id'].transform('nunique') ranked_shares_df['first_share_date'] = shares_gb['date'].transform('min') ranked_shares_df['rank'] = shares_gb['date'].rank(ascending=True, method='first') ranked_shares_df['perc_of_shares'] = ranked_shares_df['rank']/ranked_shares_df['ct_shares_count'] - #ranked_shares_df['perc_of_shares'] = shares_gb['date'].rank(ascending=True) ranked_shares_df['sec_from_first_share'] = (ranked_shares_df['date'] - ranked_shares_df['first_share_date']).dt.total_seconds() ranked_shares_df = ranked_shares_df.sort_values(by = 'expanded') @@ -126,7 +134,7 @@ def __estimate_coord_interval(self, crowtangle_shares_df, q=0.1, p=0.5, clean_ur return coord_interval - def __buid_graph(self, crowtangle_shares_df, coordinated_shares_df, percentile_edge_weight = 90, timestamps = False): + def __buid_graph(self, crowdtangle_shares_df, coordinated_shares_df, percentile_edge_weight = 90, timestamps = False): logger.info("Bulding graph") coord_df = coordinated_shares_df[['account_url', 'url', 'share_date']].reset_index(drop=True) coord_graph = 
nx.from_pandas_edgelist(coord_df, 'account_url', 'url', create_using=nx.DiGraph()) @@ -148,19 +156,19 @@ def __buid_graph(self, crowtangle_shares_df, coordinated_shares_df, percentile_e full_graph = bipartite.weighted_projected_graph(bipartite_graph, account_urls) #pandas helper dataframe to calcule graph node attribues - crowtangle_shares_df['account_name'] = crowtangle_shares_df['account_name'].astype(str) - crowtangle_shares_df['account_handle'] = crowtangle_shares_df['account_handle'].astype(str) - crowtangle_shares_df['account_pageAdminTopCountry'] = crowtangle_shares_df['account_pageAdminTopCountry'].astype(str) - crowtangle_shares_gb = crowtangle_shares_df.groupby('account_url') - crowtangle_shares_df['name_changed']=(crowtangle_shares_gb['account_name'].transform("nunique"))>1 - crowtangle_shares_df['handle_changed']=(crowtangle_shares_gb['account_handle'].transform("nunique"))>1 - crowtangle_shares_df['page_admin_top_country_changed']=(crowtangle_shares_gb['account_pageAdminTopCountry'].transform("nunique"))>1 - crowtangle_shares_df['account_name'] = crowtangle_shares_gb['account_name'].transform(lambda col: '|'.join(col.unique())) - crowtangle_shares_df['account_handle'] = crowtangle_shares_gb['account_handle'].transform(lambda col: '|'.join(col.unique())) - crowtangle_shares_df['account_pageAdminTopCountry'] = crowtangle_shares_gb['account_pageAdminTopCountry'].transform(lambda col: '|'.join(col.unique())) - crowtangle_shares_df[['account_name','account_handle','account_pageAdminTopCountry','name_changed','handle_changed','page_admin_top_country_changed']] - - crowtangle_shares_gb = crowtangle_shares_df.reset_index().groupby(['account_url']) + crowdtangle_shares_df['account_name'] = crowdtangle_shares_df['account_name'].astype(str) + crowdtangle_shares_df['account_handle'] = crowdtangle_shares_df['account_handle'].astype(str) + crowdtangle_shares_df['account_pageAdminTopCountry'] = crowdtangle_shares_df['account_pageAdminTopCountry'].astype(str) + 
crowtangle_shares_gb = crowdtangle_shares_df.groupby('account_url') + crowdtangle_shares_df['name_changed']=(crowtangle_shares_gb['account_name'].transform("nunique"))>1 + crowdtangle_shares_df['handle_changed']=(crowtangle_shares_gb['account_handle'].transform("nunique"))>1 + crowdtangle_shares_df['page_admin_top_country_changed']=(crowtangle_shares_gb['account_pageAdminTopCountry'].transform("nunique"))>1 + crowdtangle_shares_df['account_name'] = crowtangle_shares_gb['account_name'].transform(lambda col: '|'.join(col.unique())) + crowdtangle_shares_df['account_handle'] = crowtangle_shares_gb['account_handle'].transform(lambda col: '|'.join(col.unique())) + crowdtangle_shares_df['account_pageAdminTopCountry'] = crowtangle_shares_gb['account_pageAdminTopCountry'].transform(lambda col: '|'.join(col.unique())) + crowdtangle_shares_df[['account_name','account_handle','account_pageAdminTopCountry','name_changed','handle_changed','page_admin_top_country_changed']] + + crowtangle_shares_gb = crowdtangle_shares_df.reset_index().groupby(['account_url']) account_info_df = crowtangle_shares_gb['index'].agg([('shares','count')]) account_info_df = account_info_df.merge(pd.DataFrame(crowtangle_shares_gb['is_coordinated'].apply(lambda x: (x==True).sum())).rename(columns={'is_coordinated':'coord_shares'}), left_index=True, right_index=True) @@ -249,22 +257,15 @@ def __buid_graph(self, crowtangle_shares_df, coordinated_shares_df, percentile_e return highly_connected_graph, q - def coord_shares(self, dataframe, coordination_interval=None, parallel=False, percentile_edge_weight=90, clean_urls=False, keep_ourl_only=False, gtimestamps=False): + def coord_shares(self, coordination_interval=None, percentile_edge_weight=90, clean_urls=False, keep_ourl_only=False, gtimestamps=False): """Given a dataframe of CrowdTangle shares and a time threshold, this function detects networks of entities (pages, accounts and groups) that performed coordinated link sharing behavior. 
Args: - dataframe (pandas.DataFrame): the pandas dataframe of link posts resulting from the function - CrowdTangle shares - coordination_interval (int, optional): a threshold in seconds that defines a coordinated share. Given a dataset of CrowdTangle shares, this threshold is automatically estimated. Alternatively it can be manually passed to the function in seconds. Defaults to None. - parallel (bool, optional): enables parallel processing to speed up the process taking advantage - of multiple cores. The number of cores is automatically set to all the available cores minus one. - Defaults to False. - percentile_edge_weight (float, optional): defines the percentile of the edge distribution to keep in order to identify a network of coordinated entities. In other terms, this value determines the minimum number of times that two entities had to coordinate in order to be considered part of a network. Defaults to 0.90. @@ -278,14 +279,16 @@ def coord_shares(self, dataframe, coordination_interval=None, parallel=False, pe Slow on large networks. Defaults to False. Returns: - 2-element tuple containing + 3-element tuple containing - **coordinated_df** (pandas.DataFrame): The input dataframe of shares with an additional boolean variable (coordinated) that identifies coordinated shares. - **graph** (networkx.Graph): An graph (highly_connected_g) with networks of coordinated entities whose edges also contains a t_coord_share attribute (vector) reporting the timestamps of every time the edge was detected as coordinated sharing. + - **q** (float): Percentile edge weight number of repeatedly coordinated link shares to keep. 
""" # estimate the coordination interval if not specified by the users + dataframe = self.__crowdtangle_shares_df.copy(deep=True) if coordination_interval == None: - coordination_interval = self.__estimate_coord_interval(dataframe, clean_urls=clean_urls, keep_ourl_only=keep_ourl_only) + coordination_interval = self.estimate_coord_interval(clean_urls=clean_urls, keep_ourl_only=keep_ourl_only) coordination_interval = coordination_interval[1] if coordination_interval == 0: @@ -305,59 +308,51 @@ def coord_shares(self, dataframe, coordination_interval=None, parallel=False, pe urls_df = urls_df[urls_df['ct_shares'] >1] urls_df = urls_df.sort_values('URL') - crowtangle_shares_df = dataframe[dataframe.set_index('expanded').index.isin(urls_df.set_index('URL').index)] - - if parallel: - pass - else: - data_list = [] - urls_count = urls_df.shape[0] - i=0 - - with tqdm(total=urls_df.shape[0]) as pbar: - for index, row in urls_df.iterrows(): - pbar.update(1) - i=i+1 - logger.debug(f"processing {i} of {urls_count}, url={row['URL']}") - summary_df = crowtangle_shares_df[crowtangle_shares_df['expanded'] == row['URL']].copy(deep=True) - if summary_df.groupby('account_url')['account_url'].nunique().shape[0]>1: - summary_df['date'] = summary_df['date'].astype('datetime64[ns]') - #summary_df['cut'] = pd.cut(summary_df['date'], int(coordination_interval)) - date_serie = summary_df['date'].astype('int64') // 10 ** 9 - max = date_serie.max() - min = date_serie.min() - div = (max-min)/coordination_interval + 1 - summary_df["cut"] = pd.cut(summary_df['date'],int(div)).apply(lambda x: x.left).astype('datetime64[ns]') - cut_gb = summary_df.groupby('cut') - summary_df.loc[:,'count'] = cut_gb['cut'].transform('count') - #summary_df = summary_df[['cut', 'count']].copy(deep=True) - # summary_df = summary_df.rename(columns = {'date': 'share_date'}) - summary_df.loc[:,'url'] = row['URL'] - summary_df.loc[:,'account_url'] = cut_gb['account_url'].transform(lambda x: [x.tolist()]*len(x)) - 
summary_df.loc[:,'share_date'] = cut_gb['date'].transform(lambda x: [x.tolist()]*len(x)) - summary_df = summary_df[['cut', 'count', 'account_url','share_date', 'url']] - summary_df = summary_df[summary_df['count']>1] - if summary_df.shape[0]>1: - summary_df = summary_df.loc[summary_df.astype(str).drop_duplicates().index] - #summary_df['account_url'] = [account_url] * summary_df.shape[0] - #summary_df['share_date'] = [dates] * summary_df.shape[0] - data_list.append(summary_df) - - data_df = pd.concat(data_list) - if data_df.shape[0] == 0: - logger.info('there are not enough shares!') - return None - - coordinated_shares_df = data_df.reset_index(drop=True).apply(pd.Series.explode).reset_index(drop=True) - - crowtangle_shares_df = crowtangle_shares_df.reset_index(drop=True) - crowtangle_shares_df.loc[:,'coord_expanded']=crowtangle_shares_df['expanded'].isin(coordinated_shares_df['url']) - crowtangle_shares_df.loc[:,'coord_date']=crowtangle_shares_df['date'].astype('datetime64[ns]').isin(coordinated_shares_df['share_date']) - crowtangle_shares_df.loc[:,'coord_account_url']=crowtangle_shares_df['account_url'].isin(coordinated_shares_df['account_url']) - - crowtangle_shares_df.loc[:,'is_coordinated'] = crowtangle_shares_df.apply(lambda x : True if (x['coord_expanded'] and x['coord_date'] and x['coord_account_url']) else False, axis=1) - crowtangle_shares_df.drop(['coord_expanded','coord_date', 'coord_account_url'], inplace = True, axis=1) - - highly_connected_graph, q = self.__buid_graph(crowtangle_shares_df, coordinated_shares_df, percentile_edge_weight=percentile_edge_weight, timestamps=gtimestamps) - - return crowtangle_shares_df, highly_connected_graph, q + crowdtangle_shares_df = dataframe[dataframe.set_index('expanded').index.isin(urls_df.set_index('URL').index)] + + data_list = [] + urls_count = urls_df.shape[0] + i=0 + + with tqdm(total=urls_df.shape[0]) as pbar: + for index, row in urls_df.iterrows(): + pbar.update(1) + i=i+1 + logger.debug(f"processing {i} of 
{urls_count}, url={row['URL']}") + summary_df = crowdtangle_shares_df[crowdtangle_shares_df['expanded'] == row['URL']].copy(deep=True) + if summary_df.groupby('account_url')['account_url'].nunique().shape[0]>1: + summary_df['date'] = summary_df['date'].astype('datetime64[ns]') + date_serie = summary_df['date'].astype('int64') // 10 ** 9 + max = date_serie.max() + min = date_serie.min() + div = (max-min)/coordination_interval + 1 + summary_df["cut"] = pd.cut(summary_df['date'],int(div)).apply(lambda x: x.left).astype('datetime64[ns]') + cut_gb = summary_df.groupby('cut') + summary_df.loc[:,'count'] = cut_gb['cut'].transform('count') + summary_df.loc[:,'url'] = row['URL'] + summary_df.loc[:,'account_url'] = cut_gb['account_url'].transform(lambda x: [x.tolist()]*len(x)) + summary_df.loc[:,'share_date'] = cut_gb['date'].transform(lambda x: [x.tolist()]*len(x)) + summary_df = summary_df[['cut', 'count', 'account_url','share_date', 'url']] + summary_df = summary_df[summary_df['count']>1] + if summary_df.shape[0]>1: + summary_df = summary_df.loc[summary_df.astype(str).drop_duplicates().index] + data_list.append(summary_df) + + data_df = pd.concat(data_list) + if data_df.shape[0] == 0: + logger.info('there are not enough shares!') + return None + + coordinated_shares_df = data_df.reset_index(drop=True).apply(pd.Series.explode).reset_index(drop=True) + + crowdtangle_shares_df = crowdtangle_shares_df.reset_index(drop=True) + crowdtangle_shares_df.loc[:,'coord_expanded']=crowdtangle_shares_df['expanded'].isin(coordinated_shares_df['url']) + crowdtangle_shares_df.loc[:,'coord_date']=crowdtangle_shares_df['date'].astype('datetime64[ns]').isin(coordinated_shares_df['share_date']) + crowdtangle_shares_df.loc[:,'coord_account_url']=crowdtangle_shares_df['account_url'].isin(coordinated_shares_df['account_url']) + + crowdtangle_shares_df.loc[:,'is_coordinated'] = crowdtangle_shares_df.apply(lambda x : True if (x['coord_expanded'] and x['coord_date'] and x['coord_account_url']) else 
False, axis=1) + crowdtangle_shares_df.drop(['coord_expanded','coord_date', 'coord_account_url'], inplace = True, axis=1) + + highly_connected_graph, q = self.__buid_graph(crowdtangle_shares_df, coordinated_shares_df, percentile_edge_weight=percentile_edge_weight, timestamps=gtimestamps) + + return crowdtangle_shares_df, highly_connected_graph, q diff --git a/src/pycoornet/utils.py b/src/pycoornet/utils.py index ad67226..623a8c3 100644 --- a/src/pycoornet/utils.py +++ b/src/pycoornet/utils.py @@ -66,10 +66,6 @@ def clean_urls(dataframe, url_column): dataframe = dataframe.loc[dataframe[url_column].str.contains('^http://127.0.0.1|^https://127.0.0.1|http://localhost|https://localhost') == False] dataframe = dataframe.loc[dataframe[url_column].str.contains('http://|https://')] - #Remove the URL query parameters and set NaN to protocols that aren't http or https - #dataframe[url_column] = dataframe[url_column].apply(lambda x: urljoin(x, urlparse(x).path) if (urlparse(x).scheme == 'http' or urlparse(x).scheme == 'https') and urlparse(x).netloc != '127.0.0.1' and urlparse(x).netloc != 'localhost' else np.NaN) - #Drop NaN values - #dataframe=dataframe.dropna() dataframe=dataframe.reset_index(drop=True) return dataframe diff --git a/tests/test_pycoornet.py b/tests/test_pycoornet.py index 22f1572..1e00564 100644 --- a/tests/test_pycoornet.py +++ b/tests/test_pycoornet.py @@ -27,10 +27,18 @@ def test_crowdtangle(crowd_token, sample_source_df): def test_shared(sample_ct_df): - shared = Shared() - crowtangle_shares_df, highly_connected_graph, q = shared.coord_shares( - sample_ct_df, clean_urls=True) + shared = Shared(sample_ct_df) + crowtangle_shares_df, highly_connected_graph, q = shared.coord_shares(clean_urls=True) if crowtangle_shares_df.shape[0] > 0 and highly_connected_graph != None and q > 0: assert True else: assert False + +def test_estimate_coord_interval(sample_ct_df): + shared = Shared(sample_ct_df) + summary, coord_interval = shared.estimate_coord_interval(True) + 
if coord_interval > 0: + assert True + else: + assert False +