-
Notifications
You must be signed in to change notification settings - Fork 2
/
bias.py
161 lines (140 loc) · 6.51 KB
/
bias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Adapted from https://github.com/jmhessel/FightingWords/blob/master/fighting_words_py3.py
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CV
import string
exclude = set(string.punctuation)
import os
import json
from constants_and_utils import *
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Connector words that are dropped from the sanitized text.
_CONNECTORS = frozenset(("and", "or"))


def basic_sanitize(in_string):
    '''Returns a very roughly sanitized version of the input string.

    The text is lower-cased, ASCII punctuation is deleted (not replaced by a
    space, so "rock-and-roll" becomes "rockandroll"), the stand-alone words
    "and" / "or" are dropped, and runs of whitespace are collapsed to a
    single space.

    Connectors are filtered as whole tokens *after* lower-casing and
    punctuation removal, so "And", "OR", "and," and repeated connectors
    ("a and and b") are all removed consistently.

    Arguments:
    - in_string; the raw text to clean.
    Returns:
    - The cleaned, single-space-separated, lower-case string.'''
    cleaned = in_string.lower().translate(_PUNCT_TABLE)
    # split() with no argument also trims and collapses all whitespace.
    kept = [tok for tok in cleaned.split() if tok not in _CONNECTORS]
    return ' '.join(kept)
def bayes_compare_language(l1, l2, ngram = 1, prior=.05, cv = None):
    '''Score every vocabulary item by how strongly it separates two samples.

    For each n-gram the function computes the difference of its smoothed
    log-odds in `l1` and in `l2`, divides it by an approximate standard
    deviation, and returns the resulting z-scores. Positive scores mean the
    n-gram is over-represented in `l1`, negative scores in `l2`.

    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
      2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a number describing a uniform prior, or a vector (list / array with
      one entry per vocabulary item) describing a prior over vocabulary items. If you're
      using a predefined vocabulary, make sure to specify that when you make your
      CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple,
      sorted by ascending zscore.
    Raises:
    - ValueError; if a non-uniform prior is given without a CountVectorizer, or if the
      prior vector length does not match the vocabulary size.'''
    # Any 0-dimensional value (float, int, NumPy scalar) counts as a uniform prior.
    uniform_prior = np.ndim(prior) == 0
    if cv is None and not uniform_prior:
        # A prior vector only makes sense against a vocabulary fixed by the caller.
        raise ValueError(
            "If using a non-uniform prior, please also pass a count vectorizer "
            "with the vocabulary parameter set.")

    # This is some basic normalization, not tokenization.
    docs1 = [basic_sanitize(doc) for doc in l1]
    docs2 = [basic_sanitize(doc) for doc in l2]

    # max_df and min_df are intentionally not set on the default vectorizer.
    if cv is None:
        cv = CV(decode_error = 'ignore', ngram_range=(1,ngram),
                binary = False,
                max_features = 15000)

    # Document-term matrix: one row per document (l1 rows first, then l2 rows),
    # one column per vocabulary item. It is kept in whatever (sparse) form the
    # vectorizer returns; only the per-sample column sums are needed.
    counts_mat = cv.fit_transform(docs1 + docs2)
    vocab_size = len(cv.vocabulary_)

    if uniform_prior:
        priors = np.full(vocab_size, float(prior))
    else:
        priors = np.asarray(prior, dtype=np.float64).ravel()
        if priors.size != vocab_size:
            raise ValueError(
                "prior has {} entries but the vocabulary has {} items".format(
                    priors.size, vocab_size))

    # Now sum over languages: per-term counts in each sample.
    n_docs1 = len(docs1)
    counts1 = np.asarray(counts_mat[:n_docs1].sum(axis=0), dtype=np.float64).ravel()
    counts2 = np.asarray(counts_mat[n_docs1:].sum(axis=0), dtype=np.float64).ravel()

    a0 = priors.sum()    # total prior mass
    n1 = counts1.sum()   # sum of words in sample 1
    n2 = counts2.sum()   # sum of words in sample 2

    # Smoothed log-odds of each term within its own sample:
    # (count + prior) / (sample total + total prior - count - prior)
    log_odds1 = np.log((counts1 + priors) / (n1 + a0 - counts1 - priors))
    log_odds2 = np.log((counts2 + priors) / (n2 + a0 - counts2 - priors))
    delta = log_odds1 - log_odds2

    # Variance of delta; this formula is an approximation.
    var = 1. / (counts1 + priors) + 1. / (counts2 + priors)
    z_scores = delta / np.sqrt(var)

    index_to_term = {idx: term for term, idx in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]
if __name__ == "__main__":
    # Example call:
    #   l1 = ["dancing", "dancing", "dancing", "sport", "ballet", "swimming", "running", "hiking"]
    #   l2 = ["singing", "singing", "singing", "traveling", "thinking", "talking", "chatting"]
    #   print(bayes_compare_language(l1, l2, ngram = 1, prior=.01, cv = None))

    # Load the personas; the file is read as a mapping persona id -> attributes.
    personas_path = os.path.join(PATH_TO_TEXT_FILES, 'us_5000_with_interests.json')
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(personas_path, encoding='utf-8') as f:
        personas = json.load(f)
    if not personas:
        # Nothing to compare, and the group share below would divide by zero.
        raise SystemExit(f"No personas found in {personas_path}")

    # Reference corpus: the interests of every persona (this includes the
    # members of whichever group is being compared against it).
    # NOTE(review): "interests" is assumed to be a single string per persona,
    # since it is passed straight to basic_sanitize -- confirm against the data.
    all_interests = [attrs["interests"] for attrs in personas.values()]

    demos = {'gender': ['Woman', 'Man', 'Nonbinary'],
             'race/ethnicity': ['White', 'Black', 'Latino', 'Asian', 'Native American/Alaska Native', 'Native Hawaiian'],
             'religion': ['Protestant', 'Catholic', 'Jewish', 'Muslim', 'Hindu', 'Buddhist', 'Unreligious'],
             'political affiliation': ['Democrat', 'Republican', 'Independent']}

    for demo, categories in demos.items():
        for category in categories:
            # Interests of the personas whose `demo` attribute equals `category`.
            demo_interests = [attrs["interests"] for attrs in personas.values()
                              if attrs[demo] == category]
            group_count = len(demo_interests)

            print("\n\n")
            print(f"Comparing interests for {demo} {category}...")
            print(f"Group count: {group_count/len(personas)*100}%")

            z_scores = bayes_compare_language(demo_interests, all_interests, ngram = 1, prior=.01, cv = None)

            # Results come back sorted by ascending z-score; walk them from the
            # highest score down and keep terms above the 1.96 cut-off, i.e.
            # terms over-represented in the group. (The symmetric z < -1.96
            # tail would be the under-represented terms.)
            print("\nInterested in")
            print("-------------------------")
            for word, z in reversed(z_scores):
                if z > 1.96:
                    print(f"{word},", end =" ")
            print("\n\n")