-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
214 lines (154 loc) · 8.99 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# Survey Sight: a Streamlit app that summarises free-text survey responses
# with the OpenAI chat API. This section configures the page and renders the
# static intro copy shown on every rerun.
# NOTE(review): several strings below (e.g. the "π **FYI**" header) contain
# mojibake where emoji were corrupted by an encoding round-trip — confirm the
# intended glyphs against the original repository before changing them.
import streamlit as st
import pandas as pd
st.set_page_config(page_title="Survey Sight", page_icon= ":bar_chart:")
st.title("Survey:blue[Sight]")
st.header("Freetext Survey Feedback Analyzer")
st.header("Powered by :blue[Chat GPT] :robot_face:",divider='rainbow')
st.write("Upload your freetext survey responses to get a detailed summary of what"
" your respondents are saying. No need to read through hundreds "
"of responses ever again!")
#st.markdown("<hr>", unsafe_allow_html=True)
st.header("π **FYI**")
st.write("This only works with Excel or CSV files.")
st.write("Your survey responses should be contained in a single column, with one row per response.")
st.write("Your Excel / CSV file can contain other non related columns too, as we will choose the specific response column after upload.")
st.write("Due to ChatGPT's limit on the number of words it can process, this app may not be able to analyse all of your responses. (GPT 4 can process twice as many words as GPT 3)")
st.markdown("<hr>", unsafe_allow_html=True)
st.write("First of all, do you have your own data?")
# The visible question is the st.write above; the radio's own label is hidden.
data = st.radio(options= ["Yes", "No"], label='hello', label_visibility ="hidden")
# Offer a cached sample dataset to users with no data of their own; otherwise
# prompt for an upload. Both paths converge on the file uploader below, and
# st.stop() halts the script on every rerun until a file is provided.
if data == "No":
    st.write("That's fine! If you don't currently have a dataset to play around with, "
             "you can use this. It's a collection of Google Play Store reviews "
             "for an app called Any.do. The user reviews are in the *content* column."
             " Click the button below to download the dataset and continue onto the next step." )
    url1 = "https://raw.githubusercontent.com/soliverc/Streamlit-OpenAI-Survey-Analyzer/main/.testdata/anydo.csv"

    @st.cache_data
    def download_sample(link):
        """Fetch the sample CSV once; st.cache_data reuses it across reruns."""
        return pd.read_csv(link)

    freefile = download_sample(url1)
    import io
    csv_buffer = io.StringIO()
    # index=False: without it the pandas RangeIndex is written as a spurious
    # unnamed first column, which would reappear on re-upload.
    freefile.to_csv(csv_buffer, index=False)
    st.download_button(
        label="Download data as CSV",
        data=csv_buffer.getvalue(),
        file_name='PlayStoreReviews.csv',
        mime='text/csv',
    )
else:
    st.write("Great! Upload your file below. ")
uploaded_file = st.file_uploader("π½ Your data is deleted when you close the browser tab!", type=['xlsx','csv'])
if uploaded_file is None:
    st.info("Upload your Excel or CSV file above. π")
    st.stop()
@st.cache_data
def load_data(file):
    """Read the uploaded file into a DataFrame, trying CSV then Excel.

    Streamlit hands us a file-like UploadedFile; pd.read_csv raises on xlsx
    content, which triggers the Excel fallback.
    """
    try:
        data = pd.read_csv(file)
    except Exception:  # narrowed from bare `except:` so SystemExit etc. propagate
        # read_csv may have advanced the stream cursor before failing;
        # rewind so read_excel sees the file from the start.
        file.seek(0)
        data = pd.read_excel(file)
    return data

df = load_data(uploaded_file)
st.write(f"Great! Here's a sample of your data. There are {df.shape[1]} columns and {df.shape[0]:,} rows.")
st.dataframe(df.head())
st.write("Now please select which column contains the free-text user responses")
# index=None: no column preselected, so the analysis section below stays
# hidden until the user makes an explicit choice.
selected_column = st.selectbox(index=None, options = df.columns, label='π')
# Main analysis pipeline: preview responses, collect survey context, clean the
# text, budget tokens for the chosen model, and summarise via the OpenAI API.
if selected_column:
    @st.cache_data
    def showsample(col):
        """Render five random responses from the selected column."""
        for thing in col.sample(5):
            st.write("βοΈ" + thing)

    st.markdown("<hr>", unsafe_allow_html=True)
    st.subheader("Here are five random responses.")
    showsample(df[selected_column])
    st.markdown("<hr>", unsafe_allow_html=True)
    st.subheader("We are ready to extract the relevant topics.")
    st.write("Please write a short description of what "
             "this survey is about. This will aid the model in "
             "interpreting the results. For example: "
             "***'These responses are comments from customers "
             "of a large retail store that sells "
             "a range of food, electronics and clothing. They were asked "
             "how customer service could be improved'***")
    theme = st.text_area(label='π')
    if theme:
        # API key comes from the app's Streamlit secrets, not from the user.
        api_key_user = st.secrets["apikey"]
        st.write("Which GPT model would you like to use?")
        selected_model = st.selectbox(index=None, options = ['gpt-3.5-turbo','gpt-4'], label='π')
        if st.button("Submit to begin survey analysis"):
            st.write("π Let's get started! π")
            from openai import OpenAI
            client = OpenAI(api_key = api_key_user)

            def get_completion(prompt, model=selected_model):
                """Send a single user message; temperature=0 for repeatable summaries."""
                messages = [{"role": "user", "content": prompt}]
                response = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=0
                )
                return response.choices[0].message.content

            # --- cleaning ---
            # Drop one-word responses: they carry no analysable signal.
            df = df[df[selected_column].str.split().apply(len) > 1]
            import emoji

            def remove_emojis(text):
                """Turn emoji into :name: codes so the regex below can strip them."""
                return emoji.demojize(text)

            df[selected_column] = df[selected_column].apply(remove_emojis)
            # Strip the :smiley:-style placeholders demojize produced.
            df[selected_column] = df[selected_column].str.replace(':[^:]+:', '', regex=True)

            # --- token budgeting ---
            # GPT context windows are limited; we only send responses whose
            # cumulative token count fits, leaving headroom for the reply.
            import tiktoken

            def num_tokens_from_string(string: str, encoding_name: str) -> int:
                """Count tokens in *string* under the tokenizer for model *encoding_name*."""
                encoding = tiktoken.encoding_for_model(encoding_name)
                num_tokens = len(encoding.encode(string))
                return num_tokens

            # Hoisted: the original resolved the model's tokenizer once per row
            # via df.apply(axis=1); one lookup + a column apply is equivalent
            # and far cheaper.
            _encoding = tiktoken.encoding_for_model(selected_model)
            df['Tokens'] = df[selected_column].apply(lambda s: len(_encoding.encode(s)))
            # Shuffle so the token-limited subset is a random sample, not just
            # the first rows of the file.
            df = df.sample(frac=1, random_state=42).reset_index(drop=True)
            df['CumulativeTokens'] = df['Tokens'].cumsum()
            # Budget leaves ~1000 tokens of headroom for the model's response.
            if selected_model == 'gpt-3.5-turbo':
                limit = 3500
            else:
                limit = 7000
            selected_responses = df[df['CumulativeTokens'] <= limit][selected_column].to_list()
            number_of_responses = len(selected_responses)
            pcent_of_analysed_responses = round(number_of_responses/df.shape[0]*100)
            # Prompt text repaired: "desrcription" -> "description", and the
            # corrupted "heatitleder"/"heatitleer" -> "title", matching the
            # wording used for the first two sections.
            prompt = f"""
            You will be provided by a list of comments taken from a survey. The person who uploaded the survey data has written a short description of what the survey is about. Here is the context of the survey: {theme}.
            After reading the context and background to the survey, you will assume the role of an expert in this area.
            Your goal is to read through the responses and give the following outputs:
            Positive Comments: Summarise common positive feedback in three sentences or less. Write in sentences, not bullet points. Add a title in bold to this section called "Positive Comments:"
            Negative Comments: Summarise common negative feedback in three sentences or less. Write in sentences, not bullet points. Add a title in bold to this section called "Negative Comments:"
            Sentiment Summary: Give a general summary of sentiment of the overall respondents. Write in sentences, not bullet points. Add a title in bold to this section called "Sentiment Summary:"
            Recommendations. Give recommendations going forward, while keeping the context in mind. Write in sentences, not bullet points. Add a title in bold to this section called "Recommendations:"
            Here is the feedback for you to study: {selected_responses}
            """
            with st.spinner(f"Analysing {number_of_responses} responses. {pcent_of_analysed_responses}% of the total responses. Please wait..."):
                response = get_completion(prompt)
            # NOTE(review): the icon string below is mojibake from the scrape;
            # st.success expects a single emoji — confirm the original glyph.
            st.success("Analysis Complete!", icon ="π€")
            st.write(response)
            # NOTE(review): original indentation was lost; these closing lines
            # are placed after the results, where the refresh hint makes sense.
            st.markdown("<hr>", unsafe_allow_html=True)
            st.subheader("Please press your browser refresh button π if you would like to clear all outputs and start again.")
            st.markdown("<hr>", unsafe_allow_html=True)