# interface.py (forked from TeeOhh/tRECS)
# -*- coding: utf-8 -*-
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, Event, State
import plotly.graph_objs as graph
import pickle
import pandas as pd
import iqss_interface
import utils
import os
from visualizations import lda_cluster_graph as lda_cluster
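## Environment note: these imports target the legacy Dash stack (the separate
## dash_core_components / dash_html_components packages and the `Event`
## dependency, all removed in later Dash releases), so a pre-0.40 Dash install
## is assumed. An illustrative setup (exact versions are not pinned by this repo):
##   pip install "dash<0.40" dash-core-components dash-html-components plotly pandas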
'''
Description: This app is designed to let users do in-depth preprocessing and cleaning of their language datasets,
then build different statistical NLP models such as TF-IDF, LSA, LDA, and spaCy (GloVe vectors), then
test a similarity/recommendation system, and finally analyze the dataset and models by visualizing the
different entities, topic clustering, etc. The idea is to give the user a high-level application
that does all of this within a GUI rather than requiring them to write code.
Pages:
Uploading page - User uploads data and selects the necessary columns
Cleaning page - User selects from the different cleaning options provided
Cleaning page (2) - User selects thresholds to remove words that appear in too few or too many documents.
Model building page - User selects the desired statistical models to be built
Testing page - User tests the models using a recommendation system, entering a document name and getting a chosen
amount of recommended (similar) documents
Analyzing page - User visualizes the different categories of entities within the data and
the clusters of topics within the data.
'''
app = dash.Dash()
app.config.supress_callback_exceptions=True
## Load current directory files for uploading of matrix
## Get current directory
cd = os.getcwd()
## Create list of files in current directory
files = list(os.listdir(cd))
# ----- LOAD EXTERNAL STYLING -----
my_css_url = "https://codepen.io/anon/pen/bRxMvz.css"
app.css.append_css({
"external_url": my_css_url
})
## keep track of current step so slider can be updated when button is pressed
step = 0
## build instance of the backend object
interface_obj = iqss_interface.iqss_interface()
## ----- START OF CONTENT -----
app.layout = html.Div([
## Container with top 'progress bar' and next button
html.Div([
html.P('VPAL RECOMMENDATION SYSTEM DEVELOPER')
], id='header'),
html.Div([
html.Div([
dcc.Slider(
id='slider',
min = 0,
max = 5,
marks = {0: 'Load Data', 1: 'Clean', 2: 'Clean (part 2)', 3: 'Build Model(s)', 4: 'Test', 5: 'Analyze'},
value = 0,
disabled='true',
)
], className='column', id='top-slider-container'),
html.Button('Next Step', id='next-button', className = 'next-btn-invis'),
## Content place holder for each page ('step') of the process
html.Div(id='content')], className = 'container')
])
# ----- START OF INTERACTIVITY -----
@app.callback(
## When the slider value changes, display the content for the corresponding page in the main container
Output(component_id='content', component_property='children'),
[Input(component_id='slider', component_property='value')]
)
def display_page(slider_value):
## ----- Load Data Page -----
# '''
# Description: This page allows the user to choose from a list of .csv files in the current directory to upload.
# The user can then choose the desired columns to run through the interface (label column, and data column).
# The user will see a live preview of the uploaded dataframe when both the label and data column are chosen.
# '''
global interface_obj
if slider_value == 0:
return (
html.Div(['Upload your data and select the columns and you \
will see a preview of your data here...'], id = 'preview-df'),
html.Div([
dcc.Dropdown(
id = 'file-chooser',
options = [
{'label' : file, 'value' : file} for file in files if '.csv' in file
],
placeholder = 'Choose file...'
)
], id='file-choosing-container'),
html.Div([
html.P('LOAD DATA'),
html.Button('Upload', id='upload-btn')
], className = 'section-container-s'),
html.Div([
html.P('CHOOSE COLUMNS'),
html.Div([
html.P('Label Column'),
dcc.Dropdown(
options=[],
id='label-col'
)], className = 'half-container'),
html.Div([
html.P('Data Column'),
dcc.Dropdown(
options=[],
id='data-col'
)], className = 'half-container')
], className = 'section-container-m'),
)
## ----- Clean Data Page -----
# '''
# Description: This page allows the user to select a number of cleaning steps to clean their data.
# Default cleaning steps: Removing non-ascii and removing punctuation.
# Optional cleaning steps: Removing HTML, removing words that occur in only one document, lowercasing all words,
# removing all 'stop' words, removing all numbers, removing all non-English words, and stemming or lemmatizing all words.
# '''
elif slider_value == 1:
## Build document frequency matrix from clean_df for slider functionality
interface_obj.default_clean()
interface_obj.build_doc_freq()
return (
html.Div(['If you choose to select additional options you will \
see a preview of your data after being cleaned here...'], id='clean-preview'),
html.Div([
html.P('DEFAULT OPTIONS'),
dcc.Checklist(
options = [
{'label' : 'Remove non-ascii', 'value' : 'ascii', 'disabled' : 'true'},
{'label' : 'Remove punctuation', 'value' : 'punctuation', 'disabled' : 'true'},
],
id= 'default-options',
values = ['ascii', 'punctuation'],
labelClassName = 'cleaning-options'
)
], className = 'default-container'),
html.Div([
html.P('ADDITIONAL OPTIONS'),
html.Div([
dcc.Dropdown(
options = [
{'label' : 'Remove HTML', 'value' : 'html'},
{'label' : 'Remove words that only occur in one document', 'value' : 'once'},
{'label' : 'Lowercase', 'value' : 'lower'},
{'label' : 'Remove stop words', 'value' : 'stop'},
{'label' : 'Remove all numbers', 'value' : 'numbers'},
{'label' : 'Remove all non-English words', 'value' : 'nonenglish'},
],
multi=True,
id='add-options'),
], className = 'half-container'),
html.Div([
dcc.Dropdown(
options = [
{'label' : 'Stem', 'value' : 'stem'},
{'label' : 'Lemmatize', 'value' : 'lemma'}
],
id='stem-lem-options'
)
], className = 'half-container'),
]),
)
## ----- Cutoff Page -----
# '''
# Description: This page allows the user to cut off words whose document frequency falls below or above chosen thresholds.
# (A common lower cutoff in the literature is words appearing in fewer than 1% of documents.)
# '''
elif slider_value == 2:
## Rebuild document frequency matrix after finishing cleaning
interface_obj.build_doc_freq()
return html.Div([
html.Div(id='download-confirmation'),
html.Button('DOWNLOAD DATA', id='download-btn', style={'marginBottom' : '3%'}),
html.Div([], id='cutoff-confirmation'),
html.Div([
html.P('Bottom Threshold'),
html.Div(id='bottom-thresh', className='cutoff_words')],
className='cutoff_container', style={'paddingRight' : '5%'}),
## --- GRAPH AND THRESHOLD CUTOFF CONTAINER ---
html.Div([
html.P('CUT OFF THRESHOLDS (Optional)',
),
## Graph of words and document frequency: x = doc. freq., y = ordered list of words
dcc.Graph(
id='graph-with-slider',
animate=True,
## Set data: x = doc. freq., y = index of dataframe (ordered word IDs)
figure={'data': []},
),
dcc.RangeSlider(
id='percent-slider',
min=0,
max=20,
step=.2,
marks={i: str(i * 5) for i in range(0, 21)},
value = [0, 20]),
], id='graph-container'),
html.Div([
html.P('Top Threshold'),
html.Div(id='top-thresh', className='cutoff_words')],
className='cutoff_container', style={'paddingLeft' : '5%'}),
html.Div([
html.Button('CUTOFF', id='cutoff-btn')
], style={'clear' : 'both'})
], id='cutoff_main_container')
## ----- Build Model Page -----
# '''
# Description: This page allows the user to select and build a number of statistical language models for their data.
# Model options: TF-IDF, LDA, LSA, SpaCy
# '''
elif slider_value == 3:
return (html.Div([
html.Div([], id='build-confirmation'),
html.Div([
html.P('BUILD MODEL(S)')
], className='section-container-s'),
dcc.Checklist(
options = [
{'label' : 'Tf-Idf', 'value' : 'tf_idf'},
{'label' : 'LSA', 'value' : 'lsa'},
{'label' : 'LDA', 'value' : 'lda'},
{'label' : 'Spacy', 'value' : 'spacy'},
],
values = [],
id='model-choices',
labelClassName = 'model_choices'
),
]),
html.Div([
html.Div([
html.P('tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.')
], className='fourth-container'),
html.Div([
html.P('Latent semantic analysis (LSA) is a technique in natural language processing, in particular distributional semantics, of analyzing relationships between a set of documents and the terms.')
], className='fourth-container'),
html.Div([
html.P('Latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.')
], className='fourth-container'),
html.Div([
html.P('spaCy excels at large-scale information extraction tasks. It\'s written from the ground up in carefully memory-managed Cython, and independent research has found it to be among the fastest NLP libraries available.')
], className='fourth-container')
]),
html.Div([
html.Button('BUILD', id='build-btn')
], style={'clear' : 'both'}, className='section-container-m')
)
## ----- Test Page -----
# '''
# Description: This page allows the user to select a label (course name) from the label column they chose when uploading.
# From here, the user enters the number of similar labels (courses) they wish to see.
# The user then presses the run button and the system outputs the top X most similar labels.
# '''
elif slider_value == 4:
courses = interface_obj.clean_df.iloc[:, 0]
return html.Div([
html.Div([
dcc.Input(
placeholder = 'Enter amount...',
type='number',
id = 'amount-entry'
),
html.Button('RUN', id='get-similar-btn')
]),
html.Div([
html.Div([
html.P('CHOOSE A COURSE', className = 'test-headers'),
dcc.Dropdown(
options = [
{'label' : course_name, 'value' : course_name} for course_name in courses
],
id = 'course-selection'
)
], className = 'half-container'),
html.Div([
html.P('SIMILAR COURSES', className = 'test-headers'),
html.Ul(
children = [],
id='similar-courses',
style={'listStyleType' : 'none'})
], id = 'course-recom-container')
], id = 'testing-container')
])
## ----- Analyze Page -----
# '''
# Description: This page allows the user to select two different methods of analyzing their data.
# Analyzing options: detecting entities (using spaCy) and visualizing the LDA topic models.
# '''
elif slider_value == 5:
return html.Div([
html.Div([
html.P('ANALYZING OPTIONS'),
dcc.Dropdown(
options = [
{'label' : 'Entities', 'value' : 'ents'},
{'label' : 'Visualize Topics', 'value' : 'topics'}
],
id = 'analyze-dropdown'
)], id='analyze-options'),
html.Div(id='analyze-content')
])
## ------- METHODS FOR APP FLOW AND BACKEND MANIPULATION -------
def upload_file(file_name):
# '''
# Description: Uploads the user selected file name and builds the dataframe in the backend object.
# Params: User selected filename from the dropdown.
# Returns: List of columns that exist in the csv file.
# '''
global interface_obj
interface_obj.load_data(file_name)
return interface_obj.df.columns.tolist()
def update_table(df, columns):
# '''
# Description: Builds an html table from the desired dataframe and columns.
# Params: Dataframe and the list of necessary column names
# Returns: Table sample of first few entries in dataframe
# '''
max_amount = len(df)
## If the data has fewer than 3 entries, display only one entry as a sample
if max_amount < 3:
return html.Table(
[html.Tr([html.Th(col) for col in columns])] +
[html.Tr([
html.Td(df.iloc[i][col][:150] + ' ...') for col in columns
]) for i in range(1)]
)
## If the data has 3 or more entries, display the first 3 entries as a sample
else:
return html.Table(
[html.Tr([html.Th(col) for col in columns])] +
[html.Tr([
html.Td(df.iloc[i][col][:150] + ' ...') for col in columns
]) for i in range(3)]
)
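## Illustrative call (hypothetical dataframe and columns):
##   update_table(interface_obj.df, ['title', 'description'])
## returns an html.Table with one header row plus up to three sample rows,
## each cell truncated to the first 150 characters of its value.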
## ------- WHEN USER PRESSES NEXT STEP, CHANGE CONTENT AND INCREMENT SLIDER --------
@app.callback(
Output(component_id='slider', component_property='value'),
events=[Event('next-button', 'click')]
)
def next():
# '''
# Description: Used to switch page content. When the next step button is pressed, increments or resets the global step variable.
# Params: None
# Returns: Incremented or reset global step variable.
# '''
global step
if step == 5:
step = 0
return step
else:
step += 1
return step
## ------- LOAD DROPDOWNS WITH COLUMN OPTIONS -------
@app.callback(
Output('label-col', 'options'),
events=[Event('upload-btn', 'click')],
state=[State('file-chooser', 'value')]
)
def load_label_drop(file_name):
# '''
# Description: Loads the label dropdown selector with all of the column options from the uploaded dataframe.
# Params: The file name of the csv file.
# Returns: Dropdown values & labels to fill the dash dropdown selector.
# '''
columns = upload_file(file_name)
return [{'label' : column, 'value' : column} for column in columns]
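## Illustrative: columns ['title', 'description'] become the dropdown options
##   [{'label': 'title', 'value': 'title'}, {'label': 'description', 'value': 'description'}]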
@app.callback(
Output('data-col', 'options'),
events=[Event('upload-btn', 'click')],
state=[State('file-chooser', 'value')]
)
def load_data_drop(file_name):
# '''
# Description: Loads the data dropdown selector with all of the column options from the uploaded dataframe.
# Params: The file name of the csv file.
# Returns: Dropdown values & labels to fill the dash dropdown selector.
# '''
columns = upload_file(file_name)
return [{'label' : column, 'value' : column} for column in columns]
## ------- UPDATE LIVE DATAFRAME PREVIEW WITH SELECTED COLUMNS -------
@app.callback(
Output('preview-df', 'children'),
[Input('label-col', 'value'), Input('data-col', 'value')]
)
def reduce_df(column1, column2):
# '''
# Description: When user selects options for label and data column, load the dataframe preview with the current selected columns.
# Params: Column name of label column and column name of data column.
# Returns: Confirmation that the user's dataframe has been uploaded, along with the dataframe preview returned from the update_table() method.
# '''
col1 = str(column1)
col2 = str(column2)
if col1 !='None' and col2 != 'None':
global interface_obj
interface_obj.load_columns(col1, col2)
return (
html.P('DATAFRAME PREVIEW'),
update_table(interface_obj.df, [col1, col2]))
else:
return 'Upload your data and select the desired columns and you \
will see a preview of your data here...'
## ------- HIDE NEXT BUTTON UNTIL USER UPLOADS DATA -------
@app.callback(
Output('next-button', 'className'),
[Input('label-col', 'value'), Input('data-col', 'value')]
)
def show_next_button(column1, column2):
# '''
# Description: If user has selected both a label and data column then show next button.
# Params: Label column name and data column names.
# Returns: Invisible classname for next button if both columns not selected and visible class name if both are.
# '''
col1 = str(column1)
col2 = str(column2)
if col1 !='None' and col2 != 'None':
return 'next-btn-visible'
else:
return 'next-btn-invis'
@app.callback(Output('clean-preview', 'children'),
[Input('default-options', 'values'),
Input('add-options', 'value'),
Input('stem-lem-options', 'value')]
)
def clean(default, additional, stem_lem):
# '''
# Description: When the user selects one of the additional cleaning options, update the page with a live preview of the first entry after cleaning with the selected options.
# Params: list of selected default options, list of additional options, string of value: 'stem' or 'lemma'
# Returns: Live update of first entry from dataframe after being cleaned
# '''
global interface_obj
## Recreate the clean dataframe from the uncleaned dataframe (which holds only the
## two selected columns) so the preview reflects the newly selected options
interface_obj.clean_df = interface_obj.df.copy()
## Do additional and default cleaning
if additional:
interface_obj.additional_clean(additional)
## If stem or lem selected do those
if stem_lem:
interface_obj.stem_lem_clean(stem_lem)
return (
html.P('CLEANING PREVIEW'),
update_table(interface_obj.clean_df.head(1), interface_obj.clean_df.columns))
@app.callback(Output('graph-with-slider', 'figure'),
[Input('percent-slider', 'value')])
def update_graph(threshold):
# '''
# Description: Updates the threshold graph with only the words that fall between the two user-selected thresholds from the dash slider.
# Params: Threshold tuple (lower threshold, upper threshold)
# Returns: Graph data for only the words and word frequencies of the words between the two thresholds.
# '''
global interface_obj
## Get bottom & top thresholds and rebuild the word/doc. freq. dataframe
bottom_threshold = float(threshold[0]) * float(5) / float(100) * float(len(interface_obj.clean_df))
top_threshold = float(threshold[1]) * float(5) / float(100) * float(len(interface_obj.clean_df))
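## Worked example (illustrative): with 200 documents and the slider at (2, 18),
## bottom_threshold = 2 * 5 / 100 * 200 = 20 documents and
## top_threshold = 18 * 5 / 100 * 200 = 180 documents, so only words appearing
## in 20-180 documents remain in the plotted dataframe.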
new_df1 = interface_obj.freq_matrix.where(interface_obj.freq_matrix[1] >= bottom_threshold).dropna()
new_df2 = interface_obj.freq_matrix.where(interface_obj.freq_matrix[1] <= top_threshold).dropna()
## Merge on words above bottom thresh. and words below top thresh.
interface_obj.freq_matrix_keep = pd.merge(new_df1, new_df2, how='inner', on=[0])
return {
## Set x as doc. freq. of words within bottom and top thresholds
'data' : [graph.Scattergl(
x = interface_obj.freq_matrix_keep['1_y'],
y = interface_obj.freq_matrix_keep.index,
mode = 'markers',
text = interface_obj.freq_matrix_keep[0]
)],
## Rebuild range of x and y axis with min and max of new dataframe
'layout' : graph.Layout(
xaxis = {'title' : 'Document Frequency'},
yaxis = {'title' : 'Word Ids (ordered by doc. freq.)'},
hovermode = 'closest',
)
}
@app.callback(Output('bottom-thresh', 'children'),
[Input('percent-slider', 'value')])
def populate_bottom(value):
# '''
# Description: Populate the bottom list of excluded words that are outside of the threshold, when the slider is changed.
# Params: Threshold tuple from slider value (lower threshold, upper threshold)
# Returns: List of words that are below the user selected threshold.
# '''
global interface_obj
threshold = float(value[0]) * float(5) / float(100) * float(len(interface_obj.clean_df))
bottom_words = interface_obj.freq_matrix.where(interface_obj.freq_matrix[1] < threshold).dropna()
bottom_words = bottom_words[0].tolist()[::-1]
return ", ".join(word for word in bottom_words)
@app.callback(Output('top-thresh', 'children'), [Input('percent-slider', 'value')])
def populate_top(value):
# '''
# Description: Populate the top list of excluded words that are outside of the threshold, when the slider is changed.
# Params: Threshold tuple from slider value (lower threshold, upper threshold)
# Returns: List of words that are above the user selected threshold.
# '''
global interface_obj
threshold = float(value[1]) * float(5) / float(100) * float(len(interface_obj.clean_df))
top_words = interface_obj.freq_matrix.where(interface_obj.freq_matrix[1] > threshold).dropna()
top_words = top_words[0].tolist()
return ", ".join(word for word in top_words)
@app.callback(Output('cutoff-confirmation', 'children'),
events=[Event('cutoff-btn', 'click')])
def cutoff_words():
# '''
# Description: Based on where the current slider values are, builds a list of words that are within the thresholds.
# Then remove all of the words that are not contained in this list of words from the uploaded data.
# Params: None
# Returns: Confirmation that the words have successfully been cutoff (removed) from the dataframe.
# '''
global interface_obj
interface_obj.remove_cutoff()
return html.P('Cut off completed successfully.')
@app.callback(Output('download-confirmation', 'children'),
events = [Event('download-btn', 'click')]
)
def download():
# '''
# Description: After user cleans (or doesn't) their data, when the download button is pressed, write clean dataframe out to disk.
# Params: None
# Returns: Confirmation that the data has been downloaded successfully.
# '''
global interface_obj
file_name = 'cleaned_data'
x = 1
## Append an incrementing numeric suffix so an existing file is never overwritten
## (cleaned_data.csv, cleaned_data1.csv, cleaned_data2.csv, ...)
while file_name + '.csv' in files:
    file_name = 'cleaned_data' + str(x)
    x += 1
file_name += '.csv'
interface_obj.clean_df.to_csv(file_name)
return html.P('Download completed successfully!')
@app.callback(Output('build-confirmation', 'children'),
events=[Event('build-btn', 'click')],
state = [State('model-choices', 'values')]
)
def build_models(values):
# '''
# Description: Based on what models the user has selected to build, build the models inside of the backend object.
# Params: Selected model values
# Returns: Confirmation that the models have been built
# '''
tf_idf, lsa, lda, spacy = False, False, False, False
if u'tf_idf' in values:
tf_idf = True
if u'lsa' in values:
lsa = True
if u'lda' in values:
lda = True
if u'spacy' in values:
spacy = True
global interface_obj
lsa_nt = utils.get_num_of_topics(interface_obj.clean_df)
if lsa_nt == 0:
lsa_nt = 300
lda_nt = lsa_nt // 3  ## integer topic count (explicit floor division, Python 2/3 safe)
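## Illustrative: an estimate of 300 LSA topics yields 100 LDA topics; the
## 300-topic fallback above only applies when utils.get_num_of_topics() returns 0.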
interface_obj.build_models(tf_idf, lsa, lsa_nt, lda, lda_nt, spacy)
return html.P('Models built successfully!')
@app.callback(Output('similar-courses', 'children'),
state = [State('amount-entry', 'value'), State('course-selection', 'value')],
events = [Event('get-similar-btn', 'click')]
)
def get_similar(amount, course_name):
# '''
# Description: Given a selected label (course), output the X most similar courses (where X is the amount the user enters).
# Params: Amount of similar courses to get and the label name to find similar for.
# Returns: List of amount of similar courses.
# '''
global interface_obj
courses = interface_obj.iqss_model.get_similar(course_name, int(amount))
## in form [(course1, score), (course2, score), ...]
course_names = [course[0] for course in courses]
return [html.Li(course) for course in course_names]
@app.callback(Output('analyze-content', 'children'),
[Input('analyze-dropdown', 'value')]
)
def show_analyzing_content(value):
# '''
# Description: Using the option that the user selects, show the content for that analyzing page.
# Params: Value from dropdown of analyzing page options.
# Returns: Content for the analyzing page.
# '''
global interface_obj
if value == 'ents':
interface_obj.load_ents()
x = [label for label, texts in interface_obj.all_entities.items()]
y = [len(texts) for label, texts in interface_obj.all_entities.items()]
return html.Div([
html.Div([
dcc.Graph(
id='entity-pie',
animate=True,
figure = {
'data' : [graph.Pie(
labels = x,
values = y,
textinfo = 'percent'
)]
}
),
],
className = 'entity-containers'),
html.Div([
dcc.Dropdown(
options = [{'label' : label, 'value' : label} for label, texts in interface_obj.all_entities.items()],
placeholder='Select entity label...',
id = 'entity-choice'
),
html.Div(id='entity-table-container')],
className = 'entity-containers')
])
elif value == 'topics':
global cluster_obj
cluster_obj = lda_cluster(interface_obj.clean_df)
cluster_obj.make_lda(10, 1)
return html.Div([
html.Div([
html.Div([
html.P('# OF TOPICS'),
dcc.Input(
placeholder='# of topics',
type='number',
value = 10,
id='vis-topics-input'
)
], className='half-container'),
html.Div([
html.P('# OF ITERATIONS'),
dcc.Input(
placeholder='# of iterations',
type='number',
value = 1,
id='vis-iteration-input'
)
], className='half-container'),
html.Div([
html.Button('BUILD', id='build-vis-btn')
], style={'clear' : 'both'})
], id='lda-vis-top-container'),
html.Div([
html.Div([
dcc.RadioItems(
options=[{'label' : '2D', 'value' : '2d'},
{'label' : '3D' , 'value' : '3d'}
],
value = '2d',
id='2d3d'
),
dcc.Slider(
min=0,
max=20,
step=.2,
marks={i: str(i * 5) for i in range(0, 21)},
id = 'lda-vis-slider',
value = 0,
)
],id = 'vis-options-container'),
html.Div([
dcc.Graph(id='lda-vis-graph', style={'marginTop' : '3%'})
], className = 'three-fourth-container'),
html.Div([
html.P('TOPICS'),
html.Ul(
children = [],
id='lda-vis-legend',
style={'listStyleType' : 'none'}
)
], id='legend-container', className = 'fourth-container'),
])
])
@app.callback(Output('entity-table-container', 'children'),
[Input('entity-choice', 'value')]
)
def load_entity_table(label):
# '''
# Description: Allows the user to analyze the individual words within each category of entity.
# Params: Selected category of entity from dropdown.
# Returns: HTML table of entities within the entity category.
# '''
global interface_obj
df = utils.get_dataframe(label, interface_obj.entity_dic)
columns = df.columns
return html.Table(
[html.Tr([html.Th(label)])] +
[html.Tr([html.Th(col) for col in columns])] +
[html.Tr([
html.Td(df.iloc[i][col]) for col in columns
]) for i in range(len(df))], id='entity-table'
)
@app.callback(Output('lda-vis-slider', 'value'),
state = [State('vis-topics-input', 'value'), State('vis-iteration-input', 'value')],
events = [Event('build-vis-btn', 'click')]
)
def build_lda_vis(num_of_topics, iterations):
# '''
# Description: Builds the LDA model based on the user selected number of topics and iterations.
# Params: Number of topics and number of iterations.
# Returns: Sets the slider value (threshold) to 0.
# '''
global cluster_obj
cluster_obj.make_lda(int(num_of_topics), int(iterations))
return 0
@app.callback(Output('lda-vis-graph', 'figure'),
[Input('lda-vis-slider', 'value'), Input('2d3d', 'value')]
)
def build_cluster_graph(slider_val, dim):
# '''
# Description: Builds the graph data for the LDA cluster visualizations.
# Params: Threshold slider value and the dimension of 2d or 3d.
# Returns: Figure and data for the LDA cluster graph visual.
# '''
global cluster_obj
slider_val2 = float(slider_val) * float(5) / float(100)
fig = cluster_obj.set_graph(slider_val2, dim)
return fig
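## Illustrative: a slider value of 10 maps to 10 * 5 / 100 = 0.5, so the 0-20
## slider positions are passed to set_graph() as a 0.0-1.0 threshold.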
@app.callback(Output('lda-vis-legend', 'children'),
[Input('lda-vis-slider', 'value'), Input('2d3d', 'value')]
)
def build_legend(slider_val, dim):
# '''
# Description: Creates the legend that displays the words that correspond to each topic.
# Color coded to match the cluster color in the graph.
# Params: Threshold slider value and the dimension of 2d or 3d.
# Returns: List of words that correspond to each cluster.
# '''
global cluster_obj
leg = cluster_obj.make_legend()
list_of_lis = []
for item, value in leg.items():
list_of_lis.append(html.Li(item, style={'color' : value}))
return list_of_lis
if __name__ == '__main__':
app.run_server(debug=True)
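## To try the app locally (assuming the legacy Dash environment noted above):
##   python interface.py
## then open http://127.0.0.1:8050/ in a browser (Dash's default host and port).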