This repository has been archived by the owner on Mar 10, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
gensim_engine.py
174 lines (142 loc) · 6.48 KB
/
gensim_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import argparse
import csv
import logging
import re
import json
import sys
import warnings
import os
from itertools import chain, repeat
from operator import itemgetter
from gensim import corpora, models
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Phrases
from nltk.corpus import stopwords
from collections import Counter
from corpus_building import CorpusReader
import pyLDAvis
import pyLDAvis.gensim
import gensim
# Promote every warning to an exception so that problems in the training
# pipeline (e.g. library deprecation warnings) fail loudly instead of
# scrolling past unnoticed.
warnings.filterwarnings('error')
# Raise the csv module's default field-size limit so very large document
# bodies can be read from CSV without a "field larger than limit" error.
csv.field_size_limit(sys.maxsize)
class GensimEngine(object):
    """
    Thin wrapper around gensim's LDA training.

    Holds a bag-of-words corpus, its dictionary, and per-document
    metadata, and knows how to train an LDA model over them.
    """

    def __init__(self, corpus, dictionary, document_metadata, log=False, corpus_reader=None):
        """
        `corpus` is a bag of words (a list of lists of (id, count) tuples),
        `dictionary` maps word ids to strings, and `document_metadata` holds
        one metadata dict per document.
        """
        self.topics = []
        self.ldamodel = None
        self.corpus = corpus
        self.dictionary = dictionary
        self.document_metadata = document_metadata
        # TODO: this shouldn't be needed
        self.corpus_reader = corpus_reader
        if not log:
            return
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.INFO)

    @staticmethod
    def from_documents(documents, log=False, dictionary_path=None, **reader_kwargs):
        """
        Build an engine from raw documents.

        `documents` is expected to be a list of dictionaries, each with at
        least a `base_path` and `text` key.
        """
        reader = CorpusReader(**reader_kwargs)
        corpus, dictionary = reader.build_corpus(
            documents, dictionary_path=dictionary_path)
        metadata = [{'base_path': doc['base_path']} for doc in documents]
        return GensimEngine(
            corpus,
            dictionary,
            document_metadata=metadata,
            log=log,
            corpus_reader=reader,
        )

    @staticmethod
    def from_experiment(name, log=False):
        """Build an engine from a previously saved experiment."""
        saved = Experiment.load(name)
        return GensimEngine(
            saved.corpus, saved.dictionary, saved.document_metadata, log=log)

    def train(self, number_of_topics=20, words_per_topic=8, passes=50):
        """
        Train the LDA algorithm against the documents given to the
        initializer.  The number of topics and the number of passes the
        algorithm makes over the data are configurable.  Returns an
        Experiment wrapping the trained model.
        """
        print("Generate LDA model")
        self.ldamodel = gensim.models.ldamodel.LdaModel(
            self.corpus,
            num_topics=number_of_topics,
            id2word=self.dictionary,
            passes=passes,
        )
        shown = self.ldamodel.show_topics(
            num_topics=number_of_topics,
            num_words=words_per_topic,
            formatted=False,
        )
        self.topics = [
            {'topic_id': topic_id, 'words': words}
            for topic_id, words in shown
        ]
        return Experiment(
            model=self.ldamodel,
            corpus=self.corpus,
            dictionary=self.dictionary,
            document_metadata=self.document_metadata,
        )
class Experiment(object):
    """
    Each experiment contains a corpus of words and an LDA model of it.

    An experiment bundles the trained model with the corpus, dictionary
    and document metadata it was built from, and can be persisted to and
    reloaded from disk by name.
    """
    # Experiments live under ./experiments/<name>/models/<suffix>
    DEFAULT_EXPERIMENT_PATH = os.path.join('experiments')

    def __init__(self, model, corpus, dictionary, document_metadata):
        self.ldamodel = model  # Trained LDA model
        self.corpus = corpus  # List of documents where each document is a bag of words
        self.dictionary = dictionary  # Id -> Text mapping for terms
        self.document_metadata = document_metadata  # List of document metadata

    @staticmethod
    def load(experiment_name, path=DEFAULT_EXPERIMENT_PATH):
        """
        Load a previously saved experiment by name and return it.
        """
        model_filename = Experiment._filename(path, experiment_name, 'model')
        corpus_filename = Experiment._filename(path, experiment_name, 'corpus')
        dictionary_filename = Experiment._filename(path, experiment_name, 'dict')
        meta_filename = Experiment._filename(path, experiment_name, 'meta')
        model = gensim.models.ldamodel.LdaModel.load(model_filename)
        corpus = list(corpora.MmCorpus(corpus_filename))
        dictionary = corpora.Dictionary.load_from_text(dictionary_filename)
        with open(meta_filename) as metafile:
            document_metadata = json.load(metafile)
        return Experiment(model=model, corpus=corpus, dictionary=dictionary, document_metadata=document_metadata)

    def save(self, experiment_name, path=DEFAULT_EXPERIMENT_PATH):
        """
        Persist the model, corpus, dictionary and document metadata to
        disk under `experiment_name` so they can be reloaded with `load`.
        """
        model_filename = self._filename(path, experiment_name, 'model')
        corpus_filename = self._filename(path, experiment_name, 'corpus')
        dictionary_filename = self._filename(path, experiment_name, 'dict')
        meta_filename = self._filename(path, experiment_name, 'meta')
        # json.dump writes str, so the file must be opened in text mode
        # ('wb' raises TypeError on Python 3).
        with open(meta_filename, 'w') as metafile:
            json.dump(self.document_metadata, metafile)
        self.ldamodel.save(model_filename)
        corpora.MmCorpus.serialize(corpus_filename, self.corpus)
        self.dictionary.save_as_text(dictionary_filename)

    def visualise(self, filename):
        """
        Visualise the topics generated, writing an HTML page to `filename`.
        """
        # Create visualisation
        viz = pyLDAvis.gensim.prepare(self.ldamodel, self.corpus, self.dictionary)
        # Output HTML object
        pyLDAvis.save_html(data=viz, fileobj=filename)

    def topics(self, number_of_topics=20, words_per_topic=8):
        """
        Return the model's topics as a list of dicts with `topic_id` and
        `words` keys.
        """
        raw_topics = self.ldamodel.show_topics(
            num_topics=number_of_topics,
            num_words=words_per_topic,
            formatted=False)
        return [{'topic_id': topic_id, 'words': words} for topic_id, words in raw_topics]

    def tag(self, top_topics=3):
        """
        Given a list of documents (dictionary with `base_path` and `text`), this
        method adds an extra key to each of the dictionaries with the top 3
        topics associated with the document.
        """
        tagged_documents = []
        for document_number, document in enumerate(self.document_metadata):
            document['tags'] = self.topics_for(document_number, top_topics)
            tagged_documents.append(document)
        return tagged_documents

    def topics_for(self, document_number, top_topics=3):
        """
        Given a single document, it returns a list of topics for the document,
        sorted by probability (highest first) and truncated to `top_topics`.
        """
        document_bow = self.corpus[document_number]
        # Tag the document
        all_tags = self.ldamodel[document_bow]
        tags = sorted(all_tags, key=itemgetter(1), reverse=True)[:top_topics]
        return tags

    @staticmethod
    def _filename(path, experiment, suffix):
        """Build the on-disk path for one artefact of an experiment."""
        return os.path.join(path, experiment, 'models', suffix)