This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Add sample ML-based topic modeling support #170

Open: wants to merge 107 commits into master from topic_modelling

Commits (107)
e24f3b7
Create token_pool.py
DonggeLiu Jun 29, 2017
9535b81
added the file created last time
DonggeLiu Jul 3, 2017
934da4b
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Jul 3, 2017
2a8a0f2
1. Two LDA model (with different package, not sure which one is bette…
DonggeLiu Jul 3, 2017
e888805
Merge branch 'topic_modelling' of github.com:berkmancenter/mediacloud…
DonggeLiu Jul 3, 2017
a23aa13
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Jul 3, 2017
bc462ba
General
DonggeLiu Jul 10, 2017
83a31a7
1. Define types for parameters and return values
DonggeLiu Jul 11, 2017
ced8bb4
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Jul 11, 2017
943c696
isolate import gensim to see if it causes failure #3839
DonggeLiu Jul 17, 2017
3db49ee
verifying the reason of errors
DonggeLiu Jul 17, 2017
06d1d37
reformat the output of model_gensim to make it in the same format as …
DonggeLiu Jul 17, 2017
e027dad
1. updated tests according to the changes I made in model_gensim.py
DonggeLiu Jul 17, 2017
336c0d8
added tests for model_lda.py
DonggeLiu Jul 17, 2017
178226b
trying to fix the 'module' object has no attribute 'plugin' problem
DonggeLiu Jul 18, 2017
ebc4715
reference topic_model module with full path
DonggeLiu Jul 18, 2017
39c5e8c
Merge branch 'master' into topic_modelling
pypt Jul 20, 2017
716fe91
added the requirement for sklearn, which supports the NMF algorithm
DonggeLiu Jul 24, 2017
f66ead6
Added msg for each assertion
DonggeLiu Jul 24, 2017
2d6c12d
added msg for each assertion
DonggeLiu Jul 24, 2017
6c50ed2
added model_nmf.py to model topics with the NMF algorithm
DonggeLiu Jul 24, 2017
679fef0
test cases for model_nmf.py
DonggeLiu Jul 24, 2017
3ab2124
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Jul 24, 2017
025dece
Merge branch 'topic_modelling' of github.com:berkmancenter/mediacloud…
DonggeLiu Jul 24, 2017
61517d1
sorted requirements.txt in alphabetical order
DonggeLiu Jul 24, 2017
36817b9
cache WordNet
DonggeLiu Jul 24, 2017
b5562ad
install the WordNet via NLTK
DonggeLiu Jul 24, 2017
e6b126c
relocate test files
DonggeLiu Jul 24, 2017
c93fe63
remove uncessary files after test suits relocation
DonggeLiu Jul 24, 2017
730a4e9
1. removed josn serialization after fetching sentences from database
DonggeLiu Jul 24, 2017
3b38dff
add .close to open file
DonggeLiu Jul 24, 2017
154f96d
add .close() to opened file
DonggeLiu Jul 24, 2017
5ea449a
suppress warning message caused by NLTK built-in method lemmatize()
DonggeLiu Jul 24, 2017
34fdcbc
restore the file (its content was mysteriously deleted)
DonggeLiu Jul 24, 2017
baca56c
removed path_helper.py and related codes
DonggeLiu Jul 24, 2017
fe78de8
add a file containing sample stories (can replace DB in tests)
DonggeLiu Jul 24, 2017
91d725e
1. Change the SQL query to be the same as suggested in previous PR re…
DonggeLiu Jul 24, 2017
0ca1eca
Seperated test cases for three models from db_connection
DonggeLiu Jul 24, 2017
dc0b73b
added explanation for each of the three modules used
DonggeLiu Jul 24, 2017
96f566c
removed redundant textblob in requirements
DonggeLiu Jul 24, 2017
c488c08
separate test_token_pool.py from database
DonggeLiu Jul 24, 2017
6d8555e
remove import path_helper
DonggeLiu Jul 26, 2017
6182c4f
Rearraged NLTK installation to make it system-wide
DonggeLiu Jul 26, 2017
9c68669
Use wget instead of nltk.download() to avoid 405 error
DonggeLiu Jul 26, 2017
0e04ff1
silent wget
DonggeLiu Jul 26, 2017
d995cb8
adding more echos and comments
DonggeLiu Jul 26, 2017
a361b01
turn on -n switch of unzip gh-pages.zip, preventing rewrite existing …
DonggeLiu Jul 27, 2017
db1c584
added COMMAND_PREFIX to use sudo on linux
DonggeLiu Jul 27, 2017
2a88eab
restore missing log4perl.conf
DonggeLiu Jul 27, 2017
b62e71d
Don't --force-reinstall stuff needlessly
pypt Jul 27, 2017
7922d3c
Install only WordNet data from NLTK data
pypt Jul 27, 2017
7ce27cc
Revert "added COMMAND_PREFIX to use sudo on linux"
pypt Jul 27, 2017
29d460c
Revert "turn on -n switch of unzip gh-pages.zip, preventing rewrite e…
pypt Jul 27, 2017
4008366
Revert "adding more echos and comments"
pypt Jul 27, 2017
c1da604
Revert "silent wget"
pypt Jul 27, 2017
7b6beaf
Revert "Use wget instead of nltk.download() to avoid 405 error"
pypt Jul 27, 2017
bf2c962
Install NLTK data from own mirror on S3
pypt Jul 27, 2017
482f01e
Install only WordNet data from NLTK data
pypt Jul 27, 2017
00633aa
Don't --force-reinstall stuff needlessly
pypt Jul 27, 2017
6f09e31
added punkt into nltk dependencies
DonggeLiu Aug 1, 2017
179da05
use sample handler to separate access to sample file from others
DonggeLiu Aug 7, 2017
1cf5601
1. make use of sample_handler.py to access sample file
DonggeLiu Aug 7, 2017
1d3ad5e
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Aug 7, 2017
81d6892
use full path of sample_handler.py
DonggeLiu Aug 7, 2017
8861d9e
Temporarily disable unit tests for Travis to cache dependencies
pypt Aug 8, 2017
c732a50
Revert "cache WordNet"
pypt Aug 8, 2017
65c505b
Revert "Temporarily disable unit tests for Travis to cache dependencies"
pypt Aug 8, 2017
73f7e2e
added a new abstract method for topic model classes to evaluate curre…
DonggeLiu Aug 9, 2017
ef35923
unify the name of models used in each class to self._model as in the …
DonggeLiu Aug 9, 2017
89882cd
implement the evaluation method based on the buit-in method likelihood()
DonggeLiu Aug 9, 2017
73e518c
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Aug 9, 2017
e2d6655
use the sample file instead of DB in Travis
DonggeLiu Aug 9, 2017
5289a85
Merge branch 'topic_modelling' of github.com:berkmancenter/mediacloud…
DonggeLiu Aug 9, 2017
00831af
edit the total number of topics
DonggeLiu Aug 9, 2017
59bcb50
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Aug 12, 2017
2c8e6eb
added tuning steps to find out the optimal topic number
DonggeLiu Aug 12, 2017
d1129a6
a finder that can identify the max/min points of a polynomial compute…
DonggeLiu Aug 13, 2017
4d5b9e4
added two methods tune_*() to find out the optimal number of topics
DonggeLiu Aug 13, 2017
8e77ed4
removed some print()s and rewrote evaluation()
DonggeLiu Aug 14, 2017
809aad7
added more test cases on checking the accuracy of the model via likel…
DonggeLiu Aug 14, 2017
f819366
improved polynomial tuning algorithm
DonggeLiu Aug 19, 2017
9869ca8
no longer test tune_with_iteration as polynomial has a sigificant bet…
DonggeLiu Aug 19, 2017
e185dd0
larger sample for Travis to test against
DonggeLiu Aug 19, 2017
3545e0e
modify tests accroding to change in sample_stories.txt
DonggeLiu Aug 19, 2017
7816ec8
use smaller sample size so that Travis will not fail
DonggeLiu Aug 20, 2017
94ebc24
do not test limit if limit is not specified
DonggeLiu Aug 20, 2017
c1c257e
improved tune with polynomial algorithm
DonggeLiu Aug 20, 2017
6d09265
removed uncessary tune_with_iteration as its advantage/feature has be…
DonggeLiu Aug 20, 2017
2479107
fixed the algorithm of optimal point finder
DonggeLiu Aug 20, 2017
51dd0ec
removed useless codes
DonggeLiu Aug 20, 2017
620afb4
Merge branch 'master' of github.com:berkmancenter/mediacloud into top…
DonggeLiu Aug 20, 2017
5ead4f2
Disable unit tests temporarily for Travis to have a chance to compile…
DonggeLiu Aug 20, 2017
0fb4e4a
Cache WordNet of NLTK
DonggeLiu Aug 20, 2017
87efd01
set test cases back
DonggeLiu Aug 20, 2017
6ea203b
revert the changes made on .travis.yml
DonggeLiu Aug 20, 2017
b675559
added more story samples
DonggeLiu Aug 21, 2017
8753442
new commits from git pull origin master
DonggeLiu Aug 21, 2017
e39415b
removed unnecessary code to keep higher level of accuracy
DonggeLiu Aug 21, 2017
a674d26
changed sample file name
DonggeLiu Aug 21, 2017
6267f72
this sample file has been replaced by 3 files with different size
DonggeLiu Aug 21, 2017
d4e9d48
use a smaller sample to test on Travis due to limit restriction
DonggeLiu Aug 21, 2017
0c3f7ee
1. break large block of codes up to more funcitons
DonggeLiu Aug 21, 2017
4c12748
remove uncessary code
DonggeLiu Aug 21, 2017
720dd7a
restructured tests to reduce running time
DonggeLiu Aug 21, 2017
97afc48
further improvements on the code structure
DonggeLiu Aug 22, 2017
016d01c
remove redudent code
DonggeLiu Aug 22, 2017
9ff15ff
Merge branch 'master' into topic_modelling
pypt Sep 1, 2017
Files changed
28 changes: 26 additions & 2 deletions install/install_python_dependencies.sh
@@ -41,8 +41,29 @@ echo "Installing (upgrading) Supervisor..."
 ( cd /tmp; $COMMAND_PREFIX pip2.7 install --upgrade supervisor )
 
 echo "Installing (upgrading) Virtualenv..."
-$COMMAND_PREFIX pip2.7 install --force-reinstall --upgrade virtualenv
-$COMMAND_PREFIX pip$PYTHON3_MAJOR_VERSION install --force-reinstall --upgrade virtualenv
+$COMMAND_PREFIX pip2.7 install --upgrade virtualenv
+$COMMAND_PREFIX pip$PYTHON3_MAJOR_VERSION install --upgrade virtualenv
 
+# Install system-wide NLTK because otherwise sudo is unable to find
+# NLTK installed in virtualenv on Travis
+
+echo "Installing (upgrading) NLTK to install NLTK's data afterwards..."
+$COMMAND_PREFIX pip$PYTHON3_MAJOR_VERSION install --upgrade nltk
+
+# Installing WordNet with NLTK
+# (installing from own mirror on S3 to avoid hitting GitHub: https://github.com/nltk/nltk/issues/1787)
+echo "Installing NLTK WordNet data..."
+if [ `uname` == 'Darwin' ]; then
+    NLTK_DATA_PATH=/usr/local/share/nltk_data
+else
+    NLTK_DATA_PATH=/usr/share/nltk_data
+fi
+
+$COMMAND_PREFIX python$PYTHON3_MAJOR_VERSION \
+    -m nltk.downloader \
+    -u https://s3.amazonaws.com/mediacloud-nltk-data/nltk_data/index.xml \
+    -d "$NLTK_DATA_PATH" \
+    wordnet punkt
+
 echo "Creating mc-venv virtualenv..."
 echo "$(which python$PYTHON3_MAJOR_VERSION)"
@@ -69,3 +90,6 @@ pip$PYTHON3_MAJOR_VERSION install --upgrade -r mediacloud/requirements.txt || {
 echo "'pip$PYTHON3_MAJOR_VERSION install' failed the first time, retrying..."
 pip$PYTHON3_MAJOR_VERSION install --upgrade -r mediacloud/requirements.txt
 }
+
+
+
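After the installer runs, the downloaded data can be smoke-tested. A minimal sketch, assuming only that nltk plus the wordnet and punkt packages installed above are present system-wide (the sample sentence is made up):

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# punkt backs word_tokenize(); wordnet backs lemmatize()
lemmatizer = WordNetLemmatizer()
tokens = word_tokenize("Stories are tokenized and lemmatized before modeling.")
print([lemmatizer.lemmatize(token) for token in tokens if token.isalpha()])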
10 changes: 9 additions & 1 deletion lib/MediaWords/Job/AnnotateWithCoreNLP.pm
@@ -31,12 +31,20 @@ use MediaWords::Util::CoreNLP;
 use MediaWords::DBI::Stories;
 use Readonly;
 
+# Having a global database object should be safe because
+# job workers don't fork()
+my $db = undef;
+
 # Run CoreNLP job
 sub run($;$)
 {
     my ( $self, $args ) = @_;
 
-    my $db = MediaWords::DB::connect_to_db();
+    unless ( $db )
+    {
+        # Postpone connecting to the database so that compile test doesn't do that
+        $db = MediaWords::DB::connect_to_db();
+    }
 
     my $stories_id = $args->{ stories_id } + 0;
     unless ( $stories_id )
7 changes: 6 additions & 1 deletion lib/MediaWords/Job/Bitly/FetchStoryStats.pm
@@ -39,12 +39,17 @@ Readonly my $BITLY_RATE_LIMIT_SECONDS_TO_WAIT => 60 * 10; # every 10 minutes
 # How many times to try on rate limiting errors
 Readonly my $BITLY_RATE_LIMIT_TRIES => 7; # try fetching 7 times in total (70 minutes)
 
+# Having a global database object should be safe because
+# job workers don't fork()
+my $db = undef;
+
 # Run job
 sub run($;$)
 {
     my ( $self, $args ) = @_;
 
-    my $db = MediaWords::DB::connect_to_db();
+    # Postpone connecting to the database so that compile test doesn't do that
+    $db ||= MediaWords::DB::connect_to_db();
 
     my $stories_id = $args->{ stories_id } or die "'stories_id' is not set.";
     my $start_timestamp = $args->{ start_timestamp };
7 changes: 6 additions & 1 deletion lib/MediaWords/Job/Facebook/FetchStoryStats.pm
@@ -32,6 +32,10 @@ use MediaWords::Util::Process;
 use Readonly;
 use Data::Dumper;
 
+# Having a global database object should be safe because
+# job workers don't fork()
+my $db = undef;
+
 # Run job
 sub run($;$)
 {
@@ -43,7 +47,8 @@ sub run($;$)
         fatal_error( 'Facebook API processing is not enabled.' );
     }
 
-    my $db = MediaWords::DB::connect_to_db();
+    # Postpone connecting to the database so that compile test doesn't do that
+    $db ||= MediaWords::DB::connect_to_db();
 
     my $stories_id = $args->{ stories_id } or die "'stories_id' is not set.";
 
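The three Perl job modules above apply the same lazy-connection pattern. A compact Python rendering of the idea, illustrative only; connect_to_db() here is a hypothetical stand-in for MediaWords::DB::connect_to_db():

_db = None  # module-level handle; safe because job workers don't fork()


def connect_to_db():
    """Hypothetical stand-in for MediaWords::DB::connect_to_db()."""
    return object()


def run(args: dict) -> None:
    global _db
    if _db is None:
        # Postpone connecting so that a compile/import test never opens a connection
        _db = connect_to_db()
    # ... the job body then uses _db and args as before ...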
5 changes: 3 additions & 2 deletions mediacloud/mediawords/db/handler.py
@@ -239,8 +239,9 @@ def schema_is_up_to_date(self) -> bool:
             raise McSchemaIsUpToDateException("Current schema version is 0")
 
         # Target schema version
-        sql = open(mc_sql_schema_path(), 'r').read()
-        target_schema_version = schema_version_from_lines(sql)
+        sql = open(mc_sql_schema_path(), 'r')
+        target_schema_version = schema_version_from_lines(sql.read())
+        sql.close()
         if not target_schema_version:
             raise McSchemaIsUpToDateException("Invalid target schema version.")
 
5 changes: 3 additions & 2 deletions mediacloud/mediawords/util/config.py
@@ -43,8 +43,9 @@ def __parse_yaml(config_file: str) -> dict:
     if not os.path.isfile(config_file):
         raise McConfigException("Configuration file '%s' was not found." % config_file)
 
-    yaml_file = open(config_file, 'r').read()
-    yaml_data = yaml.load(yaml_file, Loader=Loader)
+    yaml_file = open(config_file, 'r')
+    yaml_data = yaml.load(yaml_file.read(), Loader=Loader)
+    yaml_file.close()
     return yaml_data
 
 
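Both .close() fixes above (handler.py and config.py) could equally be written with a context manager, which closes the file even when read() or the parser raises. An alternative sketch, not what this PR ships; yaml.SafeLoader stands in for the module's own Loader:

import yaml


def parse_yaml(config_file: str) -> dict:
    # the with block guarantees the file is closed on every exit path
    with open(config_file, 'r') as yaml_file:
        return yaml.load(yaml_file.read(), Loader=yaml.SafeLoader)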
Empty file.
101 changes: 101 additions & 0 deletions mediacloud/mediawords/util/topic_modeling/model_gensim.py
@@ -0,0 +1,101 @@
import gensim

# from mediawords.db import connect_to_db
from mediawords.util.topic_modeling.sample_handler import SampleHandler
from mediawords.util.topic_modeling.topic_model import BaseTopicModel
from mediawords.util.topic_modeling.token_pool import TokenPool
from typing import Dict, List


class ModelGensim(BaseTopicModel):
    """Generate the topics of each story with an LDA model.
    ModelGensim operates on a single story at a time,
    comparing the occurrence of each token across all sentences of that story.
    It does not consider the rest of the stories. The benefits of this approach include:
    1. The words in each story's topics all occur in that story
    2. There is a fixed number of topics for each story"""

    def __init__(self) -> None:
        self._story_number = 0
        self._stories_ids = []
        self._stories_tokens = []
        self._dictionary = None
        self._corpus = []
        self._model = None
        self._WORD_SPLITTER = ' + '
        self._COEFFICIENT_SPLITTER = '*'

    def add_stories(self, stories: Dict[int, List[List[str]]]) -> None:
        """
        Add new stories to the model
        :param stories: a dictionary of new stories
        """
        for story_id, story_tokens in stories.items():
            self._stories_ids.append(story_id)
            self._stories_tokens.append(story_tokens)

        self._story_number = len(self._stories_ids)

    def summarize_topic(self, topic_number: int = 1,
                        word_number: int = 4, passes: int = 100) -> Dict[int, list]:
        """
        Summarize the topic of each story based on the frequency of occurrence of each word
        :return: a dictionary mapping each story id to a list of topic_number topics
        (each topic contains word_number words)
        """

        story_topic = {}

        for i in range(len(self._stories_ids)):
            # turn our token documents into an id <-> term dictionary
            self._dictionary = gensim.corpora.Dictionary(self._stories_tokens[i])

            # convert token documents into a document-term matrix
            self._corpus = [self._dictionary.doc2bow(text) for text in self._stories_tokens[i]]

            # generate LDA model
            self._model = gensim.models.ldamodel.LdaModel(
                corpus=self._corpus, num_topics=topic_number,
                id2word=self._dictionary, passes=passes)

            raw_topics = self._model.print_topics(num_topics=topic_number, num_words=word_number)

            story_topic[self._stories_ids[i]] = self._format_topics(raw_topics=raw_topics)

        return story_topic

    def _format_topics(self, raw_topics: List[tuple]) -> List[List[str]]:
        """
        Return topics in the desired format
        :param raw_topics: un-formatted topics
        :return: formatted topics
        """
        formatted_topics = []
        for topic in raw_topics:
            words_str = topic[1]
            # change the format
            # from 'COEFFICIENT1*"WORD1" + COEFFICIENT2*"WORD2" + COEFFICIENT3*"WORD3"'
            # to [WORD1, WORD2, WORD3]
            words = [word_str.split(self._COEFFICIENT_SPLITTER)[1][1:-1]
                     for word_str in words_str.split(self._WORD_SPLITTER)]
            formatted_topics.append(words)

        return formatted_topics

    def evaluate(self):
        pass


# A sample output
if __name__ == '__main__':
    model = ModelGensim()

    # pool = TokenPool(connect_to_db())
    # model.add_stories(pool.output_tokens(1, 0))
    # model.add_stories(pool.output_tokens(5, 1))

    pool = TokenPool(SampleHandler())
    model.add_stories(pool.output_tokens())

    print(model.summarize_topic())
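For reference, the string-to-list conversion that _format_topics() performs, as a standalone sketch; the raw topic string below is a made-up example of gensim's print_topics() output format:

raw_topic = '0.052*"election" + 0.031*"vote" + 0.027*"senate" + 0.024*"campaign"'
words = [term.split('*')[1][1:-1] for term in raw_topic.split(' + ')]
print(words)  # ['election', 'vote', 'senate', 'campaign']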