Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Fix solr collections #842

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
815042a
Make no_dedup_sentences the default for extract-and-vector
philbudne Mar 6, 2022
c77dd18
common/Dockerfile: skip jieba.cache creation; makes empty root owned …
philbudne Mar 18, 2022
f37b7f9
solr-base/Dockerfile: try cloning mediacloud config as mediacloud64
philbudne Mar 18, 2022
e1be9a4
common/Dockerfile: reenable jieba.cache creation & chown it
philbudne Mar 23, 2022
1868296
apps/common/src/python/mediawords/solr/request.py: add/use SOLR_COLLE…
philbudne Mar 23, 2022
4ab3729
apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm: speedups for …
philbudne Mar 23, 2022
791248a
add apps/solr-base/src/solr/aliases.json with "mediacloud2" solr alias
philbudne Mar 23, 2022
c4fb3d6
apps/common/src/requirements.txt: force MarkupSafe==2.0.1
philbudne Mar 25, 2022
5845a61
solr-zookeeper: preload aliases.json into zookeeper
philbudne Mar 27, 2022
40391ed
apps/postgresql-server/bin/apply_migrations.sh: increase PGCTL_START_…
philbudne Apr 27, 2022
f395116
postgresql-pgbouncer/conf/pgbounder.init:
philbudne May 7, 2022
76be844
pgbouncer.ini: use postgresql server ip
philbudne May 7, 2022
bf74554
apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Timespans.pm
philbudne May 18, 2022
6654fbb
fix solr query on multiple collections
thepsalmist Nov 7, 2022
e6bc74d
refactor merge solr function
thepsalmist Nov 10, 2022
bc3c626
fix solr response schema
thepsalmist Feb 9, 2023
e8bb5a8
update solr merge function
thepsalmist Mar 7, 2023
cb168c9
fix solr merge buckets duplicate date values
thepsalmist Mar 21, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apps/common/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ COPY bin/build_jieba_dict_cache.py /
RUN \
/build_jieba_dict_cache.py && \
rm /build_jieba_dict_cache.py && \
chown mediacloud:mediacloud /var/tmp/jieba.cache && \
ls -l /var/tmp/jieba.cache && \
true

# Symlink Log::Log4perl configuration to where it's going to be found
Expand Down
106 changes: 87 additions & 19 deletions apps/common/src/python/mediawords/solr/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import abc
import time
import json
from typing import Union, Optional
from urllib.parse import urlencode

Expand All @@ -24,6 +25,10 @@
__QUERY_HTTP_TIMEOUT = 15 * 60
"""Timeout of a single HTTP query."""

# Testing alias!!
SOLR_COLLECTION = 'mediacloud2'
MEDIACLOUD_32 = 'mediacloud'
MEDIACLOUD_64 = 'mediacloud64'

class _AbstractSolrRequestException(Exception, metaclass=abc.ABCMeta):
"""Abstract .solr.request exception."""
Expand Down Expand Up @@ -59,7 +64,7 @@ def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
"""Wait for Solr to start and collections to become available, if needed."""

# search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"
sample_select_url = f"{config.solr_url()}/{SOLR_COLLECTION}/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

connected = False

Expand Down Expand Up @@ -152,6 +157,54 @@ def __solr_error_message_from_response(response: Response) -> str:
return error_message


def merge_responses(dict1: dict, dict2: dict) -> dict:
    """
    Merge Solr responses from each of the two collections into one.

    Counts ("numFound", "start", facet "count") are summed and document /
    bucket lists are concatenated, with the first response's values first.

    NOTE: ``list.extend()`` returns ``None``, so the previous implementation
    assigned ``None`` to "docs" (and "buckets") whenever the first response
    had any documents; the merged lists are now built with concatenation.

    :param dict1: Decoded JSON response from the 32-bit ("mediacloud") collection.
    :param dict2: Decoded JSON response from the 64-bit ("mediacloud64") collection.
    :return: Merged response dict.
    """
    new_response = {}

    # NOTE(review): the header fields are merged into the *top level* of the
    # merged response instead of being nested under "responseHeader"; this
    # preserves the original behavior -- confirm downstream consumers expect it.
    new_response.update(dict1["responseHeader"])

    # "response" section: sum the counts, concatenate the document lists.
    resp1 = dict1["response"]
    resp2 = dict2["response"]
    new_response["response"] = {
        "numFound": resp1["numFound"] + resp2["numFound"],
        "start": resp1["start"] + resp2["start"],
        "docs": (resp1["docs"] or []) + (resp2["docs"] or []),
    }

    # "facets" section: sum the counts, concatenate the category buckets.
    facets1 = dict1["facets"]
    facets2 = dict2["facets"]
    count = facets1["count"] + facets2["count"]
    if "categories" in facets1:
        buckets = list(facets1["categories"]["buckets"] or [])
        if "categories" in facets2:
            buckets.extend(facets2["categories"]["buckets"] or [])
        categories = {"buckets": buckets}
    else:
        # Preserved from the original: dict2's categories are ignored when
        # dict1 has none.
        categories = {}

    new_response["facets"] = {"count": count, "categories": categories}

    return new_response

def solr_request(path: str,
params: SolrParams = None,
content: Union[str, SolrParams] = None,
Expand Down Expand Up @@ -191,10 +244,8 @@ def solr_request(path: str,
if not params:
params = {}

abs_uri = furl(f"{solr_url}/mediacloud/{path}")
abs_uri = abs_uri.set(params)
abs_url = str(abs_uri)

collections = [MEDIACLOUD_32, MEDIACLOUD_64]

ua = UserAgent()
ua.set_timeout(__QUERY_HTTP_TIMEOUT)
ua.set_max_size(None)
Expand All @@ -219,21 +270,38 @@ def solr_request(path: str,

content_encoded = content.encode('utf-8', errors='replace')

request = Request(method='POST', url=abs_url)
request.set_header(name='Content-Type', value=content_type)
request.set_header(name='Content-Length', value=str(len(content_encoded)))
request.set_content(content_encoded)

results = []
for collection in collections:
abs_uri = furl(f"{solr_url}/{collection}/{path}")
abs_uri = abs_uri.set(params)
abs_url = str(abs_uri)
request = Request(method='POST', url=abs_url)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can drop the abs_url = str(abs_uri) unless the abs_url is used somewhere else.

 request = Request(method='POST', url=str(abs_url))

request.set_header(name='Content-Type', value=content_type)
Copy link

@esirK esirK Nov 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From this, I think we can combine all the headers while creating the Request something like

headers = {
    'Content-Type': content_type
    'Content-Length': str(len(content_encoded)),
}

request = Request(method='POST', url=abs_url, headers=headers)

request.set_header(name='Content-Length', value=str(len(content_encoded)))
request.set_content(content_encoded)
results.append(request)

else:

request = Request(method='GET', url=abs_url)
log.debug(f"Sending Solr request: {request}")

responses = []
if len(results) > 1:
for r in results:
response = ua.request(r)
if response.is_success():
responses.append(response.decoded_content())
else:
error_message = __solr_error_message_from_response(response=response)
raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")

response = merge_responses(json.loads(responses[0]),json.loads(responses[1]))
return json.dumps(response)

log.debug(f"Sending Solr request: {request}")

response = ua.request(request)

if not response.is_success():
error_message = __solr_error_message_from_response(response=response)
raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")
else:
response = ua.request(request)
if not response.is_success():
error_message = __solr_error_message_from_response(response=response)
raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")

return response.decoded_content()
return response.decoded_content()
10 changes: 10 additions & 0 deletions apps/common/src/python/mediawords/util/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ def env_value(name: str, required: bool = True, allow_empty_string: bool = False

return value

def env_bool(name: str, default: bool = False) -> bool:
    """
    Retrieve a boolean from an environment variable; the value should be "0" or "1".

    Previously the (boolean) default was passed straight through
    ``os.environ.get()`` and then fed to ``int()``; that only worked because
    ``bool`` happens to be a subclass of ``int``. Return the default directly
    when the variable is unset instead.

    :param name: Environment variable name.
    :param default: Default value to return when the variable is not set.
    :return: True if the variable is set to a non-zero integer, False otherwise.
    :raises ValueError: if the variable is set but is not a valid integer.
    """
    value = os.environ.get(name)
    if value is None:
        return default
    return bool(int(value))

def file_with_env_value(name: str, allow_empty_string: bool = False, encoded_with_base64: bool = False) -> str:
"""
Expand Down
4 changes: 4 additions & 0 deletions apps/common/src/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ furl==2.1.0
# Chinese language tokenizer, stemmer, etc.
jieba==0.42.1

# For Jinja2 2.11.3, which requests MarkupSafe>=0.23 and is now
# getting version 2.1.1, which removed a deprecated function.
MarkupSafe==2.0.1

# Parsing email templates
Jinja2==2.11.3

Expand Down
5 changes: 4 additions & 1 deletion apps/extract-and-vector/bin/extract_and_vector_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from mediawords.db import connect_to_db
from mediawords.job import JobBroker
from mediawords.util.config import env_bool
from mediawords.util.log import create_logger
from mediawords.util.perl import decode_object_from_bytes_if_needed
from extract_and_vector.dbi.stories.extractor_arguments import PyExtractorArguments
Expand Down Expand Up @@ -69,8 +70,10 @@ def run_extract_and_vector(stories_id: int, use_cache: bool = False, use_existin

log.info("Extracting story {}...".format(stories_id))

no_dedup_sentences = env_bool('MC_NO_DEDUP_SENTENCES', True)
try:
extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing)
extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing,
no_dedup_sentences=no_dedup_sentences)
extract_and_process_story(db=db, story=story, extractor_args=extractor_args)

except Exception as ex:
Expand Down
6 changes: 4 additions & 2 deletions apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ Readonly my @SOLR_FIELDS => qw/stories_id media_id publish_date publish_day publ
text title language processed_stories_id tags_id_stories timespans_id/;

# how many sentences to fetch at a time from the postgres query
Readonly my $FETCH_BLOCK_SIZE => 100;
Readonly my $FETCH_BLOCK_SIZE => 200;

# default time sleep when there are less than MIN_STORIES_TO_PROCESS:
Readonly my $DEFAULT_THROTTLE => 60;
Expand Down Expand Up @@ -601,6 +601,7 @@ Options:
* throttle -- sleep this number of seconds between each block of stories (default 60)
* full -- shortcut for: update=false, empty_queue=true, throttle=1; assume and optimize for static queue
* skip_logging -- skip logging the import into the solr_import_stories or solr_imports tables (default=false)
* skip_update_snapshot -- skip setting snapshots.searchable=true (default=true)

The import will run in blocks of "max_queued_stories" at a time. The function
will keep trying to find stories to import. If there are less than
Expand All @@ -627,6 +628,7 @@ sub import_data($;$)
my $empty_queue = $options->{ empty_queue } // 0;
my $throttle = $options->{ throttle } // $DEFAULT_THROTTLE;
my $skip_logging = $options->{ skip_logging } // 0;
my $skip_update_snapshot = $options->{ skip_update_snapshot } // 1;
my $daemon = $options->{ daemon } // 0;

$_last_max_queue_stories_id = 0;
Expand Down Expand Up @@ -669,7 +671,7 @@ sub import_data($;$)
_save_import_log( $db, $stories_ids );
}

if ( !$skip_logging )
if ( !$skip_logging && !$skip_update_snapshot )
{
_update_snapshot_solr_status( $db );
}
Expand Down
3 changes: 2 additions & 1 deletion apps/postgresql-pgbouncer/conf/pgbouncer.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[databases]
* = host=postgresql-server port=5432 user=mediacloud
; PhilB 5/6/22: PG server running on postgresql EC2 server w/o docker
* = host=172.30.0.58 port=5432 user=mediacloud

[pgbouncer]

Expand Down
3 changes: 2 additions & 1 deletion apps/postgresql-server/bin/apply_migrations.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ MIGRATIONS_DIR="/opt/postgresql-server/pgmigrate/migrations"
TEMP_PORT=12345

# In case the database is in recovery, wait for up to 1 hour for it to complete
PGCTL_START_TIMEOUT=3600
# PLB: increased to three hours
PGCTL_START_TIMEOUT=10800

if [ ! -d "${MIGRATIONS_DIR}" ]; then
echo "Migrations directory ${MIGRATIONS_DIR} does not exist."
Expand Down
13 changes: 13 additions & 0 deletions apps/solr-base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,18 @@ RUN \
RUN mkdir -p /usr/src/
COPY src/solr/ /usr/src/solr/

# Try to create 64-bit enabled mediacloud64 collection by cloning config
# NOTE: collections/mediacloud/conf/solrconfig.xml uses
# ${mediacloud.luceneMatchVersion} ${mediacloud.solr_webapp_dir} ${mediacloud.solr_dist_dir}
# which reference JVM properties set in solr-shard/bin/solr-shard.sh
# ALSO: core.properties has "instanceDir=/var/lib/solr/mediacloud" (dir does not exist?!)
# will be wacked to .../mediacloud64 (also does not exist)
RUN \
mkdir -p /usr/src/solr/collections/mediacloud64 && \
cp -rp /usr/src/solr/collections/mediacloud/* /usr/src/solr/collections/mediacloud64/ && \
sed -i.32 's/mediacloud/mediacloud64/' /usr/src/solr/collections/mediacloud64/core.properties && \
sed -i.32 '/<field name=.*type="int"/s/"int"/"long"/' /usr/src/solr/collections/mediacloud64/conf/schema.xml && \
true

# Add user that Solr will run as
RUN useradd -ms /bin/bash solr
1 change: 1 addition & 0 deletions apps/solr-base/src/solr/aliases.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"collection":{"mediacloud2":"mediacloud64,mediacloud"}}
7 changes: 7 additions & 0 deletions apps/solr-zookeeper/bin/init_solr_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,12 @@ for collection_path in /usr/src/solr/collections/*; do
fi
done

# Preload collection aliases (e.g. "mediacloud2" -> "mediacloud64,mediacloud")
# into ZooKeeper so queries against the alias work from the first start.
# Variable expansions are quoted (ShellCheck SC2086) even though the path
# currently contains no whitespace.
ALIASES=/usr/src/solr/aliases.json
if [ -f "$ALIASES" ]; then
    /opt/solr/server/scripts/cloud-scripts/zkcli.sh \
        -zkhost 127.0.0.1:2181 \
        -cmd putfile /aliases.json "$ALIASES"
fi

# Stop after initial configuration
pkill java
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ SQL
snapshots_id
FROM timespans AS t
where
topics_id = ? AND
topics_id = ?
$snapshot_clause
$focus_clause
$timespan_clause
Expand Down