diff --git a/apps/common/Dockerfile b/apps/common/Dockerfile
index 08d99c3ef5..b7a575490c 100644
--- a/apps/common/Dockerfile
+++ b/apps/common/Dockerfile
@@ -143,6 +143,8 @@ COPY bin/build_jieba_dict_cache.py /
 RUN \
     /build_jieba_dict_cache.py && \
     rm /build_jieba_dict_cache.py && \
+    chown mediacloud:mediacloud /var/tmp/jieba.cache && \
+    ls -l /var/tmp/jieba.cache && \
     true
 
 # Symlink Log::Log4perl configuration to where it's going to be found
diff --git a/apps/common/src/python/mediawords/solr/request.py b/apps/common/src/python/mediawords/solr/request.py
index 5694c0e0da..1036c32aaf 100644
--- a/apps/common/src/python/mediawords/solr/request.py
+++ b/apps/common/src/python/mediawords/solr/request.py
@@ -4,6 +4,7 @@
 import abc
 import time
+import json
 
 from typing import Union, Optional
 from urllib.parse import urlencode
@@ -24,6 +25,10 @@
 __QUERY_HTTP_TIMEOUT = 15 * 60
 """Timeout of a single HTTP query."""
 
+# Testing alias (used by the startup check below), plus the 32-bit and 64-bit stories_id collections
+SOLR_COLLECTION = 'mediacloud2'
+MEDIACLOUD_32 = 'mediacloud'
+MEDIACLOUD_64 = 'mediacloud64'
 
 class _AbstractSolrRequestException(Exception, metaclass=abc.ABCMeta):
     """Abstract .solr.request exception."""
@@ -59,7 +64,7 @@ def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
     """Wait for Solr to start and collections to become available, if needed."""
 
     # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
-    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"
+    sample_select_url = f"{config.solr_url()}/{SOLR_COLLECTION}/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"
 
     connected = False
 
@@ -152,6 +157,81 @@ def __solr_error_message_from_response(response: Response) -> str:
     return error_message
 
+def merge_responses(mc_32_bit_collection: dict, mc_64_bit_collection: dict) -> dict:
+    """
+    Merge the Solr responses from the two collections into one.
+
+    :param mc_32_bit_collection: Response from the 32-bit ("mediacloud") collection.
+    :param mc_64_bit_collection: Response from the 64-bit ("mediacloud64") collection.
+    :return: Merged response, in the same shape as a single Solr response.
+    """
+    new_response = {}
+
+    new_response.update({"responseHeader": mc_32_bit_collection.get("responseHeader", {})})
+
+    mc_32_bit_response = mc_32_bit_collection.get("response", {})
+    mc_64_bit_response = mc_64_bit_collection.get("response", {})
+
+    num_found = mc_32_bit_response.get("numFound", 0) + mc_64_bit_response.get("numFound", 0)
+    start_index = mc_32_bit_response.get("start", 0) + mc_64_bit_response.get("start", 0)
+
+    docs = []
+
+    docs.extend(mc_32_bit_response.get("docs", []))
+    docs.extend(mc_64_bit_response.get("docs", []))
+
+    new_response.update({
+        "response": {
+            "numFound": num_found,
+            "start": start_index,
+            "docs": docs,
+        }
+    })
+
+    # facets sit at the top level of a Solr response, next to "response"
+    if "facets" in mc_32_bit_collection or "facets" in mc_64_bit_collection:
+        mc_32_bit_facets = mc_32_bit_collection.get("facets", {})
+        mc_64_bit_facets = mc_64_bit_collection.get("facets", {})
+
+        count = mc_32_bit_facets.get("count", 0) + mc_64_bit_facets.get("count", 0)
+        x = mc_32_bit_facets.get("x", 0) + mc_64_bit_facets.get("x", 0)
+
+        categories = {}
+
+        if "categories" in mc_32_bit_facets or "categories" in mc_64_bit_facets:
+            buckets = []
+            mc_32_buckets = mc_32_bit_facets.get("categories", {}).get("buckets", [])
+            mc_64_buckets = mc_64_bit_facets.get("categories", {}).get("buckets", [])
+            merged = {}
+            for item in mc_32_buckets + mc_64_buckets:
+                val = item['val']
+                if val in merged:
+                    merged[val]['count'] += item['count']
+                    merged[val]['x'] += item['x']
+                else:
+                    merged[val] = item.copy()
+
+            merged_buckets = list(merged.values())
+            buckets.extend(merged_buckets)
+            categories.update({"buckets": buckets})
+
+            new_response.update({
+                "facets": {
+                    "count": count,
+                    "categories": categories
+                }
+            })
+        else:
+            new_response.update({
+                "facets": {
+                    "count": count,
+                    "x": x
+                }
+            })
+
+    return new_response
+
+
 def solr_request(path: str,
                  params: SolrParams = None,
                  content: Union[str, SolrParams] = None,
@@ -191,10 +271,8 @@ def solr_request(path: str,
     if not params:
         params = {}
 
-    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
-    abs_uri = abs_uri.set(params)
-    abs_url = str(abs_uri)
-
+    collections = [MEDIACLOUD_32, MEDIACLOUD_64]
+
     ua = UserAgent()
     ua.set_timeout(__QUERY_HTTP_TIMEOUT)
     ua.set_max_size(None)
@@ -219,21 +297,38 @@ def solr_request(path: str,
 
         content_encoded = content.encode('utf-8', errors='replace')
 
-        request = Request(method='POST', url=abs_url)
-        request.set_header(name='Content-Type', value=content_type)
-        request.set_header(name='Content-Length', value=str(len(content_encoded)))
-        request.set_content(content_encoded)
-
+        # Build one POST request per collection.
+        requests = []
+        for collection in collections:
+            abs_uri = furl(f"{solr_url}/{collection}/{path}")
+            abs_uri = abs_uri.set(params)
+            abs_url = str(abs_uri)
+            request = Request(method='POST', url=abs_url)
+            request.set_header(name='Content-Type', value=content_type)
+            request.set_header(name='Content-Length', value=str(len(content_encoded)))
+            request.set_content(content_encoded)
+            requests.append(request)
+
     else:
-        request = Request(method='GET', url=abs_url)
+        # Build one GET request per collection.
+        requests = []
+        for collection in collections:
+            abs_uri = furl(f"{solr_url}/{collection}/{path}")
+            abs_uri = abs_uri.set(params)
+            requests.append(Request(method='GET', url=str(abs_uri)))
 
-    log.debug(f"Sending Solr request: {request}")
-
-    response = ua.request(request)
-
-    if not response.is_success():
-        error_message = __solr_error_message_from_response(response=response)
-        raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")
+    if len(requests) > 1:
+        # Query every collection and merge the responses into a single one.
+        responses = []
+        for request in requests:
+            log.debug(f"Sending Solr request: {request}")
+            response = ua.request(request)
+            if response.is_success():
+                responses.append(response.decoded_content())
+            else:
+                error_message = __solr_error_message_from_response(response=response)
+                raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")
+
+        merged = merge_responses(json.loads(responses[0]), json.loads(responses[1]))
+        return json.dumps(merged)
+    else:
+        request = requests[0]
+        log.debug(f"Sending Solr request: {request}")
+        response = ua.request(request)
+        if not response.is_success():
+            error_message = __solr_error_message_from_response(response=response)
+            raise McSolrRequestQueryErrorException(f"Error fetching Solr response: {error_message}")
 
-    return response.decoded_content()
+        return response.decoded_content()
diff --git a/apps/common/src/python/mediawords/util/config/__init__.py b/apps/common/src/python/mediawords/util/config/__init__.py
index 08f12feb8e..53819ff3e5 100644
--- a/apps/common/src/python/mediawords/util/config/__init__.py
+++ b/apps/common/src/python/mediawords/util/config/__init__.py
@@ -46,6 +46,16 @@ def env_value(name: str, required: bool = True, allow_empty_string: bool = False
     return value
 
+def env_bool(name: str, default: bool = False) -> bool:
+    """
+    Retrieve a boolean from an environment variable; the value should be 0 or 1.
+
+    :param name: Environment variable name.
+    :param default: Default value to return if the variable is not set.
+    """
+
+    value = os.environ.get(name, str(int(default)))
+    return bool(int(value))
 
 def file_with_env_value(name: str, allow_empty_string: bool = False, encoded_with_base64: bool = False) -> str:
     """
diff --git a/apps/common/src/requirements.txt b/apps/common/src/requirements.txt
index 3bb17a43d9..6b8237199f 100644
--- a/apps/common/src/requirements.txt
+++ b/apps/common/src/requirements.txt
@@ -43,6 +43,10 @@ furl==2.1.0
 # Chinese language tokenizer, stemmer, etc.
 jieba==0.42.1
 
+# Pin MarkupSafe for Jinja2 2.11.3, which requires MarkupSafe>=0.23 and would
+# otherwise pull in 2.1.1, which removed the deprecated soft_unicode() function.
+MarkupSafe==2.0.1
+
 # Parsing email templates
 Jinja2==2.11.3
 
diff --git a/apps/extract-and-vector/bin/extract_and_vector_worker.py b/apps/extract-and-vector/bin/extract_and_vector_worker.py
index 0738c6e200..7a21a67864 100755
--- a/apps/extract-and-vector/bin/extract_and_vector_worker.py
+++ b/apps/extract-and-vector/bin/extract_and_vector_worker.py
@@ -4,6 +4,7 @@
 from mediawords.db import connect_to_db
 from mediawords.job import JobBroker
+from mediawords.util.config import env_bool
 from mediawords.util.log import create_logger
 from mediawords.util.perl import decode_object_from_bytes_if_needed
 from extract_and_vector.dbi.stories.extractor_arguments import PyExtractorArguments
@@ -69,8 +70,10 @@ def run_extract_and_vector(stories_id: int, use_cache: bool = False, use_existin
 
     log.info("Extracting story {}...".format(stories_id))
 
+    no_dedup_sentences = env_bool('MC_NO_DEDUP_SENTENCES', True)
     try:
-        extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing)
+        extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing,
+                                              no_dedup_sentences=no_dedup_sentences)
         extract_and_process_story(db=db, story=story, extractor_args=extractor_args)
 
     except Exception as ex:
diff --git a/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm b/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
index a260cb109c..223dea7fcd 100644
--- a/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
+++ b/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
@@ -55,7 +55,7 @@ Readonly my @SOLR_FIELDS => qw/stories_id media_id publish_date publish_day publ
     text title language processed_stories_id tags_id_stories timespans_id/;
 
 # how many sentences to fetch at a time from the postgres query
-Readonly my $FETCH_BLOCK_SIZE => 100;
+Readonly my $FETCH_BLOCK_SIZE => 200;
 
 # default time sleep when there are less than MIN_STORIES_TO_PROCESS:
 Readonly my $DEFAULT_THROTTLE => 60;
@@ -601,6 +601,7 @@ Options:
 * throttle -- sleep this number of seconds between each block of stories (default 60)
 * full -- shortcut for: update=false, empty_queue=true, throttle=1; assume and optimize for static queue
 * skip_logging -- skip logging the import into the solr_import_stories or solr_imports tables (default=false)
+* skip_update_snapshot -- skip setting snapshots.searchable=true (default=true)
 
 The import will run in blocks of "max_queued_stories" at a time. The function
 will keep trying to find stories to import.  If there are less than
@@ -627,6 +628,7 @@ sub import_data($;$)
     my $empty_queue = $options->{ empty_queue } // 0;
     my $throttle = $options->{ throttle } // $DEFAULT_THROTTLE;
     my $skip_logging = $options->{ skip_logging } // 0;
+    my $skip_update_snapshot = $options->{ skip_update_snapshot } // 1;
     my $daemon = $options->{ daemon } // 0;
 
     $_last_max_queue_stories_id = 0;
@@ -669,7 +671,7 @@ sub import_data($;$)
         _save_import_log( $db, $stories_ids );
     }
 
-    if ( !$skip_logging )
+    if ( !$skip_logging && !$skip_update_snapshot )
     {
         _update_snapshot_solr_status( $db );
     }
diff --git a/apps/postgresql-pgbouncer/conf/pgbouncer.ini b/apps/postgresql-pgbouncer/conf/pgbouncer.ini
index eb3f28662c..f7a14b215d 100644
--- a/apps/postgresql-pgbouncer/conf/pgbouncer.ini
+++ b/apps/postgresql-pgbouncer/conf/pgbouncer.ini
@@ -1,5 +1,6 @@
 [databases]
-* = host=postgresql-server port=5432 user=mediacloud
+; PhilB 5/6/22: PostgreSQL server now runs on the "postgresql" EC2 server, without Docker
+* = host=172.30.0.58 port=5432 user=mediacloud
 
 [pgbouncer]
diff --git a/apps/postgresql-server/bin/apply_migrations.sh b/apps/postgresql-server/bin/apply_migrations.sh
index bcc2d702e0..77267db3a7 100755
--- a/apps/postgresql-server/bin/apply_migrations.sh
+++ b/apps/postgresql-server/bin/apply_migrations.sh
@@ -14,7 +14,8 @@
 MIGRATIONS_DIR="/opt/postgresql-server/pgmigrate/migrations"
 TEMP_PORT=12345
 
-# In case the database is in recovery, wait for up to 1 hour for it to complete
-PGCTL_START_TIMEOUT=3600
+# In case the database is in recovery, wait for up to 3 hours for it to complete
+# PLB: increased from one hour
+PGCTL_START_TIMEOUT=10800
 
 if [ ! -d "${MIGRATIONS_DIR}" ]; then
     echo "Migrations directory ${MIGRATIONS_DIR} does not exist."
diff --git a/apps/solr-base/Dockerfile b/apps/solr-base/Dockerfile
index 0ff5015f9c..6c76d33d8e 100644
--- a/apps/solr-base/Dockerfile
+++ b/apps/solr-base/Dockerfile
@@ -19,5 +19,18 @@ RUN \
 RUN mkdir -p /usr/src/
 COPY src/solr/ /usr/src/solr/
 
+# Try to create a 64-bit-enabled "mediacloud64" collection by cloning the "mediacloud" config.
+# NOTE: collections/mediacloud/conf/solrconfig.xml uses
+#   ${mediacloud.luceneMatchVersion} ${mediacloud.solr_webapp_dir} ${mediacloud.solr_dist_dir},
+#   which reference JVM properties set in solr-shard/bin/solr-shard.sh
+# ALSO: core.properties has "instanceDir=/var/lib/solr/mediacloud" (dir does not exist?!);
+#   it will be rewritten to .../mediacloud64 (which also does not exist)
+RUN \
+    mkdir -p /usr/src/solr/collections/mediacloud64 && \
+    cp -rp /usr/src/solr/collections/mediacloud/* /usr/src/solr/collections/mediacloud64/ && \
+    sed -i.32 's/mediacloud/mediacloud64/' /usr/src/solr/collections/mediacloud64/core.properties && \
+    sed -i.32 '/
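
Below is a quick sanity check of the merge semantics that merge_responses() implements in the request.py patch above. It is a sketch, not part of the diff: it assumes the patched mediawords.solr.request module is importable (for example, inside the common app container), and the two sample responses, their stories_id values, and the facet numbers are invented for illustration.

#!/usr/bin/env python3
# Sketch only: exercises merge_responses() from the patch above. Assumes the
# patched mediawords.solr.request module is on PYTHONPATH; all sample data
# below is made up.
from mediawords.solr.request import merge_responses

# Fake response from the 32-bit "mediacloud" collection.
mc_32 = {
    "responseHeader": {"status": 0, "QTime": 12},
    "response": {"numFound": 2, "start": 0, "docs": [{"stories_id": 1}, {"stories_id": 2}]},
    "facets": {
        "count": 2,
        "x": 10,
        "categories": {"buckets": [{"val": "en", "count": 2, "x": 10}]},
    },
}

# Fake response from the 64-bit "mediacloud64" collection; note the stories_id
# that does not fit into 32 bits.
mc_64 = {
    "responseHeader": {"status": 0, "QTime": 7},
    "response": {"numFound": 1, "start": 0, "docs": [{"stories_id": 2 ** 40}]},
    "facets": {
        "count": 1,
        "x": 5,
        "categories": {"buckets": [{"val": "en", "count": 1, "x": 5}]},
    },
}

merged = merge_responses(mc_32, mc_64)

# Document counts and docs are concatenated across the two collections...
assert merged["response"]["numFound"] == 3
assert len(merged["response"]["docs"]) == 3

# ...and facet buckets sharing the same "val" have their counts summed,
# mirroring the bucket-merging loop in merge_responses().
assert merged["facets"]["count"] == 3
assert merged["facets"]["categories"]["buckets"] == [{"val": "en", "count": 3, "x": 15}]

print("merge_responses() OK")

Running this against the patched module should print the final message; the last assertion reflects the merge loop's behavior of summing "count" and "x" for buckets that appear in both collections.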