Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Commit

Permalink
Merge branch 'master' into jot-pgmigrate
Browse files Browse the repository at this point in the history
  • Loading branch information
pypt committed Jun 8, 2021
2 parents 7bf04aa + d1da1be commit 6f9f239
Show file tree
Hide file tree
Showing 472 changed files with 25,819 additions and 12,856 deletions.
2 changes: 1 addition & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

*.conf text eol=lf
*.config text eol=lf
*.cpanfile text eol=lf
cpanfile text eol=lf
*.css text eol=lf
*.csv text eol=lf
*.enabled_plugins text eol=lf
Expand Down
5 changes: 5 additions & 0 deletions .github/free-up-disk-space.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ sudo rm -f /swapfile
echo "Cleaning APT cache..."
sudo apt clean

echo "Removing some directories..."
sudo rm -rf /usr/local/lib/android/
sudo rm -rf /usr/local/lib/node_modules/
sudo rm -rf /usr/local/share/chromium/

echo "Removing docker images..."
docker rmi $(docker image ls -aq)

Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,11 @@ jobs:
MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY: ${{ secrets.MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY }}
MC_FACEBOOK_APP_ID: ${{ secrets.MC_FACEBOOK_APP_ID }}
MC_FACEBOOK_APP_SECRET: ${{ secrets.MC_FACEBOOK_APP_SECRET }}
MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: ${{ secrets.MC_PODCAST_FETCH_EPISODE_BUCKET_NAME }}
MC_PODCAST_AUTH_JSON_BASE64: ${{ secrets.MC_PODCAST_AUTH_JSON_BASE64 }}
MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME: ${{ secrets.MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME }}
MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME: ${{ secrets.MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME }}
MC_PODCAST_TRANSCRIPTS_BUCKET_NAME: ${{ secrets.MC_PODCAST_TRANSCRIPTS_BUCKET_NAME }}
MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST: ${{ secrets.MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST }}
MC_PODCAST_GC_AUTH_JSON_BASE64: ${{ secrets.MC_PODCAST_GC_AUTH_JSON_BASE64 }}
MC_TWITTER_ACCESS_TOKEN: ${{ secrets.MC_TWITTER_ACCESS_TOKEN }}
MC_TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.MC_TWITTER_ACCESS_TOKEN_SECRET }}
MC_TWITTER_CONSUMER_KEY: ${{ secrets.MC_TWITTER_CONSUMER_KEY }}
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ coverage.json
**/.idea/**/dataSources/
**/.idea/**/dataSources.ids
**/.idea/**/dataSources.local.xml
**/.idea/**/sqlDataSources.xml

# Not ignoring **/.idea/**/sqlDataSources.xml as it points to ./.idea/mediawords.sql

**/.idea/**/dynamic.xml
**/.idea/**/uiDesigner.xml
**/.idea/**/dbnavigator.xml
Expand Down
16 changes: 11 additions & 5 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,18 @@
[submodule "dev/quieter-docker-compose"]
path = dev/quieter-docker-compose
url = https://github.com/mediacloud/docker-compose-just-quieter.git
[submodule "apps/podcast-fetch-episode/tests/data/media-samples"]
path = apps/podcast-fetch-episode/tests/data/media-samples
url = https://github.com/mediacloud/podcast-media-samples.git
[submodule "apps/podcast-fetch-transcript/tests/data/media-samples"]
path = apps/podcast-fetch-transcript/tests/data/media-samples
[submodule "apps/podcast-transcribe-episode/tests/data/media-samples"]
path = apps/podcast-transcribe-episode/tests/data/media-samples
url = https://github.com/mediacloud/podcast-media-samples.git
[submodule "apps/elk-journalbeat/journald-log-sample"]
path = apps/elk-journalbeat/journald-log-sample
url = https://github.com/mediacloud/journald-log-sample.git
[submodule "apps/temporal-grafana/dashboards"]
path = apps/temporal-grafana/dashboards
url = https://github.com/temporalio/dashboards.git
[submodule "apps/temporal-server/config"]
path = apps/temporal-server/config
url = https://github.com/mediacloud/backend-temporal-server-config.git
[submodule "apps/temporal-postgresql/temporal-config"]
path = apps/temporal-postgresql/temporal-config
url = https://github.com/mediacloud/backend-temporal-server-config.git
29 changes: 25 additions & 4 deletions apps/base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#

# https://hub.docker.com/_/ubuntu?tab=tags&page=1
FROM ubuntu:focal-20210119
FROM ubuntu:focal-20210416

ENV DEBIAN_FRONTEND=noninteractive \
LANG=en_US.UTF-8 \
Expand Down Expand Up @@ -48,6 +48,9 @@ RUN \
apt-get -y --no-install-recommends install \
# Quicker container debugging
bash-completion \
# "mail" utility which uses sendmail (provided by msmtp-mta) internally;
# some tools like munin-cron use "mail" to send emails
bsd-mailx \
curl \
htop \
# apt-key
Expand All @@ -56,7 +59,8 @@ RUN \
iproute2 \
# Pinging other containers from within Compose environment
iputils-ping \
# Sending mail via sendmail utility through mail-postfix-server
# Provides "sendmail" utility which relays email through
# "mail-postfix-server" app
msmtp \
msmtp-mta \
# Provides killall among other utilities
Expand All @@ -67,8 +71,6 @@ RUN \
netcat \
# Some packages insist on logging to syslog
rsyslog \
# "mail" utility (which uses msmtp internally)
s-nail \
# Timezone data, used by many packages
tzdata \
# Basic editor for files in container while debugging
Expand All @@ -90,6 +92,25 @@ COPY bin/container_memory_limit.sh bin/container_cpu_limit.sh bin/dl_to_stdout.s
# Copy MSMTP configuration
COPY conf/msmtprc conf/msmtp-aliases /etc/

# Both "sendmail" and "mail" utilities are important as they're used by various
# apps (e.g. munin-cron) to send us important email, and those apps aren't
# particularly vocal when they're unable to send email. So, for extra paranoia,
# verify at build time that the expected symlink chain is in place:
#
#   /usr/sbin/sendmail     -> ../bin/msmtp
#   /usr/bin/mail          -> /etc/alternatives/mail
#   /etc/alternatives/mail -> /usr/bin/bsd-mailx
#
# Each check names the exact link that is wrong and fails the build.
RUN \
    if [ "$(readlink -- "/usr/sbin/sendmail")" != "../bin/msmtp" ]; then \
        echo "sendmail is not symlinked to msmtp, sending email won't work." && \
        exit 1; \
    fi; \
    if [ "$(readlink -- "/usr/bin/mail")" != "/etc/alternatives/mail" ]; then \
        echo "mail is not symlinked to /etc/alternatives/mail, sending email won't work." && \
        exit 1; \
    fi; \
    if [ "$(readlink -- "/etc/alternatives/mail")" != "/usr/bin/bsd-mailx" ]; then \
        echo "/etc/alternatives/mail is not symlinked to /usr/bin/bsd-mailx, sending email won't work." && \
        exit 1; \
    fi; \
    true

# Generate and set locale
RUN \
locale-gen en_US en_US.UTF-8 && \
Expand Down
2 changes: 1 addition & 1 deletion apps/cliff-annotator/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN \
# Install Tomcat 7
RUN \
mkdir -p /usr/lib/tomcat7/ && \
/dl_to_stdout.sh "https://archive.apache.org/dist/tomcat/tomcat-7/v7.0.96/bin/apache-tomcat-7.0.96.tar.gz" | \
/dl_to_stdout.sh "https://mediacloud-archive-apache-org.s3.amazonaws.com/apache-tomcat-7.0.96.tar.gz" | \
tar -zx -C /usr/lib/tomcat7/ --strip 1 && \
true

Expand Down
1 change: 1 addition & 0 deletions apps/cliff-fetch-annotation-and-tag/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,4 @@ sdist
Temporary Items
wheels
_Inline

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions apps/cliff-fetch-annotation-and-tag/.idea/mediawords.sql

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion apps/cliff-fetch-annotation-and-tag/.idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions apps/cliff-fetch-annotation-and-tag/.idea/sqlDataSources.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions apps/cliff-fetch-annotation-and-tag/.idea/sqldialects.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@ services:
source: ./../postgresql-server/schema/
target: /opt/mediacloud/schema/
- type: bind
source: ./../postgresql-server/conf/
target: /etc/postgresql/11/main/
source: ./../postgresql-base/conf/
target: /etc/postgresql/13/main/
6 changes: 4 additions & 2 deletions apps/common/.idea/common.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions apps/common/.idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions apps/common/.idea/mediawords.sql

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion apps/common/.idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions apps/common/.idea/sqlDataSources.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions apps/common/.idea/sqldialects.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions apps/common/docker-compose.tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ services:
MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME: "${MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME}"
MC_PUBLIC_STORE_TYPE: "postgresql"
MC_PUBLIC_STORE_SALT: "foo"
# Email address to point to in List-Unsubscribe email header.
# Technically we don't have a straightforward "unsubscribe" endpoint, but our
# emails are more likely to be marked spam if we don't have such a header, so
# we make the email subject "Delete account and unsubscribe" in
# mediawords/util/config/common.py
MC_EMAIL_UNSUBSCRIBE: "[email protected]"
volumes:
- type: bind
source: ./src/
Expand Down Expand Up @@ -91,8 +97,8 @@ services:
source: ./../postgresql-server/schema/
target: /opt/mediacloud/schema/
- type: bind
source: ./../postgresql-server/conf/
target: /etc/postgresql/11/main/
source: ./../postgresql-base/conf/
target: /etc/postgresql/13/main/

solr-shard-01:
image: gcr.io/mcback/solr-shard:latest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ sub stop_words_map($)
return $stop_words_map;
}

# FIXME remove once stopword comparison is over
#
# Forwards to stop_words_old_map() of the wrapped Python language object and
# hands its (scalar-context) result back to the caller.
sub stop_words_old_map($)
{
    my ( $self ) = @_;

    my $python_lang = $self->{ _python_lang };
    my $old_map = $python_lang->stop_words_old_map();

    return $old_map;
}

sub stem_words($$)
{
my ( $self, $words ) = @_;
Expand Down
59 changes: 47 additions & 12 deletions apps/common/src/python/mediawords/db/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,31 @@
import time
from typing import Optional

from mediawords.db.handler import DatabaseHandler
from mediawords.util.config.common import CommonConfig
from mediawords.util.config.common import CommonConfig, DatabaseConfig, ConnectRetriesConfig
from mediawords.util.log import create_logger
from mediawords.util.perl import decode_object_from_bytes_if_needed
from mediawords.util.process import fatal_error

log = create_logger(__name__)


def connect_to_db() -> DatabaseHandler:
"""Connect to PostgreSQL."""
class McConnectToDBError(Exception):
    """Raised by connect_to_db() once it is out of connection retries and the
    retries configuration's fatal_error_on_failure() is False."""


def connect_to_db(db_config: Optional[DatabaseConfig] = None) -> DatabaseHandler:
"""
Connect to PostgreSQL (via PgBouncer).
:param db_config: Optional DatabaseConfig parameter to specify connection retry parameters.
:return: DatabaseHandler object.
"""

if not db_config:
db_config = CommonConfig.database()

db_config = CommonConfig.database()
retries_config = db_config.retries()

assert retries_config.max_attempts() > 0, "max_tries can't be negative."
Expand Down Expand Up @@ -57,12 +70,34 @@ def connect_to_db() -> DatabaseHandler:
else:
log.info("Out of retries, giving up and exiting...")

# Don't throw any exceptions because they might be caught by
# the try-catch block, and so the caller will just assume that
# there was something wrong with the input data and proceed
# with processing next item in the job queue (e.g. the next
# story). Instead, just quit and wait for someone to restart
# the whole app that requires database access.
fatal_error(error_message)
if retries_config.fatal_error_on_failure():
# Don't throw any exceptions because they might be caught by
# the try-catch block, and so the caller will just assume that
# there was something wrong with the input data and proceed
# with processing next item in the job queue (e.g. the next
# story). Instead, just quit and wait for someone to restart
# the whole app that requires database access.
fatal_error(error_message)
else:
raise McConnectToDBError(error_message)

return db


def connect_to_db_or_raise() -> DatabaseHandler:
    """
    Connect to PostgreSQL with a single attempt and with fatal_error() disabled.

    Plain connect_to_db() by default tries to connect a few times and, on
    failure, calls fatal_error(), which stops the whole process. This shorthand
    instead limits connecting to one attempt and lets the failure surface as an
    exception. That suits workflows, where retrying is better left to Temporal.

    :return: DatabaseHandler object.
    :raises McConnectToDBError: if the connection attempt fails.
    """
    single_attempt = ConnectRetriesConfig(
        max_attempts=1,
        fatal_error_on_failure=False,
    )
    raising_config = DatabaseConfig(retries=single_attempt)

    return connect_to_db(db_config=raising_config)
Loading

0 comments on commit 6f9f239

Please sign in to comment.