This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Add monitoring to the rabbitmq service #840

Open
wants to merge 16 commits into base: master
Changes from all commits (16 commits)

2 changes: 2 additions & 0 deletions apps/common/Dockerfile
@@ -143,6 +143,8 @@ COPY bin/build_jieba_dict_cache.py /
RUN \
/build_jieba_dict_cache.py && \
rm /build_jieba_dict_cache.py && \
chown mediacloud:mediacloud /var/tmp/jieba.cache && \
ls -l /var/tmp/jieba.cache && \
true

# Symlink Log::Log4perl configuration to where it's going to be found
6 changes: 4 additions & 2 deletions apps/common/src/python/mediawords/solr/request.py
@@ -24,6 +24,8 @@
__QUERY_HTTP_TIMEOUT = 15 * 60
"""Timeout of a single HTTP query."""

# Testing alias: 'mediacloud2' is a collection alias (see apps/solr-base/src/solr/aliases.json)
SOLR_COLLECTION = 'mediacloud2'

class _AbstractSolrRequestException(Exception, metaclass=abc.ABCMeta):
"""Abstract .solr.request exception."""
@@ -59,7 +61,7 @@ def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
"""Wait for Solr to start and collections to become available, if needed."""

# search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"
sample_select_url = f"{config.solr_url()}/{SOLR_COLLECTION}/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

connected = False

@@ -191,7 +193,7 @@ def solr_request(path: str,
if not params:
params = {}

abs_uri = furl(f"{solr_url}/mediacloud/{path}")
abs_uri = furl(f"{solr_url}/{SOLR_COLLECTION}/{path}")
abs_uri = abs_uri.set(params)
abs_url = str(abs_uri)

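A quick sketch (not part of the PR) of how the new SOLR_COLLECTION constant ends up in the request URLs built by solr_request(); the host and port are hypothetical, and 'mediacloud2' is the collection alias added in apps/solr-base/src/solr/aliases.json.

    # Sketch only -- mirrors the furl-based URL construction in solr_request().
    from furl import furl

    SOLR_COLLECTION = 'mediacloud2'  # alias resolving to "mediacloud64,mediacloud"

    solr_url = "http://solr-shard-01:8983/solr"  # hypothetical value of config.solr_url()
    abs_uri = furl(f"{solr_url}/{SOLR_COLLECTION}/select")
    abs_uri = abs_uri.set({"q": "BOGUSQUERYTHATRETURNSNOTHINGNADA", "rows": 1, "wt": "json"})
    print(str(abs_uri))
    # http://solr-shard-01:8983/solr/mediacloud2/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json
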
10 changes: 10 additions & 0 deletions apps/common/src/python/mediawords/util/config/__init__.py
@@ -46,6 +46,16 @@ def env_value(name: str, required: bool = True, allow_empty_string: bool = False

return value

def env_bool(name: str, default: bool = False) -> bool:
"""
Retrieve boolean from environment variable; should be 0 or 1.

:param name: Environment variable name.
:param default: default value, if no value found.
"""

value = os.environ.get(name, default)
return bool(int(value))

def file_with_env_value(name: str, allow_empty_string: bool = False, encoded_with_base64: bool = False) -> str:
"""
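A minimal usage sketch for the new env_bool() helper (the flag name matches the one this PR reads later in extract_and_vector_worker.py; the value must be 0 or 1, per the docstring):

    import os
    from mediawords.util.config import env_bool

    os.environ['MC_NO_DEDUP_SENTENCES'] = '0'                # e.g. set in the service's environment
    print(env_bool('MC_NO_DEDUP_SENTENCES', default=True))   # False: "0" -> int 0 -> bool False

    del os.environ['MC_NO_DEDUP_SENTENCES']
    print(env_bool('MC_NO_DEDUP_SENTENCES', default=True))   # True: variable unset, default returned
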
4 changes: 4 additions & 0 deletions apps/common/src/requirements.txt
@@ -43,6 +43,10 @@ furl==2.1.0
# Chinese language tokenizer, stemmer, etc.
jieba==0.42.1

# Pin MarkupSafe for Jinja2 2.11.3: Jinja2 requires MarkupSafe>=0.23, which would now
# resolve to 2.1.1, a release that removed a deprecated function Jinja2 still uses.
MarkupSafe==2.0.1

# Parsing email templates
Jinja2==2.11.3

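On the MarkupSafe pin above, a short sketch of the breakage it avoids (assuming the usual failure mode of Jinja2 2.11.x against newer MarkupSafe; the exact import site inside Jinja2 varies by module):

    try:
        from markupsafe import soft_unicode  # deprecated helper, removed in MarkupSafe 2.1.0
        import jinja2                        # imports cleanly against MarkupSafe 2.0.1
    except ImportError as ex:
        # With MarkupSafe >= 2.1 installed, Jinja2 2.11.x fails roughly like this.
        print(f"MarkupSafe too new for Jinja2 2.11.x: {ex}")
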
42 changes: 39 additions & 3 deletions apps/docker-compose.dist.yml
@@ -1813,7 +1813,8 @@ services:
placement:
constraints:
# Must run on the host with Temporal Grafana data volume
- node.labels.role-temporal-grafana == true
# - node.labels.role-temporal-grafana == true
- node.labels.role-monitoring == true
# Worker count
replicas: 1
resources:
@@ -1909,7 +1910,8 @@ services:
placement:
constraints:
# Must run on the host with Temporal Prometheus data volume
- node.labels.role-temporal-prometheus == true
# - node.labels.role-temporal-prometheus == true
- node.labels.role-monitoring == true
# Worker count
replicas: 1
resources:
@@ -2237,7 +2239,33 @@ services:
# RAM limit
memory: "2G"


#
# Temporal Alertmanager (alerting on Temporal's Prometheus metrics)
# ------------------------------------------------------------------
#
temporal-alertmanager:
image: thepsalmist/temporal-alertmanager:release_monitoring_v2
init: true
depends_on:
- temporal-prometheus
networks:
- default
expose:
- "9093"
volumes:
- vol_temporal_alertmanager_data:/opt/alertmanager/data/
deploy:
<<: *endpoint-mode-dnsrr
placement:
constraints:
# Must run on the host with Temporal Alertmanager data volume
- node.labels.role-monitoring == true
# Worker count
replicas: 1
resources:
limits:
cpus: "1"
memory: "1G"
#
# Networks
# ========
@@ -2544,3 +2572,11 @@ volumes:
type: none
o: bind
device: /space/mediacloud/vol_temporal_grafana_data

# Temporal Alertmanager data
vol_temporal_alertmanager_data:
driver: local
driver_opts:
type: none
o: bind
device: /space/mediacloud/vol_temporal_alertmanager_data
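A minimal reachability check for the temporal-alertmanager service added above, runnable from any container on the same overlay network; it assumes the service name resolves via Docker's DNS and that the stock Alertmanager health endpoint (/-/healthy on the exposed port 9093) is unchanged in the custom image:

    import requests

    # Port 9093 is the one exposed in the compose service definition above.
    resp = requests.get("http://temporal-alertmanager:9093/-/healthy", timeout=5)
    print(resp.status_code, resp.text.strip())  # expect 200 and a short "healthy"-style body
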
5 changes: 4 additions & 1 deletion apps/extract-and-vector/bin/extract_and_vector_worker.py
@@ -4,6 +4,7 @@

from mediawords.db import connect_to_db
from mediawords.job import JobBroker
from mediawords.util.config import env_bool
from mediawords.util.log import create_logger
from mediawords.util.perl import decode_object_from_bytes_if_needed
from extract_and_vector.dbi.stories.extractor_arguments import PyExtractorArguments
@@ -69,8 +70,10 @@ def run_extract_and_vector(stories_id: int, use_cache: bool = False, use_existin

log.info("Extracting story {}...".format(stories_id))

no_dedup_sentences = env_bool('MC_NO_DEDUP_SENTENCES', True)
try:
extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing)
extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing,
no_dedup_sentences=no_dedup_sentences)
extract_and_process_story(db=db, story=story, extractor_args=extractor_args)

except Exception as ex:
6 changes: 4 additions & 2 deletions apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm
@@ -55,7 +55,7 @@ Readonly my @SOLR_FIELDS => qw/stories_id media_id publish_date publish_day publ
text title language processed_stories_id tags_id_stories timespans_id/;

# how many sentences to fetch at a time from the postgres query
Readonly my $FETCH_BLOCK_SIZE => 100;
Readonly my $FETCH_BLOCK_SIZE => 200;

# default time to sleep when there are fewer than MIN_STORIES_TO_PROCESS:
Readonly my $DEFAULT_THROTTLE => 60;
@@ -601,6 +601,7 @@ Options:
* throttle -- sleep this number of seconds between each block of stories (default 60)
* full -- shortcut for: update=false, empty_queue=true, throttle=1; assume and optimize for static queue
* skip_logging -- skip logging the import into the solr_import_stories or solr_imports tables (default=false)
* skip_update_snapshot -- skip setting snapshots.searchable=true (default=true)

The import will run in blocks of "max_queued_stories" at a time. The function
will keep trying to find stories to import. If there are less than
@@ -627,6 +628,7 @@ sub import_data($;$)
my $empty_queue = $options->{ empty_queue } // 0;
my $throttle = $options->{ throttle } // $DEFAULT_THROTTLE;
my $skip_logging = $options->{ skip_logging } // 0;
my $skip_update_snapshot = $options->{ skip_update_snapshot } // 1;
my $daemon = $options->{ daemon } // 0;

$_last_max_queue_stories_id = 0;
@@ -669,7 +671,7 @@ sub import_data($;$)
_save_import_log( $db, $stories_ids );
}

if ( !$skip_logging )
if ( !$skip_logging && !$skip_update_snapshot )
{
_update_snapshot_solr_status( $db );
}
3 changes: 2 additions & 1 deletion apps/postgresql-pgbouncer/conf/pgbouncer.ini
@@ -1,5 +1,6 @@
[databases]
* = host=postgresql-server port=5432 user=mediacloud
; PhilB 5/6/22: PG server running on postgresql EC2 server w/o docker
* = host=172.30.0.58 port=5432 user=mediacloud

[pgbouncer]

3 changes: 2 additions & 1 deletion apps/postgresql-server/bin/apply_migrations.sh
@@ -14,7 +14,8 @@ MIGRATIONS_DIR="/opt/postgresql-server/pgmigrate/migrations"
TEMP_PORT=12345

# In case the database is in recovery, wait for up to 1 hour for it to complete
PGCTL_START_TIMEOUT=3600
# PLB: increased to three hours
PGCTL_START_TIMEOUT=10800

if [ ! -d "${MIGRATIONS_DIR}" ]; then
echo "Migrations directory ${MIGRATIONS_DIR} does not exist."
2 changes: 1 addition & 1 deletion apps/rabbitmq-server/Dockerfile
@@ -2,7 +2,7 @@
# RabbitMQ server
#

FROM gcr.io/mcback/base:latest
FROM gcr.io/mcback/base:release

# Add RabbitMQ APT repository
RUN \
2 changes: 1 addition & 1 deletion apps/rabbitmq-server/conf/enabled_plugins
@@ -1 +1 @@
[rabbitmq_amqp1_0,rabbitmq_management,rabbitmq_management_visualiser,rabbitmq_shovel,rabbitmq_shovel_management].
[rabbitmq_amqp1_0,rabbitmq_management,rabbitmq_management_visualiser,rabbitmq_shovel,rabbitmq_shovel_management,rabbitmq_prometheus].
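The newly enabled rabbitmq_prometheus plugin is the core of this PR's RabbitMQ monitoring: it serves Prometheus exposition-format metrics over HTTP, by default on port 15692 at /metrics. A minimal scrape sketch (the 'rabbitmq-server' host name is an assumption based on the app's service naming; Prometheus itself would scrape the same URL):

    import requests

    resp = requests.get("http://rabbitmq-server:15692/metrics", timeout=5)
    # Print the first few exposition-format lines, e.g. "# TYPE rabbitmq_... gauge".
    for line in resp.text.splitlines()[:5]:
        print(line)
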
13 changes: 13 additions & 0 deletions apps/solr-base/Dockerfile
@@ -19,5 +19,18 @@ RUN \
RUN mkdir -p /usr/src/
COPY src/solr/ /usr/src/solr/

# Try to create 64-bit enabled mediacloud64 collection by cloning config
# NOTE: collections/mediacloud/conf/solrconfig.xml uses
# ${mediacloud.luceneMatchVersion} ${mediacloud.solr_webapp_dir} ${mediacloud.solr_dist_dir}
# which reference JVM properties set in solr-shard/bin/solr-shard.sh
# ALSO: core.properties has "instanceDir=/var/lib/solr/mediacloud" (dir does not exist?!)
# which the sed below rewrites to .../mediacloud64 (also does not exist)
RUN \
mkdir -p /usr/src/solr/collections/mediacloud64 && \
cp -rp /usr/src/solr/collections/mediacloud/* /usr/src/solr/collections/mediacloud64/ && \
sed -i.32 's/mediacloud/mediacloud64/' /usr/src/solr/collections/mediacloud64/core.properties && \
sed -i.32 '/<field name=.*type="int"/s/"int"/"long"/' /usr/src/solr/collections/mediacloud64/conf/schema.xml && \
true

# Add user that Solr will run as
RUN useradd -ms /bin/bash solr
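For reference on the int-to-long sed above: Solr "int" fields are 32-bit, so the cloned mediacloud64 collection widens them to 64-bit "long" fields, presumably so IDs beyond the 32-bit ceiling still fit (hence "64-bit enabled"). The bounds:

    # Largest values a Solr "int" (32-bit) vs. "long" (64-bit) field can hold.
    print(2**31 - 1)  # 2147483647 -- ceiling for the original "int" fields
    print(2**63 - 1)  # 9223372036854775807 -- ceiling after switching to "long"
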
1 change: 1 addition & 0 deletions apps/solr-base/src/solr/aliases.json
@@ -0,0 +1 @@
{"collection":{"mediacloud2":"mediacloud64,mediacloud"}}
7 changes: 7 additions & 0 deletions apps/solr-zookeeper/bin/init_solr_config.sh
@@ -41,5 +41,12 @@ for collection_path in /usr/src/solr/collections/*; do
fi
done

ALIASES=/usr/src/solr/aliases.json
if [ -f $ALIASES ]; then
/opt/solr/server/scripts/cloud-scripts/zkcli.sh \
-zkhost 127.0.0.1:2181 \
-cmd putfile /aliases.json $ALIASES
fi

# Stop after initial configuration
pkill java
92 changes: 92 additions & 0 deletions apps/temporal-alertmanager/.dockerignore
@@ -0,0 +1,92 @@
#
# Files from the build context to be ignored by "docker build".
#
# You might want to add as many of the constantly changing files here as possible
# to prevent the container's image from getting rebuilt every full moon.
#
# Unfortunately, we can't just symlink this file to every app's directory:
#
# https://github.com/moby/moby/issues/12886
#
# so for the time being you have to manually copy this file to every app
# subdirectory:
#
# cd apps/
# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \;
#

*$py.class
*.cover
*.DS_Store
*.egg
*.egg-info/
*.log
*.manifest
*.mo
*.pot
*.py[cod]
*.sage.py
*.so
*.spec
*.swp
*/*.py[cod]
*/*.swp
*/*/*.py[cod]
*/*/*.swp
*/*/*/*.py[cod]
*/*/*/*.swp
*/*/*/__pycache__/
*/*/__pycache__/
*/__pycache__/
._*
.apdisk
.AppleDB
.AppleDesktop
.AppleDouble
.cache
.com.apple.timemachine.donotpresent
.coverage
.coverage.*
.dockerignore
.DocumentRevisions-V100
.DS_Store
.eggs
.env
.fseventsd
.git
.gitignore
.hypothesis
.idea
.installed.cfg
.ipynb_checkpoints
.LSOverride
.mypy_cache
.pytest_cache
.Python
.python-version
.ropeproject
.scrapy
.Spotlight-V100
.spyderproject
.spyproject
.TemporaryItems
.tox
.Trashes
.venv
.VolumeIcon.icns
.webassets-cache
__pycache__
celerybeat-schedule
coverage.xml
Icon
local_settings.py
Network Trash Folder
nosetests.xml
parts
pip-delete-this-directory.txt
pip-log.txt
sdist
Temporary Items
wheels
_Inline

28 changes: 28 additions & 0 deletions apps/temporal-alertmanager/Dockerfile
@@ -0,0 +1,28 @@
FROM gcr.io/mcback/base:release

RUN \
mkdir -p /opt/alertmanager/ && \
/dl_to_stdout.sh "https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-$(dpkg --print-architecture).tar.gz" | \
tar -zx -C /opt/alertmanager/ --strip 1 && \
true

COPY alertmanager.yml /opt/alertmanager/alertmanager.yml

# Add unprivileged user the service will run as
RUN \
useradd -ms /bin/bash temporal && \
mkdir -p /opt/alertmanager/data/ && \
chown temporal:temporal /opt/alertmanager/data/ && \
true

WORKDIR /opt/alertmanager/

ENV PATH="/opt/alertmanager:${PATH}"

EXPOSE 9093

USER temporal

VOLUME /opt/alertmanager/data

CMD ["alertmanager"]
14 changes: 14 additions & 0 deletions apps/temporal-alertmanager/alertmanager.yml
@@ -0,0 +1,14 @@
route:
receiver: 'mail'
repeat_interval: 4h
group_by: [ alertname ]


receivers:
- name: 'mail'
email_configs:
- smarthost: ${EMAIL_HOST}
auth_username: ${EMAIL_USERNAME}
auth_password: ${EMAIL_PASSWORD}
from: ${FROM_EMAIL}
to: ${TO_EMAIL}
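The ${...} placeholders above are not expanded by the Alertmanager binary itself (it reads its configuration file verbatim), so presumably the custom image substitutes them at start-up. A hedged sketch of such an envsubst-style step; the rendering mechanism is an assumption, and only the file path and variable names come from this PR:

    import os
    from string import Template

    CONFIG = "/opt/alertmanager/alertmanager.yml"

    # Replace ${EMAIL_HOST}, ${EMAIL_USERNAME}, ... with values from the container environment
    # before handing the file to alertmanager.
    with open(CONFIG) as f:
        rendered = Template(f.read()).substitute(os.environ)
    with open(CONFIG, "w") as f:
        f.write(rendered)
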
2 changes: 1 addition & 1 deletion apps/temporal-prometheus/Dockerfile
@@ -2,7 +2,7 @@
# Prometheus for Temporal stats
#

FROM gcr.io/mcback/base:latest
FROM gcr.io/mcback/base:release

RUN \
mkdir -p /opt/prometheus/ && \