Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Adds short description of what each app does #740

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions apps/docker-compose.dist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@ services:
#
# CLIFF annotator service
# -----------------------
# HTTP service with a machine learning model that does named entity recognition (names, places, brand names, etc.);
# see https://github.com/mediacloud/backend/blob/master/apps/cliff-base/src/python/cliff_base/sample_data.py for a list.
#
cliff-annotator:
image: dockermediacloud/cliff-annotator:release
Expand All @@ -290,6 +292,7 @@ services:
#
# CLIFF fetch annotation
# ----------------------
# fetches named entities from cliff-annotator, stores them in a PostgreSQL table as a compressed blob.
#
cliff-fetch-annotation:
image: dockermediacloud/cliff-fetch-annotation:release
Expand Down Expand Up @@ -317,6 +320,7 @@ services:
#
# CLIFF update story tags
# -----------------------
# fetches compressed blob from PostgreSQL table, parses JSON, tags stories with entities
#
cliff-update-story-tags:
image: dockermediacloud/cliff-update-story-tags:release
Expand Down Expand Up @@ -352,6 +356,8 @@ services:
#
# AP crawler
# ----------
# worker that (used to) fetch content via Associated Press API;
# long broken, but could serve as a reference for how to ingest content from various "custom" APIs
#
crawler-ap:
image: dockermediacloud/crawler-ap:release
Expand Down Expand Up @@ -380,6 +386,8 @@ services:
#
# Crawler fetcher
# ---------------
# Polls the PostgreSQL table managed by crawler-provider and fetches downloads enqueued on that table,
# stores their content, adds extract-and-vector jobs to get the content extracted;
#
crawler-fetcher:
image: dockermediacloud/crawler-fetcher:release
Expand Down Expand Up @@ -410,6 +418,7 @@ services:
#
# Crawler provider
# ----------------
# using the feeds table, manages PostgreSQL table with a queue of (RSS) feeds / news articles (stories) to be fetched.
#
crawler-provider:
image: dockermediacloud/crawler-provider:release
Expand All @@ -433,6 +442,7 @@ services:
#
# Create missing PostgreSQL partitions
# ------------------------------------
# Lingers around and tries to create missing PostgreSQL table partitions for the upcoming rows;
#
create-missing-partitions:
image: dockermediacloud/create-missing-partitions:release
Expand All @@ -456,6 +466,7 @@ services:
#
# Generate daily RSS dumps Cron job
# ---------------------------------
# dumps all stories collected every day into an RSS feed for external users to download;
#
cron-generate-daily-rss-dumps:
image: dockermediacloud/cron-generate-daily-rss-dumps:release
Expand Down Expand Up @@ -486,6 +497,7 @@ services:
#
# Generate media health report Cron job
# -------------------------------------
# generates daily (?) media health reports, i.e. tries to find out which media sources are dead;
#
cron-generate-media-health:
image: dockermediacloud/cron-generate-media-health:release
Expand All @@ -509,6 +521,7 @@ services:
#
# Generate daily / weekly user summary Cron job
# ---------------------------------------------
# generates a daily report of new users who have signed up
#
cron-generate-user-summary:
image: dockermediacloud/cron-generate-user-summary:release
Expand All @@ -532,6 +545,7 @@ services:
#
# Print long running job states
# -----------------------------
# Tries to periodically identify which Celery jobs are running for a long time, and if they are, which "state" they have reported last;
#
cron-print-long-running-job-states:
image: dockermediacloud/cron-print-long-running-job-states:release
Expand All @@ -555,6 +569,7 @@ services:
#
# Refresh stats Cron job
# ----------------------
# updates some daily stats (https://github.com/mediacloud/backend/blob/master/apps/common/src/perl/MediaWords/DBI/Stats.pm)
#
cron-refresh-stats:
image: dockermediacloud/cron-refresh-stats:release
Expand All @@ -578,6 +593,8 @@ services:
#
# Add due media to the rescraping queue Cron job
# ----------------------------------------------
# Periodically adds new rescrape-media Celery jobs for every media source
# so that we would become aware of new / updated / deleted RSS feeds in each of these media sources;
#
cron-rescrape-due-media:
image: dockermediacloud/cron-rescrape-due-media:release
Expand All @@ -602,6 +619,7 @@ services:
#
# Report rescraping changes Cron job
# ----------------------------------
# Prints reports on how well cron-rescrape-due-media did its job;
#
cron-rescraping-changes:
image: dockermediacloud/cron-rescraping-changes:release
Expand All @@ -625,6 +643,8 @@ services:
#
# Set media primary language Cron job
# -----------------------------------
# Identifies media sources for which we haven't determined their "primary language"
# (e.g. English for BBC UK or French for Le Monde) and tries to do that.
#
cron-set-media-primary-language:
image: dockermediacloud/cron-set-media-primary-language:release
Expand All @@ -648,6 +668,7 @@ services:
#
# Set media subject country Cron job
# -----------------------------------
# Identifies media sources for which we haven't determined their countries and tries to do that.
#
cron-set-media-subject-country:
image: dockermediacloud/cron-set-media-subject-country:release
Expand All @@ -671,6 +692,8 @@ services:
#
# Extract and vector stories
# --------------------------
# Tries to extract plain text from HTML pages of news articles (stories) fetched by crawler-fetcher,
# determines each story's language, tokenizes it into sentences to do deduplication;
#
extract-and-vector:
image: dockermediacloud/extract-and-vector:release
Expand Down Expand Up @@ -698,6 +721,7 @@ services:
#
# Extract article HTML from page HTML
# -----------------------------------
# HTTP service that does the actual HTML -> plain text extracting;
#
extract-article-from-page:
image: dockermediacloud/extract-article-from-page:release
Expand All @@ -724,6 +748,7 @@ services:
#
# Fetch story stats from Facebook
# -------------------------------
# Fetches Facebook statistics (IIRC share and comment counts) for newly added stories
#
facebook-fetch-story-stats:
image: dockermediacloud/facebook-fetch-story-stats:release
Expand Down Expand Up @@ -759,6 +784,7 @@ services:
#
# Import stories into Solr
# ------------------------
# Daemon process which periodically imports new stories to Solr
#
import-solr-data:
image: dockermediacloud/import-solr-data:release
Expand Down Expand Up @@ -786,6 +812,7 @@ services:
#
# Import stories by scraping Feedly
# ---------------------------------
# One-off script (not a service) that imports stories from Feedly;
#
import-stories-feedly:
image: dockermediacloud/import-stories-feedly:release
Expand All @@ -810,6 +837,7 @@ services:
#
# OpenDKIM server
# ---------------
# Signs emails sent out with mail-postfix-server with DKIM (DomainKeys Identified Mail);
#
mail-opendkim-server:
image: dockermediacloud/mail-opendkim-server:release
Expand Down Expand Up @@ -840,6 +868,7 @@ services:
#
# Postfix server
# ---------------
# SMTP server which listens on port 25 and sends emails from the rest of the system (registration form, various periodic reports, monitoring alerts, etc.)
#
mail-postfix-server:
image: dockermediacloud/mail-postfix-server:release
Expand Down Expand Up @@ -891,6 +920,7 @@ services:
#
# Munin Cron stats collector
# --------------------------
# Munin's (our monitoring system's) Cron script which fetches the monitored stats every 5 minutes;
#
munin-cron:
image: dockermediacloud/munin-cron:release
Expand Down Expand Up @@ -924,6 +954,7 @@ services:
#
# Munin FastCGI graph generator
# -----------------------------
# FastCGI workers for Munin's HTTP webapp;
#
munin-fastcgi-graph:
image: dockermediacloud/munin-fastcgi-graph:release
Expand Down Expand Up @@ -990,6 +1021,7 @@ services:
#
# Munin node
# ----------
# Munin's stat collector
#
munin-node:
image: dockermediacloud/munin-node:release
Expand Down Expand Up @@ -1017,6 +1049,8 @@ services:
#
# NYT-Based News Tagger service
# -----------------------------
# Somewhat similar to CLIFF, this service tries to guess what the story is about, e.g. US elections, gardening, Nairobi, the Moon, etc.;
# works with English language content only
#
nytlabels-annotator:
image: dockermediacloud/nytlabels-annotator:release
Expand All @@ -1040,6 +1074,7 @@ services:
#
# NYTLabels fetch annotation
# ----------------------
# Same as with cliff-fetch-annotation, just with NYTLabels;
#
nytlabels-fetch-annotation:
image: dockermediacloud/nytlabels-fetch-annotation:release
Expand Down Expand Up @@ -1067,6 +1102,7 @@ services:
#
# NYTLabels update story tags
# -----------------------
# Same as with cliff-update-story-tags, just with NYTLabels;
#
nytlabels-update-story-tags:
image: dockermediacloud/nytlabels-update-story-tags:release
Expand Down Expand Up @@ -1097,6 +1133,10 @@ services:
#
# Fetch story podcast episode and store it in GCS
# -----------------------------------------------
# Fetches audio files of podcasts (from podcast feeds),
# transcodes them to something that Google's speech detection can understand (e.g. WAV),
# tries to guess the language from the podcast episode's description;
# uploads it to Google Cloud's object store, adds podcast-submit-operation job;
#
podcast-fetch-episode:
image: dockermediacloud/podcast-fetch-episode:release
Expand Down Expand Up @@ -1133,6 +1173,9 @@ services:
#
# Fetch finished transcripts and store them locally
# -------------------------------------------------
# Fetches transcripts from Google Cloud's speech API, stores them locally;
# if the transcript is not done yet, makes sure that podcast-poll-due-operations will add a new podcast-fetch-transcript job after a few minutes;
#
podcast-fetch-transcript:
image: dockermediacloud/podcast-fetch-transcript:release
init: true
Expand Down Expand Up @@ -1164,6 +1207,9 @@ services:
#
# Poll due operations and submit them to "podcast-fetch-transcript"
# -----------------------------------------------------------------
# Polls a PostgreSQL table for speech transcription operations (added by podcast-submit-operation)
# which should be done by now (assumes that it will take Google 1 min to transcribe 1 min of speech), adds jobs to podcast-fetch-transcript;
#
podcast-poll-due-operations:
image: dockermediacloud/podcast-poll-due-operations:release
init: true
Expand Down Expand Up @@ -1194,6 +1240,8 @@ services:
#
# Submit a Speech API operation for a podcast episode
# ---------------------------------------------------
# Adds a new speech transcription job for each podcast story;
#
podcast-submit-operation:
image: dockermediacloud/podcast-submit-operation:release
init: true
Expand Down Expand Up @@ -1225,6 +1273,7 @@ services:
#
# pgAdmin
# ---------
# pgAdmin4 instance to manage PostgreSQL
#
postgresql-pgadmin:
image: dockermediacloud/postgresql-pgadmin:release
Expand Down Expand Up @@ -1254,6 +1303,7 @@ services:
#
# PgBouncer
# ---------
# pgBouncer instance that does connection pooling in front of postgresql-server.
#
postgresql-pgbouncer:
image: dockermediacloud/postgresql-pgbouncer:release
Expand Down Expand Up @@ -1306,6 +1356,7 @@ services:
#
# Purge PostgreSQL object caches
# ------------------------------------
# We use (or at least used to use) a few PostgreSQL tables to do some object caching (think Redis), so this script periodically cleans up said cache
#
purge-object-caches:
image: dockermediacloud/purge-object-caches:release
Expand All @@ -1329,6 +1380,7 @@ services:
#
# RabbitMQ
# --------
# RabbitMQ server, the backbone of the Celery-based jobs system
#
rabbitmq-server:
image: dockermediacloud/rabbitmq-server:release
Expand Down Expand Up @@ -1393,6 +1445,8 @@ services:
#
# (Re)scrape media
# ----------------
# For each new / updated (by cron-rescrape-due-media service) media source,
# crawls their webpage a bit in an attempt to find RSS / Atom feeds that we could then set up for periodic fetching.
#
rescrape-media:
image: dockermediacloud/rescrape-media:release
Expand All @@ -1419,6 +1473,7 @@ services:
#
# Fetch sitemap pages from media
# ------------------------------
# Part of our sitemap XML ingestion experiment
#
sitemap-fetch-media-pages:
image: dockermediacloud/sitemap-fetch-media-pages:release
Expand Down Expand Up @@ -1692,6 +1747,7 @@ services:
#
# Solr ZooKeeper
# --------------
# ZooKeeper which manages Solr's configuration, keeps track of live shards, etc.
#
solr-zookeeper:
image: dockermediacloud/solr-zookeeper:release
Expand Down