Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Adds short description of what each app does #740

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions apps/docker-compose.dist.yml
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@ services:
#
# CLIFF annotator service
# -----------------------
# HTTP service with a machine learning model that does named entity recognition (names, places, brand names, etc.);
# see https://github.com/mediacloud/backend/blob/master/apps/cliff-base/src/python/cliff_base/sample_data.py for a list.
#
cliff-annotator:
image: dockermediacloud/cliff-annotator:release
Expand All @@ -290,6 +292,7 @@ services:
#
# CLIFF fetch annotation
# ----------------------
# fetches named entities from cliff-annotator, stores them in a PostgreSQL table as a compressed blob.
#
cliff-fetch-annotation:
image: dockermediacloud/cliff-fetch-annotation:release
Expand Down Expand Up @@ -317,6 +320,7 @@ services:
#
# CLIFF update story tags
# -----------------------
# fetches compressed blob from PostgreSQL table, parses JSON, tags stories with entities
#
cliff-update-story-tags:
image: dockermediacloud/cliff-update-story-tags:release
Expand Down Expand Up @@ -352,6 +356,8 @@ services:
#
# AP crawler
# ----------
# worker that (used to) fetch content via Associated Press API;
# long broken, but could serve as a reference for how to ingest content from various "custom" APIs
#
crawler-ap:
image: dockermediacloud/crawler-ap:release
Expand Down Expand Up @@ -380,6 +386,8 @@ services:
#
# Crawler fetcher
# ---------------
# Polls the PostgreSQL table managed by crawler-provider and fetches downloads enqueued on that table,
# stores their content, adds extract-and-vector jobs to get the content extracted;
#
crawler-fetcher:
image: dockermediacloud/crawler-fetcher:release
Expand Down Expand Up @@ -410,6 +418,7 @@ services:
#
# Crawler provider
# ----------------
# using the feeds table, manages PostgreSQL table with a queue of (RSS) feeds / news articles (stories) to be fetched.
#
crawler-provider:
image: dockermediacloud/crawler-provider:release
Expand All @@ -433,6 +442,7 @@ services:
#
# Create missing PostgreSQL partitions
# ------------------------------------
# Lingers around and tries to create missing PostgreSQL table partitions for the upcoming rows;
#
create-missing-partitions:
image: dockermediacloud/create-missing-partitions:release
Expand All @@ -456,6 +466,7 @@ services:
#
# Generate daily RSS dumps Cron job
# ---------------------------------
# dumps all stories collected every day into an RSS feed for external users to download;
#
cron-generate-daily-rss-dumps:
image: dockermediacloud/cron-generate-daily-rss-dumps:release
Expand Down Expand Up @@ -486,6 +497,7 @@ services:
#
# Generate media health report Cron job
# -------------------------------------
# generates daily (?) media health reports, i.e. tries to find out which media sources are dead;
#
cron-generate-media-health:
image: dockermediacloud/cron-generate-media-health:release
Expand All @@ -509,6 +521,7 @@ services:
#
# Generate daily / weekly user summary Cron job
# ---------------------------------------------
# generates a daily report of new users who have signed up
#
cron-generate-user-summary:
image: dockermediacloud/cron-generate-user-summary:release
Expand All @@ -532,6 +545,7 @@ services:
#
# Print long running job states
# -----------------------------
# Tries to periodically identify which Celery jobs are running for a long time, and if they are, which "state" they have reported last;
#
cron-print-long-running-job-states:
image: dockermediacloud/cron-print-long-running-job-states:release
Expand All @@ -555,6 +569,7 @@ services:
#
# Refresh stats Cron job
# ----------------------
# updates some daily stats (https://github.com/mediacloud/backend/blob/master/apps/common/src/perl/MediaWords/DBI/Stats.pm)
#
cron-refresh-stats:
image: dockermediacloud/cron-refresh-stats:release
Expand All @@ -578,6 +593,8 @@ services:
#
# Add due media to the rescraping queue Cron job
# ----------------------------------------------
# Periodically adds new rescrape-media Celery jobs for every media source
# so that we would become aware of new / updated / deleted RSS feeds in each of these media sources;
#
cron-rescrape-due-media:
image: dockermediacloud/cron-rescrape-due-media:release
Expand All @@ -602,6 +619,7 @@ services:
#
# Report rescraping changes Cron job
# ----------------------------------
# Prints reports on how well cron-rescrape-due-media did its job;
#
cron-rescraping-changes:
image: dockermediacloud/cron-rescraping-changes:release
Expand All @@ -625,6 +643,8 @@ services:
#
# Set media primary language Cron job
# -----------------------------------
# Identifies media sources for which we haven't determined their "primary language"
# (e.g. English for BBC UK or French for Le Monde) and tries to do that.
#
cron-set-media-primary-language:
image: dockermediacloud/cron-set-media-primary-language:release
Expand All @@ -648,6 +668,7 @@ services:
#
# Set media subject country Cron job
# -----------------------------------
# Identifies media sources for which we haven't determined their countries and tries to do that.
#
cron-set-media-subject-country:
image: dockermediacloud/cron-set-media-subject-country:release
Expand All @@ -671,6 +692,8 @@ services:
#
# Extract and vector stories
# --------------------------
# Tries to extract plain text from HTML pages of news articles (stories) fetched by crawler-fetcher,
# determines each story's language, tokenizes it into sentences to do deduplication;
#
extract-and-vector:
image: dockermediacloud/extract-and-vector:release
Expand Down Expand Up @@ -698,6 +721,7 @@ services:
#
# Extract article HTML from page HTML
# -----------------------------------
# HTTP service that does the actual HTML -> plain text extracting;
#
extract-article-from-page:
image: dockermediacloud/extract-article-from-page:release
Expand All @@ -724,6 +748,7 @@ services:
#
# Fetch story stats from Facebook
# -------------------------------
# Fetches Facebook statistics (IIRC share and comment counts) for newly added stories
#
facebook-fetch-story-stats:
image: dockermediacloud/facebook-fetch-story-stats:release
Expand Down Expand Up @@ -759,6 +784,7 @@ services:
#
# Import stories into Solr
# ------------------------
# Daemon process which periodically imports new stories to Solr
#
import-solr-data:
image: dockermediacloud/import-solr-data:release
Expand Down Expand Up @@ -786,6 +812,7 @@ services:
#
# Import stories by scraping Feedly
# ---------------------------------
# One-off script (not a service) that imports stories from Feedly;
#
import-stories-feedly:
image: dockermediacloud/import-stories-feedly:release
Expand All @@ -810,6 +837,7 @@ services:
#
# OpenDKIM server
# ---------------
# Signs emails sent out with mail-postfix-server with DKIM (DomainKeys Identified Mail);
#
mail-opendkim-server:
image: dockermediacloud/mail-opendkim-server:release
Expand Down Expand Up @@ -840,6 +868,7 @@ services:
#
# Postfix server
# ---------------
# SMTP server which listens on port 25 and sends emails from the rest of the system (registration form, various periodic reports, monitoring alerts, etc.)
#
mail-postfix-server:
image: dockermediacloud/mail-postfix-server:release
Expand Down Expand Up @@ -891,6 +920,7 @@ services:
#
# Munin Cron stats collector
# --------------------------
# Munin's (our monitoring system's) Cron script which fetches the monitored stats every 5 minutes;
#
munin-cron:
image: dockermediacloud/munin-cron:release
Expand Down Expand Up @@ -924,6 +954,7 @@ services:
#
# Munin FastCGI graph generator
# -----------------------------
# FastCGI workers for Munin's HTTP webapp;
#
munin-fastcgi-graph:
image: dockermediacloud/munin-fastcgi-graph:release
Expand Down Expand Up @@ -990,6 +1021,7 @@ services:
#
# Munin node
# ----------
# Munin's stat collector
#
munin-node:
image: dockermediacloud/munin-node:release
Expand Down Expand Up @@ -1017,6 +1049,8 @@ services:
#
# NYT-Based News Tagger service
# -----------------------------
# Somewhat similar to CLIFF, this service tries to guess what the story is about, e.g. US elections, gardening, Nairobi, the Moon, etc.;
# works with English language content only
#
nytlabels-annotator:
image: dockermediacloud/nytlabels-annotator:release
Expand All @@ -1040,6 +1074,7 @@ services:
#
# NYTLabels fetch annotation
# ----------------------
# Same as with cliff-fetch-annotation, just with NYTLabels;
#
nytlabels-fetch-annotation:
image: dockermediacloud/nytlabels-fetch-annotation:release
Expand Down Expand Up @@ -1067,6 +1102,7 @@ services:
#
# NYTLabels update story tags
# -----------------------
# Same as with cliff-update-story-tags, just with NYTLabels;
#
nytlabels-update-story-tags:
image: dockermediacloud/nytlabels-update-story-tags:release
Expand Down Expand Up @@ -1097,6 +1133,10 @@ services:
#
# Fetch story podcast episode and store it in GCS
# -----------------------------------------------
# Fetches audio files of podcasts (from podcast feeds),
# transcodes them to something that Google's speech detection can understand (e.g. WAV),
# tries to guess the language from the podcast episode's description;
# uploads it to Google Cloud's object store, adds podcast-submit-operation job;
#
podcast-fetch-episode:
image: dockermediacloud/podcast-fetch-episode:release
Expand Down Expand Up @@ -1133,6 +1173,9 @@ services:
#
# Fetch finished transcripts and store them locally
# -------------------------------------------------
# Fetches transcripts from Google Cloud's speech API, stores them locally;
# if the transcript is not done yet, makes sure that podcast-poll-due-operations will add a new podcast-fetch-transcript job after a few minutes;
#
podcast-fetch-transcript:
image: dockermediacloud/podcast-fetch-transcript:release
init: true
Expand Down Expand Up @@ -1164,6 +1207,9 @@ services:
#
# Poll due operations and submit them to "podcast-fetch-transcript"
# -----------------------------------------------------------------
# Polls a PostgreSQL table for speech transcription operations (added by podcast-submit-operation)
# which should be done by now (assumes that it will take Google 1 min to transcribe 1 min of speech), adds jobs to podcast-fetch-transcript;
#
podcast-poll-due-operations:
image: dockermediacloud/podcast-poll-due-operations:release
init: true
Expand Down Expand Up @@ -1194,6 +1240,8 @@ services:
#
# Submit a Speech API operation for a podcast episode
# ---------------------------------------------------
# Adds a new speech transcription job for each podcast story;
#
podcast-submit-operation:
image: dockermediacloud/podcast-submit-operation:release
init: true
Expand Down Expand Up @@ -1225,6 +1273,7 @@ services:
#
# pgAdmin
# ---------
# pgAdmin4 instance to manage PostgreSQL
#
postgresql-pgadmin:
image: dockermediacloud/postgresql-pgadmin:release
Expand Down Expand Up @@ -1254,6 +1303,7 @@ services:
#
# PgBouncer
# ---------
# pgBouncer instance that does connection pooling in front of postgresql-server.
#
postgresql-pgbouncer:
image: dockermediacloud/postgresql-pgbouncer:release
Expand Down Expand Up @@ -1306,6 +1356,7 @@ services:
#
# Purge PostgreSQL object caches
# ------------------------------------
# We use (or at least used to use) a few PostgreSQL tables to do some object caching (think Redis), so this script periodically cleans up said cache
#
purge-object-caches:
image: dockermediacloud/purge-object-caches:release
Expand All @@ -1329,6 +1380,7 @@ services:
#
# RabbitMQ
# --------
# RabbitMQ server, the backbone of the Celery-based jobs system
#
rabbitmq-server:
image: dockermediacloud/rabbitmq-server:release
Expand Down Expand Up @@ -1393,6 +1445,8 @@ services:
#
# (Re)scrape media
# ----------------
# For each new / updated (by cron-rescrape-due-media service) media source,
# crawls their webpage a bit in an attempt to find RSS / Atom feeds that we could then set up for periodic fetching.
#
rescrape-media:
image: dockermediacloud/rescrape-media:release
Expand All @@ -1419,6 +1473,7 @@ services:
#
# Fetch sitemap pages from media
# ------------------------------
# Part of our sitemap XML ingestion experiment
#
sitemap-fetch-media-pages:
image: dockermediacloud/sitemap-fetch-media-pages:release
Expand Down Expand Up @@ -1692,6 +1747,7 @@ services:
#
# Solr ZooKeeper
# --------------
# ZooKeeper which manages Solr's configuration, keeps track of live shards, etc.
#
solr-zookeeper:
image: dockermediacloud/solr-zookeeper:release
Expand Down