diff --git a/.gitattributes b/.gitattributes index 19ba67b5d3..b03edb6003 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,7 +9,7 @@ *.conf text eol=lf *.config text eol=lf -*.cpanfile text eol=lf +cpanfile text eol=lf *.css text eol=lf *.csv text eol=lf *.enabled_plugins text eol=lf diff --git a/.github/free-up-disk-space.sh b/.github/free-up-disk-space.sh index b0bbd19e09..acc6c85de8 100755 --- a/.github/free-up-disk-space.sh +++ b/.github/free-up-disk-space.sh @@ -25,6 +25,11 @@ sudo rm -f /swapfile echo "Cleaning APT cache..." sudo apt clean +echo "Removing some directories..." +sudo rm -rf /usr/local/lib/android/ +sudo rm -rf /usr/local/lib/node_modules/ +sudo rm -rf /usr/local/share/chromium/ + echo "Removing docker images..." docker rmi $(docker image ls -aq) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d40605f77f..417bddc68d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -180,9 +180,11 @@ jobs: MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY: ${{ secrets.MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY }} MC_FACEBOOK_APP_ID: ${{ secrets.MC_FACEBOOK_APP_ID }} MC_FACEBOOK_APP_SECRET: ${{ secrets.MC_FACEBOOK_APP_SECRET }} - MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: ${{ secrets.MC_PODCAST_FETCH_EPISODE_BUCKET_NAME }} + MC_PODCAST_AUTH_JSON_BASE64: ${{ secrets.MC_PODCAST_AUTH_JSON_BASE64 }} + MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME: ${{ secrets.MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME }} + MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME: ${{ secrets.MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME }} + MC_PODCAST_TRANSCRIPTS_BUCKET_NAME: ${{ secrets.MC_PODCAST_TRANSCRIPTS_BUCKET_NAME }} MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST: ${{ secrets.MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST }} - MC_PODCAST_GC_AUTH_JSON_BASE64: ${{ secrets.MC_PODCAST_GC_AUTH_JSON_BASE64 }} MC_TWITTER_ACCESS_TOKEN: ${{ secrets.MC_TWITTER_ACCESS_TOKEN }} MC_TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.MC_TWITTER_ACCESS_TOKEN_SECRET }} 
MC_TWITTER_CONSUMER_KEY: ${{ secrets.MC_TWITTER_CONSUMER_KEY }} diff --git a/.gitignore b/.gitignore index dba8fe1ceb..99845389ff 100644 --- a/.gitignore +++ b/.gitignore @@ -59,7 +59,9 @@ coverage.json **/.idea/**/dataSources/ **/.idea/**/dataSources.ids **/.idea/**/dataSources.local.xml -**/.idea/**/sqlDataSources.xml + +# Not ignoring **/.idea/**/sqlDataSources.xml as it points to ./.idea/mediawords.sql + **/.idea/**/dynamic.xml **/.idea/**/uiDesigner.xml **/.idea/**/dbnavigator.xml diff --git a/.gitmodules b/.gitmodules index 70ec6e112b..a6f576b719 100644 --- a/.gitmodules +++ b/.gitmodules @@ -25,12 +25,18 @@ [submodule "dev/quieter-docker-compose"] path = dev/quieter-docker-compose url = https://github.com/mediacloud/docker-compose-just-quieter.git -[submodule "apps/podcast-fetch-episode/tests/data/media-samples"] - path = apps/podcast-fetch-episode/tests/data/media-samples - url = https://github.com/mediacloud/podcast-media-samples.git -[submodule "apps/podcast-fetch-transcript/tests/data/media-samples"] - path = apps/podcast-fetch-transcript/tests/data/media-samples +[submodule "apps/podcast-transcribe-episode/tests/data/media-samples"] + path = apps/podcast-transcribe-episode/tests/data/media-samples url = https://github.com/mediacloud/podcast-media-samples.git [submodule "apps/elk-journalbeat/journald-log-sample"] path = apps/elk-journalbeat/journald-log-sample url = https://github.com/mediacloud/journald-log-sample.git +[submodule "apps/temporal-grafana/dashboards"] + path = apps/temporal-grafana/dashboards + url = https://github.com/temporalio/dashboards.git +[submodule "apps/temporal-server/config"] + path = apps/temporal-server/config + url = https://github.com/mediacloud/backend-temporal-server-config.git +[submodule "apps/temporal-postgresql/temporal-config"] + path = apps/temporal-postgresql/temporal-config + url = https://github.com/mediacloud/backend-temporal-server-config.git diff --git a/apps/base/Dockerfile b/apps/base/Dockerfile index 
6f12c7fdde..7fb5875dd1 100644 --- a/apps/base/Dockerfile +++ b/apps/base/Dockerfile @@ -7,7 +7,7 @@ # # https://hub.docker.com/_/ubuntu?tab=tags&page=1 -FROM ubuntu:focal-20210119 +FROM ubuntu:focal-20210416 ENV DEBIAN_FRONTEND=noninteractive \ LANG=en_US.UTF-8 \ @@ -48,6 +48,9 @@ RUN \ apt-get -y --no-install-recommends install \ # Quicker container debugging bash-completion \ + # "mail" utility which uses sendmail (provided by msmtp-mta) internally; + # some tools like munin-cron use "mail" to send emails + bsd-mailx \ curl \ htop \ # apt-key @@ -56,7 +59,8 @@ RUN \ iproute2 \ # Pinging other containers from within Compose environment iputils-ping \ - # Sending mail via sendmail utility through mail-postfix-server + # Provides "sendmail" utility which relays email through + # "mail-postfix-server" app msmtp \ msmtp-mta \ # Provides killall among other utilities @@ -67,8 +71,6 @@ RUN \ netcat \ # Some packages insist on logging to syslog rsyslog \ - # "mail" utility (which uses msmtp internally) - s-nail \ # Timezone data, used by many packages tzdata \ # Basic editor for files in container while debugging @@ -90,6 +92,25 @@ COPY bin/container_memory_limit.sh bin/container_cpu_limit.sh bin/dl_to_stdout.s # Copy MSMTP configuration COPY conf/msmtprc conf/msmtp-aliases /etc/ +# Both "sendmail" and "mail" utilities are important as they're used by various +# apps (e.g. munin-cron) to send us important email, and those apps aren't +# particularly vocal when they're unable to send email. So, for extra paranoia, +# verify that both utilities point to correct symlinks here. +RUN \ + if [ "$(readlink -- "/usr/sbin/sendmail")" != "../bin/msmtp" ]; then \ + echo "sendmail is not symlinked to msmtp, sending email won't work." && \ + exit 1; \ + fi; \ + if [ "$(readlink -- "/usr/bin/mail")" != "/etc/alternatives/mail" ]; then \ + echo "mail is not symlinked to /etc/alternatives/mail, sending email won't work." 
&& \ + exit 1; \ + fi; \ + if [ "$(readlink -- "/etc/alternatives/mail")" != "/usr/bin/bsd-mailx" ]; then \ + echo "mail is not symlinked to /etc/alternatives/mail, sending email won't work." && \ + exit 1; \ + fi; \ + true + # Generate and set locale RUN \ locale-gen en_US en_US.UTF-8 && \ diff --git a/apps/cliff-annotator/Dockerfile b/apps/cliff-annotator/Dockerfile index 7db0f3ae4d..df2ccad7e5 100644 --- a/apps/cliff-annotator/Dockerfile +++ b/apps/cliff-annotator/Dockerfile @@ -27,7 +27,7 @@ RUN \ # Install Tomcat 7 RUN \ mkdir -p /usr/lib/tomcat7/ && \ - /dl_to_stdout.sh "https://archive.apache.org/dist/tomcat/tomcat-7/v7.0.96/bin/apache-tomcat-7.0.96.tar.gz" | \ + /dl_to_stdout.sh "https://mediacloud-archive-apache-org.s3.amazonaws.com/apache-tomcat-7.0.96.tar.gz" | \ tar -zx -C /usr/lib/tomcat7/ --strip 1 && \ true diff --git a/apps/cliff-fetch-annotation-and-tag/.dockerignore b/apps/cliff-fetch-annotation-and-tag/.dockerignore index 752414ae9c..9b2c362a80 100644 --- a/apps/cliff-fetch-annotation-and-tag/.dockerignore +++ b/apps/cliff-fetch-annotation-and-tag/.dockerignore @@ -89,3 +89,4 @@ sdist Temporary Items wheels _Inline + diff --git a/apps/cliff-fetch-annotation-and-tag/.idea/cliff-fetch-annotation-and-tag.iml b/apps/cliff-fetch-annotation-and-tag/.idea/cliff-fetch-annotation-and-tag.iml index 4aaca228bb..5f8a5e5f93 100644 --- a/apps/cliff-fetch-annotation-and-tag/.idea/cliff-fetch-annotation-and-tag.iml +++ b/apps/cliff-fetch-annotation-and-tag/.idea/cliff-fetch-annotation-and-tag.iml @@ -1,8 +1,8 @@ - - + + diff --git a/apps/cliff-fetch-annotation-and-tag/.idea/mediawords.sql b/apps/cliff-fetch-annotation-and-tag/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cliff-fetch-annotation-and-tag/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cliff-fetch-annotation-and-tag/.idea/misc.xml 
b/apps/cliff-fetch-annotation-and-tag/.idea/misc.xml index 5914dad53f..907e6bae2a 100644 --- a/apps/cliff-fetch-annotation-and-tag/.idea/misc.xml +++ b/apps/cliff-fetch-annotation-and-tag/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/apps/cliff-fetch-annotation-and-tag/.idea/sqlDataSources.xml b/apps/cliff-fetch-annotation-and-tag/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..3228ec2234 --- /dev/null +++ b/apps/cliff-fetch-annotation-and-tag/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cliff-fetch-annotation-and-tag/.idea/sqldialects.xml b/apps/cliff-fetch-annotation-and-tag/.idea/sqldialects.xml index 790b3f37f8..92fefa2e78 100644 --- a/apps/cliff-fetch-annotation-and-tag/.idea/sqldialects.xml +++ b/apps/cliff-fetch-annotation-and-tag/.idea/sqldialects.xml @@ -1,6 +1,7 @@ + diff --git a/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml b/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml index 5b589005cd..a97835a29f 100644 --- a/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml +++ b/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml @@ -54,5 +54,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/common/.idea/common.iml b/apps/common/.idea/common.iml index 492339729c..bb46b8eafc 100644 --- a/apps/common/.idea/common.iml +++ b/apps/common/.idea/common.iml @@ -2,9 +2,12 @@ - + + + - \ No newline at end of file diff --git a/apps/common/.idea/inspectionProfiles/Project_Default.xml b/apps/common/.idea/inspectionProfiles/Project_Default.xml index 76ebfe820e..d3d52a9b48 100644 --- a/apps/common/.idea/inspectionProfiles/Project_Default.xml +++ b/apps/common/.idea/inspectionProfiles/Project_Default.xml @@ -1,6 +1,7 @@ \ No 
newline at end of file diff --git a/apps/common/.idea/sqlDataSources.xml b/apps/common/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..2cf8b2da55 --- /dev/null +++ b/apps/common/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/common/.idea/sqldialects.xml b/apps/common/.idea/sqldialects.xml index 790b3f37f8..92fefa2e78 100644 --- a/apps/common/.idea/sqldialects.xml +++ b/apps/common/.idea/sqldialects.xml @@ -1,6 +1,7 @@ + diff --git a/apps/common/docker-compose.tests.yml b/apps/common/docker-compose.tests.yml index f822f335f5..17126383ee 100644 --- a/apps/common/docker-compose.tests.yml +++ b/apps/common/docker-compose.tests.yml @@ -13,6 +13,12 @@ services: MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME: "${MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME}" MC_PUBLIC_STORE_TYPE: "postgresql" MC_PUBLIC_STORE_SALT: "foo" + # Email address to point to in List-Unsubscribe email header. + # Technically we don't have a straightforward "unsubscribe" endpoint, but our + # emails are more likely to be marked spam if we don't have such a header, so + # we make the email subject "Delete account and unsubscribe" in + # mediawords/util/config/common.py + MC_EMAIL_UNSUBSCRIBE: "support@example.com" volumes: - type: bind source: ./src/ @@ -91,8 +97,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm b/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm index 384318b8e6..67dad9a001 100644 --- a/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm +++ b/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm @@ -71,6 +71,15 @@ sub stop_words_map($) return $stop_words_map; 
} +# FIXME remove once stopword comparison is over +sub stop_words_old_map($) +{ + my $self = shift; + + my $stop_words_old_map = $self->{ _python_lang }->stop_words_old_map(); + return $stop_words_old_map; +} + sub stem_words($$) { my ( $self, $words ) = @_; diff --git a/apps/common/src/python/mediawords/db/__init__.py b/apps/common/src/python/mediawords/db/__init__.py index f03065ed5a..0b98f8fb16 100644 --- a/apps/common/src/python/mediawords/db/__init__.py +++ b/apps/common/src/python/mediawords/db/__init__.py @@ -1,18 +1,31 @@ import time +from typing import Optional from mediawords.db.handler import DatabaseHandler -from mediawords.util.config.common import CommonConfig +from mediawords.util.config.common import CommonConfig, DatabaseConfig, ConnectRetriesConfig from mediawords.util.log import create_logger -from mediawords.util.perl import decode_object_from_bytes_if_needed from mediawords.util.process import fatal_error log = create_logger(__name__) -def connect_to_db() -> DatabaseHandler: - """Connect to PostgreSQL.""" +class McConnectToDBError(Exception): + """Exception that gets raised if connect_to_db() runs out of retries and + db_config.retries.fatal_error_on_failure is set to False.""" + pass + + +def connect_to_db(db_config: Optional[DatabaseConfig] = None) -> DatabaseHandler: + """ + Connect to PostgreSQL (via PgBouncer). + + :param db_config: Optional DatabaseConfig parameter to specify connection retry parameters. + :return: DatabaseHandler object. + """ + + if not db_config: + db_config = CommonConfig.database() - db_config = CommonConfig.database() retries_config = db_config.retries() assert retries_config.max_attempts() > 0, "max_tries can't be negative." 
@@ -57,12 +70,34 @@ def connect_to_db() -> DatabaseHandler: else: log.info("Out of retries, giving up and exiting...") - # Don't throw any exceptions because they might be caught by - # the try-catch block, and so the caller will just assume that - # there was something wrong with the input data and proceed - # with processing next item in the job queue (e.g. the next - # story). Instead, just quit and wait for someone to restart - # the whole app that requires database access. - fatal_error(error_message) + if retries_config.fatal_error_on_failure(): + # Don't throw any exceptions because they might be caught by + # the try-catch block, and so the caller will just assume that + # there was something wrong with the input data and proceed + # with processing next item in the job queue (e.g. the next + # story). Instead, just quit and wait for someone to restart + # the whole app that requires database access. + fatal_error(error_message) + else: + raise McConnectToDBError(error_message) return db + + +def connect_to_db_or_raise() -> DatabaseHandler: + """ + Shorthand for connect_to_db() with its own retries and fatal_error() disabled. + + By default, connect_to_db() will attempt connecting to PostgreSQL a few times and would call fatal_error() on + failures and stop the whole process. + + Useful in workflows, i.e. it's better to leave all of the retrying to Temporal. 
+ """ + return connect_to_db( + db_config=DatabaseConfig( + retries=ConnectRetriesConfig( + max_attempts=1, + fatal_error_on_failure=False, + ) + ) + ) diff --git a/apps/common/src/python/mediawords/job/__init__.py b/apps/common/src/python/mediawords/job/__init__.py index 7a110d32e7..428599bc03 100644 --- a/apps/common/src/python/mediawords/job/__init__.py +++ b/apps/common/src/python/mediawords/job/__init__.py @@ -10,7 +10,7 @@ from mediawords.db import connect_to_db, DatabaseHandler from mediawords.db.locks import get_session_lock, release_session_lock from mediawords.job.states import STATE_QUEUED, STATE_RUNNING, STATE_COMPLETED, STATE_ERROR -from mediawords.util.config.common import CommonConfig +from mediawords.util.config.common import CommonConfig, RabbitMQConfig from mediawords.util.log import create_logger from mediawords.util.parse_json import encode_json, decode_json from mediawords.util.perl import decode_object_from_bytes_if_needed @@ -382,7 +382,7 @@ class JobBroker(object): '__queue_name', ] - def __init__(self, queue_name: str): + def __init__(self, queue_name: str, rabbitmq_config: Optional[RabbitMQConfig] = None): """ Create job broker object. 
@@ -397,7 +397,9 @@ def __init__(self, queue_name: str): config = CommonConfig() - rabbitmq_config = config.rabbitmq() + if not rabbitmq_config: + rabbitmq_config = config.rabbitmq() + broker_uri = 'amqp://{username}:{password}@{hostname}:{port}/{vhost}'.format( username=rabbitmq_config.username(), password=rabbitmq_config.password(), @@ -440,6 +442,19 @@ def __init__(self, queue_name: str): self.__app.conf.worker_max_tasks_per_child = 1000 + retries_config = rabbitmq_config.retries() + if retries_config: + self.__app.task_publish_retry = True + self.__app.task_publish_retry_policy = { + 'max_retries': retries_config.max_retries(), + 'interval_start': retries_config.interval_start(), + 'interval_step': retries_config.interval_step(), + 'interval_max': retries_config.interval_max(), + } + + else: + self.__app.task_publish_retry = False + queue = Queue( name=queue_name, exchange=Exchange(queue_name), diff --git a/apps/common/src/python/mediawords/languages/__init__.py b/apps/common/src/python/mediawords/languages/__init__.py index b985445af4..efbfac1fcb 100644 --- a/apps/common/src/python/mediawords/languages/__init__.py +++ b/apps/common/src/python/mediawords/languages/__init__.py @@ -50,7 +50,7 @@ class AbstractLanguage(object, metaclass=abc.ABCMeta): @abc.abstractmethod def language_code() -> str: """Return ISO 639-1 language code, e.g. 
'en'.""" - raise NotImplemented("Abstract method.") + raise NotImplementedError("Abstract method.") @staticmethod @abc.abstractmethod @@ -63,7 +63,7 @@ def sample_sentence() -> str: * Wikipedia * cld2-cffi's unit test: https://github.com/GregBowyer/cld2-cffi/blob/master/tests/test_cld.py """ - raise NotImplemented("Abstract method.") + raise NotImplementedError("Abstract method.") # MC_REWRITE_TO_PYTHON: use set after rewrite to Python @abc.abstractmethod @@ -81,6 +81,12 @@ def stop_words_map(self) -> Dict[str, bool]: """ raise NotImplementedError("Abstract method.") + # FIXME remove once stopword comparison is over + @abc.abstractmethod + def stop_words_old_map(self) -> Dict[str, bool]: + """Return map of old stopwords.""" + raise NotImplementedError("Abstract method.") + @abc.abstractmethod def stem_words(self, words: List[str]) -> List[str]: """Return list of stems for a list of words. @@ -283,6 +289,9 @@ def __init__(self): # Stop words map (lazy initialized) self.__stop_words_map = None + # FIXME remove once stopword comparison is over + self.__stop_words_old_map = None + def stop_words_map(self) -> Dict[str, bool]: """Return stop word map read from a file.""" if self.__stop_words_map is None: @@ -312,3 +321,33 @@ def stop_words_map(self) -> Dict[str, bool]: self.__stop_words_map = stop_words return self.__stop_words_map + + # FIXME remove once stopword comparison is over + def stop_words_old_map(self) -> Dict[str, bool]: + if self.__stop_words_old_map is None: + + stop_words_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + self.language_code(), + '%s_stop_words_old.txt' % self.language_code(), + ) + if stop_words_path is None: + raise McLanguageException("Stop words file path is None.") + + if not os.path.isfile(stop_words_path): + raise McLanguageException("Stop words file does not exist at path '%s'." 
% stop_words_path) + + stop_words = dict() + with open(stop_words_path, 'r', encoding='utf-8') as f: + for stop_word in f.readlines(): + # Remove comments + stop_word = re.sub(r'\s*?#.*?$', '', stop_word) + + stop_word = stop_word.strip() + + if len(stop_word) > 0: + stop_words[stop_word] = True + + self.__stop_words_old_map = stop_words + + return self.__stop_words_old_map diff --git a/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt b/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt index eaf6168385..40abbeb608 100644 --- a/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt @@ -1,10 +1,8 @@ -# -# This is a stop word list for the Catalan language. -# -# Sources: -# https://raw.githubusercontent.com/stopwords-iso/stopwords-ca/master/stopwords-ca.txt +# # Sources: # http://latel.upf.edu/morgana/altres/pub/ca_stop.htm +# https://raw.githubusercontent.com/stopwords-iso/stopwords-ca/master/stopwords-ca.txt # https://www.ranks.nl/stopwords/catalan +# (Lightly edited to remove words in the original lists that are actually meaningful) # a @@ -12,10 +10,12 @@ abans abans-d'ahir abintestat ací -adesiara +açò adàgio adés +adesiara adéu +àdhuc ah ahir ai @@ -28,13 +28,15 @@ aixà així això al +alça aleshores +algú algun alguna algunes alguns -algú alhora +àlies allà allèn allí @@ -48,13 +50,12 @@ altres altresí altri al·legro -alça amargament amb -ambdues ambdós -amunt +ambdues amén +amunt anar anc andante @@ -73,11 +74,11 @@ aquell aquella aquelles aquells +aquèn aquest aquesta aquestes aquests -aquèn aquí ara arran @@ -92,7 +93,6 @@ avall avant aviat avui -açò bah baix baldament @@ -100,17 +100,18 @@ ballmanetes banzim-banzam bastant bastants +bé ben bis bitllo-bitllo bo -bé ca +ça cada +cadascú cadascuna cadascunes cadascuns -cadascú cal cap car @@ -126,10 +127,11 @@ certes certs cinc cinquanta +cinquè cinquena cinquenes cinquens -cinquè +ço com comsevulla 
consegueixo @@ -147,10 +149,10 @@ d'un d'una d'unes d'uns -daixonses daixò -dallonses +daixonses dallò +dallonses dalt daltabaix damunt @@ -160,6 +162,7 @@ davall davant de debades +deçà dedins defora dejorn @@ -167,12 +170,13 @@ dejús del dellà dels -dementre -dempeus demà +dementre demés +dempeus des des de +desè desena desenes desens @@ -180,11 +184,9 @@ després dessobre dessota dessús -desè deu devers devora -deçà diferents dinou dins @@ -217,6 +219,7 @@ emperò en enans enant +ençà encara encontinent endalt @@ -224,9 +227,9 @@ endarrera endarrere endavant endebades -endemig endemà endemés +endemig endins endintre enfora @@ -235,8 +238,8 @@ enguany enguanyasses enjús enlaire -enlloc enllà +enlloc enrera enrere ens @@ -250,65 +253,69 @@ entretant entrò envers envides -environs enviró -ençà +environs ep era erem +érem eren eres +éreu ergo es +és escar essent +éssent esser +ésser est esta +està estada estades estan estant estar +estarà estaran +estaràs +estaré estarem estareu estaria +estaríem estarien estaries -estarà -estaràs -estaré -estaríem estaríeu +estàs estat estats estava +estàvem estaven estaves +estàveu estem estes esteu estic +estigué estiguem +estiguérem estigueren estigueres +estiguéreu estigues +estigués estiguessis estigueu estigui +estiguí estiguin estiguis -estigué -estiguérem -estiguéreu -estigués -estiguí estos -està -estàs -estàvem -estàveu et etc etcètera @@ -325,20 +332,20 @@ feu fi fins fora +fóra +força +fórem foren fores -força +fóreu fos +fóssim fossin fossis +fóssiu fou fra fui -fóra -fórem -fóreu -fóssim -fóssiu gaire gairebé gaires @@ -347,45 +354,47 @@ girientorn gratis ha hagi +hàgim hagin hagis +hàgiu haguda hagudes -hagueren -hagueres -haguessin -haguessis -hagut -haguts hagué haguérem +hagueren +hagueres haguéreu hagués haguéssim +haguessin +haguessis haguéssiu haguí +hagut +haguts hala han has +haurà hauran +hauràs +hauré haurem haureu hauria +hauríem haurien hauries -haurà -hauràs -hauré -hauríem hauríeu havem havent haver haveu 
havia +havíem havien havies -havíem havíeu he hem @@ -394,13 +403,12 @@ hi ho hom hui -hàgim -hàgiu i +ídem igual iguals -inclusive inclòs +inclusive ja jamai jo @@ -444,6 +452,7 @@ mentre mentrestant menys mes +més meu meua meues @@ -466,7 +475,6 @@ molts mon mons mos -més n n'he n'hi @@ -479,35 +487,37 @@ nogensmenys només noranta nos +nós +nòs nosaltres nostra nostre nostres nou +novè novena novenes novens -novè ns -nòs -nós o oh oi oidà +òlim on onsevulga onsevulla onze pas +pàssim pel pels pengim-penjam per per que +però perquè pertot -però piano pla poc @@ -532,7 +542,6 @@ prou puc puix pus -pàssim qual quals qualsevol @@ -552,6 +561,7 @@ quarts quasi quatre que +què quelcom qui quin @@ -560,7 +570,6 @@ quines quins quinze quisvulla -què ran re rebé @@ -596,25 +605,25 @@ sengles sens sense ser +serà seran +seràs +seré serem sereu seria +seríem serien series -serà -seràs -seré -seríem seríeu ses set setanta +setè setena setenes setens setze -setè seu seua seues @@ -622,6 +631,7 @@ seus seva seves si +sí sia siau sic @@ -633,13 +643,14 @@ siguin siguis sinó sis +sisè sisena sisenes sisens -sisè sobre sobretot soc +sóc sol sola solament @@ -647,6 +658,7 @@ soles sols som son +són sons sos sota @@ -654,9 +666,6 @@ sots sou sovint suara -sí -sóc -són t t'ha t'han @@ -712,14 +721,23 @@ u uf ui uix +últim +última +últimes +últims ultra un una unes +únic +única +únics +úniques uns up upa us +ús va vagi vagin @@ -728,54 +746,34 @@ vaig vair vam van +vàreig +vàrem vares +vàreu vas vau vem verbigràcia vers +vés vet veu vint vora vos +vós vosaltres +vostè +vostès vostra vostre vostres -vostè -vostès vuit vuitanta +vuitè vuitena vuitenes vuitens -vuitè -vàreig -vàrem -vàreu -vés -vós xano-xano xau-xau -xec -àdhuc -àlies -ça -ço -érem -éreu -és -éssent -ésser -ídem -òlim -últim -última -últimes -últims -únic -única -únics -úniques -ús +xec \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ca/ca_stop_words_old.txt 
b/apps/common/src/python/mediawords/languages/ca/ca_stop_words_old.txt new file mode 100644 index 0000000000..eaf6168385 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ca/ca_stop_words_old.txt @@ -0,0 +1,781 @@ +# +# This is a stop word list for the Catalan language. +# +# Sources: +# https://raw.githubusercontent.com/stopwords-iso/stopwords-ca/master/stopwords-ca.txt +# http://latel.upf.edu/morgana/altres/pub/ca_stop.htm +# https://www.ranks.nl/stopwords/catalan +# + +a +abans +abans-d'ahir +abintestat +ací +adesiara +adàgio +adés +adéu +ah +ahir +ai +aitambé +aitampoc +aitan +aitant +aitantost +aixà +així +això +al +aleshores +algun +alguna +algunes +alguns +algú +alhora +allà +allèn +allí +allò +almenys +als +alto +altra +altre +altres +altresí +altri +al·legro +alça +amargament +amb +ambdues +ambdós +amunt +amén +anar +anc +andante +andantino +anit +ans +antany +apa +aprés +aqueix +aqueixa +aqueixes +aqueixos +aqueixs +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquèn +aquí +ara +arran +arrera +arrere +arreu +arri +arruix +atxim +au +avall +avant +aviat +avui +açò +bah +baix +baldament +ballmanetes +banzim-banzam +bastant +bastants +ben +bis +bitllo-bitllo +bo +bé +ca +cada +cadascuna +cadascunes +cadascuns +cadascú +cal +cap +car +caram +catorze +cent +centes +cents +cerca +cert +certa +certes +certs +cinc +cinquanta +cinquena +cinquenes +cinquens +cinquè +com +comsevulla +consegueixo +conseguim +conseguir +consigueix +consigueixen +consigueixes +contra +cordons +corrents +cric-crac +d +d'un +d'una +d'unes +d'uns +daixonses +daixò +dallonses +dallò +dalt +daltabaix +damunt +darrera +darrere +davall +davant +de +debades +dedins +defora +dejorn +dejús +del +dellà +dels +dementre +dempeus +demà +demés +des +des de +desena +desenes +desens +després +dessobre +dessota +dessús +desè +deu +devers +devora +deçà +diferents +dinou +dins +dintre +disset +divers +diversa +diverses +diversos +divuit +donat +doncs +dos +dotze 
+dues +durant +e +ecs +eh +el +ela +elis +ell +ella +elles +ells +els +em +emperò +en +enans +enant +encara +encontinent +endalt +endarrera +endarrere +endavant +endebades +endemig +endemà +endemés +endins +endintre +enfora +engir +enguany +enguanyasses +enjús +enlaire +enlloc +enllà +enrera +enrere +ens +ensems +ensota +ensús +entorn +entre +entremig +entretant +entrò +envers +envides +environs +enviró +ençà +ep +era +erem +eren +eres +ergo +es +escar +essent +esser +est +esta +estada +estades +estan +estant +estar +estaran +estarem +estareu +estaria +estarien +estaries +estarà +estaràs +estaré +estaríem +estaríeu +estat +estats +estava +estaven +estaves +estem +estes +esteu +estic +estiguem +estigueren +estigueres +estigues +estiguessis +estigueu +estigui +estiguin +estiguis +estigué +estiguérem +estiguéreu +estigués +estiguí +estos +està +estàs +estàvem +estàveu +et +etc +etcètera +ets +excepte +fa +faig +fan +fas +fem +fer +fer faig +feu +fi +fins +fora +foren +fores +força +fos +fossin +fossis +fou +fra +fui +fóra +fórem +fóreu +fóssim +fóssiu +gaire +gairebé +gaires +gens +girientorn +gratis +ha +hagi +hagin +hagis +haguda +hagudes +hagueren +hagueres +haguessin +haguessis +hagut +haguts +hagué +haguérem +haguéreu +hagués +haguéssim +haguéssiu +haguí +hala +han +has +hauran +haurem +haureu +hauria +haurien +hauries +haurà +hauràs +hauré +hauríem +hauríeu +havem +havent +haver +haveu +havia +havien +havies +havíem +havíeu +he +hem +heu +hi +ho +hom +hui +hàgim +hàgiu +i +igual +iguals +inclusive +inclòs +ja +jamai +jo +l +l'hi +la +leri-leri +les +li +li'n +lla +llarg +llavors +llevat +lluny +llur +llurs +lo +los +ls +m +m'he +ma +mai +mal +malament +malgrat +manco +mant +manta +mantes +mantinent +mants +massa +mateix +mateixa +mateixes +mateixos +me +mentre +mentrestant +menys +mes +meu +meua +meues +meus +meva +meves +mi +mig +mil +mitges +mitja +mitjançant +mitjos +mode +moixoni +molt +molta +moltes +molts +mon +mons +mos +més +n +n'he +n'hi +na +ne +ni 
+ningú +no +nogensmenys +només +noranta +nos +nosaltres +nostra +nostre +nostres +nou +novena +novenes +novens +novè +ns +nòs +nós +o +oh +oi +oidà +on +onsevulga +onsevulla +onze +pas +pel +pels +pengim-penjam +per +per que +perquè +pertot +però +piano +pla +poc +poca +pocs +podem +poden +poder +podeu +poques +potser +prest +primer +primera +primeres +primers +pro +prompte +prop +propi +prou +puc +puix +pus +pàssim +qual +quals +qualsevol +qualsevulla +qualssevol +qualssevulla +quan +quant +quanta +quantes +quants +quaranta +quart +quarta +quartes +quarts +quasi +quatre +que +quelcom +qui +quin +quina +quines +quins +quinze +quisvulla +què +ran +re +rebé +renoi +rera +rere +res +retruc +s +s'ha +s'han +sa +sabem +saben +saber +sabeu +salvament +salvant +salvat +sap +saps +se +segon +segona +segones +segons +seguida +seixanta +semblant +semblants +sempre +sengles +sens +sense +ser +seran +serem +sereu +seria +serien +series +serà +seràs +seré +seríem +seríeu +ses +set +setanta +setena +setenes +setens +setze +setè +seu +seua +seues +seus +seva +seves +si +sia +siau +sic +siguem +sigues +sigueu +sigui +siguin +siguis +sinó +sis +sisena +sisenes +sisens +sisè +sobre +sobretot +soc +sol +sola +solament +soles +sols +som +son +sons +sos +sota +sots +sou +sovint +suara +sí +sóc +són +t +t'ha +t'han +t'he +ta +tal +tals +també +tampoc +tan +tanmateix +tant +tanta +tantes +tantost +tants +te +tene +tenim +tenir +teniu +tercer +tercera +terceres +tercers +tes +teu +teua +teues +teus +teva +teves +tinc +ton +tons +tos +tost +tostemps +tot +tota +total +totes +tothom +tothora +tots +trenta +tres +tret +tretze +tu +tururut +u +uf +ui +uix +ultra +un +una +unes +uns +up +upa +us +va +vagi +vagin +vagis +vaig +vair +vam +van +vares +vas +vau +vem +verbigràcia +vers +vet +veu +vint +vora +vos +vosaltres +vostra +vostre +vostres +vostè +vostès +vuit +vuitanta +vuitena +vuitenes +vuitens +vuitè +vàreig +vàrem +vàreu +vés +vós +xano-xano +xau-xau +xec +àdhuc +àlies +ça +ço +érem 
+éreu +és +éssent +ésser +ídem +òlim +últim +última +últimes +últims +únic +única +únics +úniques +ús diff --git a/apps/common/src/python/mediawords/languages/da/da_stop_words.txt b/apps/common/src/python/mediawords/languages/da/da_stop_words.txt index 220a35602a..ea271bda3a 100644 --- a/apps/common/src/python/mediawords/languages/da/da_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/da/da_stop_words.txt @@ -1,16 +1,20 @@ -# -# This is a stop word list for the Danish language. -# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-da/blob/master/stopwords-da.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) ad af +aldrig alle alt anden +andet +andre at +bare +begge blev blive bliver @@ -25,77 +29,148 @@ det dette dig din +dine disse +dit dog du efter +ej eller en end +ene +eneste +enhver er et +få +far +får +fem +fik +fire +flere +fleste for +før +fordi +forrige fra +god +godt ham han hans har havde have +hej +helt hende hendes her hos hun hvad +hvem +hver +hvilken hvis hvor +hvordan +hvorfor +hvornår i ikke ind +ingen +intet +ja jeg jer +jeres jo +kan +kom +komme +kommer +kun kunne +lad +lav +lidt +lige +lille +må man +mand mange med meget men +mens +mere mig min mine mit mod +når +nær +næste +næsten ned +nej +ni +nogen noget nogle nu -når +ny +nyt og også +okay om op os +otte over på +så +sådan +se +seks selv +ser +ses sig +sige sin sine sit skal skulle som -sådan +stor +store +syv +tag +tage thi +ti til +to +tre ud under var +være +været +ved vi vil ville vor -være -været +vores \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/da/da_stop_words_old.txt b/apps/common/src/python/mediawords/languages/da/da_stop_words_old.txt new file mode 100644 index 0000000000..220a35602a --- /dev/null +++ 
b/apps/common/src/python/mediawords/languages/da/da_stop_words_old.txt @@ -0,0 +1,101 @@ +# +# This is a stop word list for the Danish language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +ad +af +alle +alt +anden +at +blev +blive +bliver +da +de +dem +den +denne +der +deres +det +dette +dig +din +disse +dog +du +efter +eller +en +end +er +et +for +fra +ham +han +hans +har +havde +have +hende +hendes +her +hos +hun +hvad +hvis +hvor +i +ikke +ind +jeg +jer +jo +kunne +man +mange +med +meget +men +mig +min +mine +mit +mod +ned +noget +nogle +nu +når +og +også +om +op +os +over +på +selv +sig +sin +sine +sit +skal +skulle +som +sådan +thi +til +ud +under +var +vi +vil +ville +vor +være +været diff --git a/apps/common/src/python/mediawords/languages/de/de_stop_words.txt b/apps/common/src/python/mediawords/languages/de/de_stop_words.txt index aad240c48c..57a23fd1e9 100644 --- a/apps/common/src/python/mediawords/languages/de/de_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/de/de_stop_words.txt @@ -1,16 +1,27 @@ -# -# This is a stop word list for the German language. 
-# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-de/blob/master/stopwords-de.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) +a +ab aber +ach +acht +achte +achten +achter +achtes +ag alle +allein allem allen aller +allerdings alles +allgemeinen als also am @@ -25,19 +36,65 @@ anderm andern anderr anders +au auch auf aus +ausser +ausserdem +außer +außerdem +b +bald bei +beide +beiden +beim +beispiel +bekannt +bereits +besonders +besser +besten bin bis +bisher bist +c +d +d.h da +dabei +dadurch +dafür +dagegen +daher +dahin +dahinter +damals damit +danach +daneben +dank dann +daran +darauf +daraus +darf +darfst +darin +darüber +darum +darunter das +dasein +daselbst +dass dasselbe +davon +davor dazu +dazwischen daß dein deine @@ -46,19 +103,34 @@ deinen deiner deines dem +dementsprechend +demgegenüber +demgemäss +demgemäß demselben +demzufolge den +denen denn denselben der +deren derer +derjenige +derjenigen +dermassen +dermaßen derselbe derselben des +deshalb desselben dessen +deswegen dich die +diejenige +diejenigen dies diese dieselbe @@ -70,9 +142,32 @@ dieses dir doch dort +drei +drin +dritte +dritten +dritter +drittes du durch +durchaus +dürfen +dürft +durfte +durften +e +eben +ebenso +ehrlich +ei +ei, +eigen +eigene +eigenen +eigener +eigenes ein +einander eine einem einen @@ -85,8 +180,21 @@ einigen einiger einiges einmal +eins +elf +en +ende +endlich +entweder er +ernst +erst +erste +ersten +erster +erstes es +etwa etwas euch euer @@ -95,18 +203,75 @@ eurem euren eurer eures +f +folgende +früher +fünf +fünfte +fünften +fünfter +fünftes für +g +gab +ganz +ganze +ganzen +ganzer +ganzes +gar +gedurft gegen +gegenüber +gehabt +gehen +geht +gekannt +gekonnt +gemacht +gemocht +gemusst +genug +gerade +gern +gesagt +geschweige gewesen +gewollt +geworden +gibt +ging +gross +grosse +grosser +grosses 
+große +großer +großes +gut +guter +gutes +h hab habe haben +habt +hast hat hatte +hätte hatten +hätten +hattest +hattet +heisst +her +heute hier hin hinter +hoch +i ich ihm ihn @@ -118,78 +283,219 @@ ihren ihrer ihres im +immer in indem +infolgedessen ins +irgend ist +j +ja +jahr +jahre +jahren +je jede jedem jeden jeder +jedermann +jedermanns jedes +jedoch +jemand +jemandem +jemanden jene jenem jenen jener jenes jetzt +k +kam kann +kannst +kaum kein keine keinem keinen keiner keines +kleine +kleinen +kleiner +kleines +kommen +kommt können +könnt +konnte könnte +konnten +kurz +l +lang +lange +leicht +leide +lieber +los +m machen +macht +machte +mag +magst +mahn +mal man manche manchem manchen mancher manches +mann +mehr mein meine meinem meinen meiner meines +mensch +menschen mich mir mit +mittel +mochte +möchte +mochten +mögen +möglich +mögt +morgen muss +müssen +musst +müsst musste +mussten +muß +mußt +müßt +n +na nach +nachdem +nahm +natürlich +neben +nein +neue +neuen +neun +neunte +neunten +neunter +neuntes nicht nichts +nie +niemand +niemandem +niemanden noch nun nur +o ob +oben oder +offen +oft ohne +p +q +r +recht +rechte +rechten +rechter +rechtes +s +sa +sache +sagt +sagte +sah +schlecht +schluss +sechs +sechste +sechsten +sechster +sechstes sehr +sei +seid +seien sein seine seinem seinen seiner seines +seit +seitdem selbst sich sie +sieben +siebente +siebenten +siebenter +siebentes sind so +solang solche solchem solchen solcher solches soll +sollen +sollst +sollt sollte +sollten sondern sonst +soweit +sowie +später +startseite +statt +steht +suche +t +tag +tage +tagen +tat +teil +tel +tritt +trotzdem +tun +u +über +überhaupt +übrigens um und uns @@ -197,42 +503,111 @@ unse unsem unsen unser +unsere +unserer unses unter +v +vergangenen viel +viele +vielem +vielen +vielleicht +vier +vierte +vierten +vierter +viertes vom von vor +w +wahr +während +währenddem +währenddessen +wann war +wäre waren warst +wart +warum was weg +wegen weil +weit weiter +weitere 
+weiteren +weiteres welche welchem welchen welcher welches +wem +wen +wenig +wenige +weniger +weniges +wenigstens wenn +wer werde werden +werdet +weshalb +wessen wie wieder +wieso will +willst wir wird +wirklich wirst wo +woher +wohin +wohl wollen +wollt wollte -während +wollten +worden +wurde würde +wurden würden +x +y +z +z.b +zehn +zehnte +zehnten +zehnter +zehntes +zeit zu +zuerst +zugleich zum +zunächst zur +zurück +zusammen +zwanzig zwar +zwei +zweite +zweiten +zweiter +zweites zwischen -über +zwölf \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/de/de_stop_words_old.txt b/apps/common/src/python/mediawords/languages/de/de_stop_words_old.txt new file mode 100644 index 0000000000..aad240c48c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/de/de_stop_words_old.txt @@ -0,0 +1,238 @@ +# +# This is a stop word list for the German language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +das +dasselbe +dazu +daß +dein +deine +deinem +deinen +deiner +deines +dem +demselben +den +denn +denselben +der +derer +derselbe +derselben +des +desselben +dessen +dich +die +dies +diese +dieselbe +dieselben +diesem +diesen +dieser +dieses +dir +doch +dort +du +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +es +etwas +euch +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +ihm +ihn +ihnen +ihr +ihre +ihrem +ihren +ihrer +ihres +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem 
+meinen +meiner +meines +mich +mir +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +während +würde +würden +zu +zum +zur +zwar +zwischen +über diff --git a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt index eec3311701..b69d36a7f3 100644 --- a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt @@ -1,1399 +1,321 @@ -# # This is a "long" stop word list for the English language. # # Sources: # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# http://www.lextek.com/manuals/onix/stopwords1.html +# http://xpo6.com/list-of-english-stop-words/ +# https://countwordsfree.com/stopwords +# https://gist.github.com/sebleier/554280 (NLTK stop words) # https://github.com/arc12/Text-Mining-Weak-Signals/wiki/Standard-set-of-english-stopwords +# https://github.com/berkmancenter/mediacloud-sentence-splitter/blob/develop/sentence_splitter/non_breaking_prefixes/en.txt +# https://github.com/stopwords-iso/stopwords-en/blob/master/stopwords-en.txt # https://www.link-assistant.com/seo-stop-words.html -# some English non-breaking prefixes -# +# https://www.ranks.nl/stopwords +# (Lightly edited to remove words in the original lists that are actually meaningful) +'ll +'tis +'twas +'ve +10 +39 +A +Adj +Adm +Adv +Apr +Art +Asst +Aug +B +Bart +Bldg +Brig +C +Capt +Cmdr +Co +Col +Comdr +Con +Corp +Cpl +D +DR +Dec +Dr +Drs +E +Ens +F +Feb +Fig +G +Gen +Gov +H +Hon +Hosp +Hr +I +I'd +I'll +I'm +I've +Inc +Insp +J 
+Jan +Jr +Jul +Jun +K +L +Lt +M +MM +MR +MRS +MS +Maj +Mar +Messrs +Mlle +Mme +Mr +Mrs +Ms +Msgr +N +No +Nos +Nov +Nr +O +Oct +Okt +Op +Ord +P +Pfc +Ph +Ph.D +PhD +Prof +Pvt +Q +R +Rep +Reps +Res +Rev +Rt +S +Sen +Sens +Sep +Sept +Sfc +Sgt +Sr +St +Supt +Surg +T +U +V +W +X +Y +Z a a's -abandoned -abbr -ability able -aboard +ableabout about above -abroad -absence -absent -absolute -absolutely -absorbed -abstract -academic -accept -acceptable -acceptance -accepted -accepting -access -accident -accompanied -accomplish -accomplished +abst accordance according accordingly -account -accounts -accuracy -accurate -accurately -accused -achieve -achieved -achievement -achievements -acquire -acquired across act -acting -action -actions -active -activities -activity -actor -acts -actual actually -ad -add added -adding -addition -additional -address -addressed -addresses -adequate adj -adjusted -adjustment -adjustments -adm -administration -admission -admit -admitted -adopted -adult -adults -adv -advance -advanced -advantage -advantages -advertising -advice -advised -aesthetic -affair -affairs -affect affected +affecting affects -afford -afraid after -afternoon afterwards again against -age -agencies -agency -agent -agents -ages ago -agree -agreed +ah ahead -aid -aids -aim -aimed ain't -air -al -alert -alienation -align -alike -alive +aint all -alliance -allied -allies -allotment -allow -allowances -allowed -allowing -allows almost alone along alongside already also -altered -alternative although -altogether always am -amazing -ambiguous -amendment amid amidst among amongst +amoungst amount -amounts an -analysis -ancient and -anger -angle -angry -animal -animals -anniversary -announced -announcement -annual -anode +announce another -answer -answered -answers -anti-Semitism -anti-trust -anticipated -anticipation -anxiety -anxious any anybody anyhow +anymore anyone anything anyway anyways anywhere apart -apartment -apparatus -apparent apparently -appeal appear -appearance 
-appeared -appears -apple -application -applications -applied -apply -applying -appointed -appointment -appreciate -appreciation -approach -approached -approaches -approaching -appropriate -approval -approved approximately -apr -april -arbitrary -arc -architect are area areas aren aren't -argue -argued -argument +arent arise -arm around -aroused -arrange -arranged -arrangement -arrangements -arrest -arrested -arrival -arrive -arrived -art -artery -article -articles -artist -artistic -artists -arts as aside ask asked asking asks -asleep -aspect -aspects -assembled -assessment -assessors -assigned -assignment -assist -assistance -assistant -associate associated -association -asst -assume -assumed -assumption -assumptions -assure -assured -astronomy at -atmosphere -atom -atomic -atoms -attached -attack -attacked -attacks -attain -attempt -attempted -attempting -attempts -attend -attended -attending -attention -attitude -attitudes -attorney -attract -attracted -attractive -audience -aug -august -aunt -authentic -author -authorities -authority -authorized -authors -auto -automatic -automatically -automobile -automobiles -autumn -availability +auth available -average -avoid -avoided -awake -award -aware -awareness away awfully -axis b -baby -back -backed -background -backing -backs -backward -backwards -bad -badly -bag -balance -balanced -ball -band -bank -banks -bar -bare -barely -barn -barrel -bars -bart -base -baseball -based -basement -bases -basic -basically -basis -bat -bath -battle -bay be -beach -bear -beard -bearing -beat -beautiful -beauty became because become becomes becoming -bed -bedroom -beef been -beer before beforehand -began -begin -beginning -begins -begun -behalf -behavior -behind being beings -belief -beliefs believe -believed -believes -bell -belly -belong -belongs below -bench -beneath -benefit -benefits -bent beside besides best -bet better between beyond -bgcolor -bid -big -bigger -biggest -bill -billion -bills -binding -binomial -biological 
-bird -birds -birth -bit -bitter -black -blame -blanket -bldg -blind -block -blockquote -blocks -blog -blonde -blood -blow -blue -blues -board -boards -boat -boating -boats -bod -bodies -body -bold -bomb -bombs -bond -bonds -bone -bones -book -books -border -bore -born -boss both -bother -bottle -bottom -bought -bound -box -boy -boys -br -branch -branches -brave -bread -break -breakfast -breaking -breath -breathing -brick -bride -bridge -bridges -brief -briefly -brig -bright -brilliant -bring -bringing -brings -broad -broke -broken -bronchial -bros -brother -brought -brown -browser -brush -brushed -budget -build -builder -building -buildings -built -bullet -bullets -bundle -burden -bureau -burn -burned -burning -burns -burst -bus -business -businesses -busy but -butter buy -buying by c c'mon c's -cafe -calculated -calendar call -called -calls -calm came -camera -camp can can't -candidate -candidates cannot cant -cap -capabilities -capable -capacity -capital -capt -captain caption -car -carbon -card -care -career -careful -carefully -carried -carries -carry -carrying -cars case cases -cash -cast -casual -cat -catch -categories -category -cattle -caught cause -caused causes -cdt -ceiling -cell -cellar -cells -cent -center -centers -central -cents -centuries -century certain certainly -certainty -cf -chain -chair -chairman -chairs -challenge -champion -chance -chances -change -changed -changes -changing -channels -chapel -char -character -characteristic -characteristics -characterized -characters -charge -charged -charges -charm -charming -charoff -chart -charter -cheap -check -checked -cheek -chemical -chest -chick -chicken -chief -chiefly -chlorine -choice -cholesterol -choose -chord -chose -chosen -church -churches -cigarette -circle -circles -circular -circumstances -cite -cited -cities -citizen -citizens -city -civic -civil -civilian -civilization -claim -claimed -claims -clarity -class -classes -classic -classical -classification -clean -cleaning -clear -cleared 
-clearly -clerk -click -climb -climbed -clinical -clock -close -closed -closely -closer -closing -cloth -clothe -clothes -clothing -cloud -clouds -club -cmdr +cmon co co. -coach -coast -coat -coating -cocktail -code -coffee -col -cold -colleagues -collect -collected -collection -collective -colonel -colonial -colony -color -colored -colorful -colors -column -columns com -combat -combination -combined -comdr come -comedy comes -comfort -comfortable -coming -command -commander -comment -comments -commerce -commercial -commissioner -committed -committee -commodities -common -commonly -communication -communications -communism -communities -community -companies -companion -company -comparable -compare -compared -comparison -compete -competition -competitive -complained -complement -complete -completed -completely -completion -complex -complicated -component -components -composed -composer -composition -compromise -computed -con -conceived -concentrated -concentration -concept -conception -concepts -concern -concerned concerning -concerns -concert -concerts -concluded -conclusion -conclusions -concrete -condemned -condition -conditioned -conditions -conduct -conducted -conductor -conference -conferences -confidence -confirmed -conflict -confronted -confused -confusion -congressional -connect -connected -connection -conscience -conscious -consciousness -consequence -consequences consequently consider -considerable -considerably -consideration -considerations -considered considering -consisted -consistent -consistently -consisting -consists -consonant -conspiracy -constant -constantly -constitute -constitutional -constructed -construction -consumer -contact -contacts contain -contained containing contains -contemporary -content -contest -context -continent -continually -continue -continued -continues -continuing -continuity -continuous -continuously -contract -contracts -contrary -contrast -contribute -contributed -contributions -control -controlled -controlling -controls 
-controversy -convenience -convenient -conventional -conversation -conversion -converted -conviction -convictions -convinced -cook -cooking -cool -cooling -cooperative -cope -copy -core -corn -corner -corp -correct -correspondence corresponding -cost -costs -cottage -cotton could +could've +couldn couldn't -council -count -counter -counties -countries -country -county -couple -courage -course -courses -court -courts -cousin -cover -coverage -covered -covering -covers -cow -cpl -crack -craft -crash -crawled -crazy -cream -crease -create -created -creating -creation -creative -creatures -credit -crew -critic -critical -criticism -critics -crop -cross -crossed -crossing -crowd -crowded -crown -crucial +couldnt cry -cst -cultural -culture -cure -curiosity -curious -current currently -curt -curve -customer -customers -cut -cuts -cutting -cycle d -D -dad -daily -damage -damn -dance -dancer -dancers -dances -dancing -danger -dangerous dare daren't -dark -darkness -data +darent date -dates -datetime -daughter -dawn -day -days -dead -deal -dealer -dealers -dealing -dealt dear -death -debate -dec -decade -decades -december -decent -decide -decided -decimal -decision -decisions -deck -declaration -declared -decline -dedicated -dedication -deep -deeper -deeply -defeat -defend -defense -define -defined -definite definitely -definition -degree -degrees -del -delay -delayed -deliberately -delicate -delight -delightful -delivered -delivery -demand -demanded -demanding -demands -democracy -demonstrate -demonstrated -demonstration -denied -density -deny -department -departments -depend -dependent -depending -depends -depression -depth -derived -describe -described -describes -description -desegregation -desert -design -designed -designs -desirable -desire -desired -desires -desk -despair -desperate -desperately despite -destiny -destroy -destroyed -destruction -destructive -detail -detailed -details -detergent -determination -determine -determined -determining -develop -developed 
-developing -development -developments -device -devices -devil -devoted -diameter -dictionary did didn didn't -die -died -diet +didnt differ -difference -differences different differently -difficult -difficulties -difficulty -diffusion -dignity -dilemma -dimensions -dining -dinner -diplomatic -dir -direct -directed -direction -directions directly -director -directors -dirt -dirty -disappeared -disaster -discharge -discipline -discuss -discussed -discussion -discussions -disease -dishes -disk -displacement -display -displayed -displays -disposal -dispute -distance -distant -distinct -distinction -distinctive -distinguished -distributed -distribution -districts -disturbed -div -dive -divide -divided -divine -division -divorce do -doctor -doctors -doctrine -documents does doesn doesn't -dog -dogs +doesnt doing -dollar -dollars -domestic -dominant -dominated don don't done -door -doors -double -doubt -doubtful -down -downed -downing -downs -downtown -downwards -dozen -dr -draft -drama -dramatic -drank -draw -drawing -drawings -drawn -dream -dreamed -dreams -dress -dressed -dressing -drew -dried -drill -drink -drinking -drinks -drive -driven -driver -drivers -driving -drop -dropped -drove -drs -drug -drugs -drunk -dry -drying -duck +dont due -dull during -dust -duties -duty -dying -dynamic e +e.g each -eager -ear -earlier -earliest -early -earnings -ears -earth -ease -easier -easily -east -easy -eat -eating -ecumenical -edge -edges -edition -editor -editorial -edt -edu effect -effective -effectively -effectiveness -effects -efficiency -efficient -effort -efforts eg -egg -eggs eight -eighteenth -eighth eighty either -elaborate -elected -electric -electrical -electricity -electron -electronic -electronics -element -elements eleven -eliminate -eliminated else elsewhere -em -email -emerged -emergency -emission -emphasis -emphasize -empirical -employed -employee -employees -employment -empty -enable -encounter -encountered -encourage -encouraged -encouraging -end -ended 
-ending -endless -ends -enemies -enemy -energy -enforced -enforcement -engaged -engagement -engine -engineer -engineering -engineers -enjoy -enjoyed -enjoyment -enormous enough -ens -enter -entered -entering -enterprise -entertainment -enthusiasm -enthusiastic -entire entirely -entitled -entrance -entries -entry -envelope -environment -equal -equally -equate -equation -equipment -equipped -equivalent -era -error -errors -escape -esp especially -essential -essentially -est -establish -established -establishing -establishment -estate -estimate -estimated -estimates et +et-al etc -etc. -eternal -ethical -ethics -evaluation -eve even -evening evenly -event -events -eventually ever evermore every @@ -1401,2775 +323,449 @@ everybody everyone everything everywhere -evidence -evident -evidently -evil -ex -exact exactly -examination -examine -examined example -examples -excellent except -exception -exceptions -excess -excessive -exchange -excite -excited -excitement -exciting -exclusive -exclusively -excuse -executive -exercise -exercises -exhibit -exhibition -exist -existed -existence -existing -exists -expanded -expanding -expansion -expect -expectations -expected -expects -expenditures -expense -expenses -expensive -experience -experienced -experiences -experiment -experimental -experiments -expert -experts -explain -explained -explains -explanation -explicit -exploration -exposed -exposure -express -expressed -expressing -expression -extend -extended -extending -extension -extensive -extent -extra -extraordinary -extreme -extremely -eye -eyes f -fabrics -face -faced -faces -facilities -facing -fact -factor -factories -factors -factory -facts -faculty -fail -failed -failure -faint -fair -fairly -faith -fall -fallen -falling -fallout -familiar -family -famous -fans -fantastic -far -farm -farmer -farmers -farther -fascinating -fashion -fast -fat -fate -father -fathers -fault -favor -favorable -favorite -fear -fears -feature -features -feb -february -fed -federal -feed 
-feel -feeling -feelings -feels -fees -feet -fell -fellow -felt -female -fence -festival -few -fewer -fiber -fibers -fiction -field -fields fifteen fifth fifty -fig -fight -fighting -figure -figured -figures -file -filed -filing -fill -filled -filling -film -films -final -finally -finance -financial -financing find -finding -findings -finds -fine -finger -fingers -finish -finished -fire -fired -firing -firm -firmly -firms first -fiscal -fish -fishing -fist -fit -fitted five -fixed -flagicon -flash -flat -fled -flesh -flew -flexible -flight -floor -flow -flower -flowers -fluid -flux -fly -flying -foam -foams -focus -fog -folk -folks -follow -followed -following -follows -font -food -foods -fool -foot -football for -force -forced -forces -foreign -forest -forests forever -forget -forgive -form -formal -formation -formed -former -formerly -forming -forms -formula -formulas -fort forth -fortune -forum +forty forward -fought -found -founded four -fourteen -fourth -fraction -fractions -frame -frames -free -freight -frequencies -frequency -frequent -frequently -fresh -friday -friend -friendly -friends -friendship -frightened from -front -frozen -fruit -ft. 
full -full-time fully -fun -function -functional -functions -fund -funds -furnish -furnished -furniture further furthered furthering furthermore furthers -future g -gain -gained -gains -game -games -gang -garage -garden -gardens -gas -gather -gathered -gathering gave -gay -gear -gen general generally -generation -generations -generous -genius -gentle -gentleman -gentlemen -gently -genuine -gesture get gets getting -giant -gift -gin -girl -girls give given gives giving -glad -glance -glanced -glass -glasses -glory +gmt go -goal -goals goes going -gold -golden -golf gone -good -goods -gorton got gotten -gov -govern -governing -government -governmental -governments -governor -grabbed -grade -grades -gradually -graduate -grain -grains -grand -grant -granted -grants -grass -grateful -grave -gray -great -greater -greatest -greatly -green -greeted -greetings -grew -grinned -grip -gross -ground -grounds -group -grouped -grouping -groups -grow -growing -grown -grows -growth -guards -guess -guest -guests -guidance -guide -guided -guilt -guilty -gun -guns -guy -guys -gyro h -habit -habits had hadn't -hair +hadnt half -halign -hall -ham -hand -handed -handle -handled -handling -hands -hang -hanging -happen -happened -happening happens -happily -happiness -happy -hard -harder hardly -harm -harmony has hasn hasn't -hat -hate -hated -hatred +hasnt have +haven haven't +havent having he he'd he'll he's -head -headed -heading -headquarters -heads -health -healthy -hear -heard -hearing -heart -hearts -heat -heaven -heavily -heavy -heels -height -heights -held -hell +hed hello -help -helped -helpful -helping -helpless -helps -hen hence her -herd here here's hereafter hereby herein +heres hereupon -heritage -hero -heroic hers +herse +herse" herself -hesitated +herse” +hes hi -hidden -hide -high -higher -highest -highly -hill him +himse +himse" himself -hired +himse” his -historian -historians -historic -historical -history -hit hither -hits -hold -holder -holding -holds -hole -holes 
-hollywood home -homes -hon -honest -honey -honor -honored -hope -hoped +homepage hopefully -hopes -hoping -horizon -horse -horses -hosp -host -hot -hotels -hour -hours -house -household -houses -housing how +how'd +how'll how's howbeit however -hr -href +htm html http -huge -human -humanity hundred -hundreds -hung -hungry -hunt -hunting -hurried -hurry -husband -hydrogen -hypothalamic i -I i'd i'll i'm i've -ice -idea -ideal -ideas -identical -identification -identified -identify -identity -ideological +i.e +i.e. ie if -ignored -illness -illusion -illustration -image -images -imagination -imagine -imagined -imitation -immediate -immediately -impact -implications -importance -important -imposed -impossible -impressed -impression -impressions -impressive -improve -improved -improvement -improvements -impulse +ill in -inadequate inasmuch inc inc. -inch -inches -incident -inclined -include -included -includes -including -income -increase -increased -increases -increasing -increasingly -incredible indeed -index indicate indicated indicates -indication -indirect -individual -individuals -industrial -industry -inevitable -inevitably -influence -information -informed -inherent -initial -initiative -injured -injury -inner -innocence -innocent -input -insect -insects -inside -insight -insist -insisted insofar -insp -inspection -inspired -installed -instance -instances -instant instead -institution -institutions -instruction -instructions -instrument -instruments -insurance -insure -integration -intended -intense -intensity -intention -intentions -interest -interested -interesting -interests -interference -interior -internal -international -interpretation -interpreted -intervals -intervention -interview -interviews -intimate into -introduced -introduction -invariably -invent -invention -inventory -investigation -investigations -investment -invited -involve -involved -involves -involving -inward -iron is -island isn isn't -isolated -issue -issued -issues +isnt it it'd it'll 
it's -item -items +itd +itll its +itse" itself +itse” +ive j -jacket -jail -jan -january -jazz -jet -job -jobs join -joined -joint -joke -journey -joy -jr -judges -judgment -judgments -jul -july -jump -jumped -jun -june -jungle -junior -juniors just -justice -justified -justify k keep -keeping keeps kept -key -keys -kid -kids -kill -killed -killer -killing kind -kinds -king -kingdom -kitchen -knee -knees knew -knife -knocked know -knowing -knowledge known knows l -la -label -laboratory -labour -lack -lacked -lacking -ladder -ladies -lady -laid -lake -land -landing -lands -landscape -lang -language -languages -large largely -larger -largest last -late lately later -latest latter latterly -laugh -laughed -laughing -laughter -launched -law -laws -lawyer -lawyers -lay -lb. -lead -leaders -leadership -leading -leads -lean -leaned -leaped -learn -learned -learning -least -leather -leave -leaves -leaving -led -left -leg -legal -legend -legislation -legislative -legislators -legs -length -lengths less -lesson lest let let's lets -letter -letters -letting -level -levels -liberal -liberty -libraries -library -license -lid -lie -lies -lieutenant -life -lift -lifted -light -lighted -lighting -lightly -lights like liked likely -likes likewise -limit -limitations -limited -limits line -linear -lines -link -link-en -lips -liquid -liquor -list -listed -listen -listened -listeners -listening -lists -literally -literary -literature -little -live -lived -lively -lives -livestock -living -lo -load -loaded -loan -loans -lobby -local -locate -located -location -lock -locked -locking -log -logical -lone -lonely -long -long-range -long-term -longer -longest look -looked looking looks -loop -loose -lose -losing -loss -losses -lost -lot -lots -loud -love -low -lower -lowered -loyalty -lt ltd -luck -lucky -lumber -lunch -luncheon -lungs -luxury -lying m -ma -machine -machinery -machines -mad made -magazine -magazines -magic -magnet -magnetic -magnificent -magnitude -maid -mail -main mainly 
-maintain -maintained -maintaining -maintenance -maj -major -majority make -makers makes making -male -males -man -manage -managed -management -manager -managers -mankind -manner -manufacturer -manufacturers -manufacturing many -map -mar -marble -march -marginal -mark -marked -market -marketing -markets -marks -marriage -marriages -married -marshall -mass -masses -massive -master -match -matching -mate -material -materials -mathematical -mathematics -matter -matters -mature -maturity -maximum may maybe mayn't -mdt +maynt me -meal -meals mean -meaning -meaningful -meanings means -meant meantime meanwhile -measure -measured -measurement -measures -measuring -meat -mechanical -mechanism -medical -medicine -medium -meet -meeting -meetings -meets -melody -melting member members -membership -memory men -mental -mention -mentioned -merchant -merchants -mere merely -merger -merit -mess -message -messrs -met -meta -metal -method -methods -mg -middle -middle-class -midnight might +might've mightn't -mighty -mile -miles -military -milk -milligrams -million -millions -mind -minds +mightnt mine -mines -minimal -minimum -minor -minority minus -minute -minutes -mirror miss -missed -missile -missing -mission -mistake -mix -mixed -mixture -mlle -mm -mme -mobile -mode -model -moderate -modern -modest -mold -molecule -moment -monday -money -month -monthly -months -monument -mood -moon -moral -morality more moreover -morning most mostly -mother -mothers -motion -motive -motives -motor -mount -mountain -mounted -mouth move -moved -movement -movements -moves -movie -movies -moving mr mrs -ms -msgr -mss -mst much -mud -multiple -multiply -municipal -murder -muscle -muscles -music -musical -musician -musicians must -mustard +must've mustn't -mutual +mustnt my +myse" myself -mysterious -mystery -myth +myse” n -naked name -named namely -names -narrative -narrow -nation -national -natural -naturally -nature -naval -nd +nay near -nearby -nearest nearly -neat necessarily necessary -necessity 
-neck -need -needed -needing needn't -needs -negative -negotiations -neighbor -neighborhood -neighboring -neighbors +neednt neither -nerves -nervous -nest -net -network -neutral never neverf neverless nevertheless -new -newer -newest -newly -news -newspaper -newspapers next -nice -night -nights nine -nineteenth ninety -ninth no no-one -noble nobody -nodded -noise non none nonetheless -noon noone nor -normal -normally -norms -north -nose +nos not -notable -note noted -notes nothing -notice -noticed -notion notwithstanding -noun -nov -novel -novels -november now nowhere -nude +null number numbers -numeral -numerous -nuts o -object -objective -objectives -objects -obligations -obliged -observation -observations -observe -observed -observers obtain obtained -obvious obviously -occasion -occasional -occasionally -occasions -occupation -occupied -occur -occurred -occurrence -occurring -occurs -ocean -oct -october -odd of off -offer -offered -offering -offers -office -officer -officers -offices -official -officials often oh -oil ok okay -okt -old -older -oldest on once one one's ones only -onset onto -op -open -opened -opening -openly opens -operate -operated -operating -operation -operational -operations -operator -opportunities -opportunity -opposed -opposite -opposition -optimal or -oral -orchestra -ord -order -ordered -ordering -orderly -orders -organ -organic -organization -organizations -organized -origin -original -originally other others otherwise ought oughtn't +oughtnt our ours ourselves out -outcome -outdoor -outlook -output -outside -outstanding over -over-all overall -overcome -overseas -overwhelming +owing own -owned -owner -owners -ownership -oxidation -oxygen p -pace -pack -package -packed -page -pages -paid -pain -painful -paint -painted -painter -painting -paintings -pair -pale -panel -panels -panic -paper -papers -parade -paragraph -parallel -parent -parents -parked -parking -parks part -part-time parted -partially -participate -participation -particle 
-particles particular particularly -parties parting -partisan -partly -partner parts -party -pass -passage -passages -passed -passenger -passengers -passes -passing -passion past -patent -path -pathology -patience -patient -patients -patrol -pattern -patterns -pause -paused -pay -paying -payment -payments -pdt -peace -peaceful -peas -peculiar -peered -pencil -penny -people -peoples per -percent -percentage -perception -perfect -perfectly -perform -performance -performances -performed perhaps -period -periods -permanent -permission -permit -permits -permitted -person -personal -personality -personally -personnel -persons -perspective -persuaded -pertinent -petitioner -pfc -ph -ph.d -phase -phases -phd -phenomena -phenomenon -philosophical -philosophy -phone -phrase -physical -physically -physics -piano -pick -picked -picture -pictures -piece -pieces -pile -pilot -pink -pioneer -pipe -pistol -pitch place placed places -placing -plain -plan -plane -planes -planet -planetary -planets -planned -planning -plans -plant -plants -plaster -plastic -plastics -plate -plates -platform -play -played -player -players -playing -plays -pleasant please -pleased -pleasure -plenty -plot -plug -plural plus -pm -pocket -poem -poems -poet -poetic -poetry -poets point pointed pointing points -police -policeman -policies -policy -political -politicians -politics -polynomial -pond -pool -poor -popular -populate -population -porch -port -portion -pose -position -positions -positive -possessed -possession -possibilities -possibility possible possibly -post -posted -posts -pot -potential -pound -pounds -poured -poverty -powder -power -powerful -powers -pp -practical -practically -practice -practices -preceding -precious -precise -precisely -precision -prefer -preferred -preliminary -preparation -prepare -prepared -preparing -presence +potentially +predominantly present -presentation presented presenting presents -preserve -president -press -pressed -pressing -pressure -pressures -prestige 
presumably -pretty -prevent -prevented -prevention -previous previously -price -prices -pride primarily -primary -prime -primitive -principal -principle -principles -print -printed -prior -prison -prisoners -private -prize -probabilities -probability -probable probably -problem -problems -procedure -procedures -proceeded -process -processes -processing -procurement -produce -produced -producing -product -production -productive -products -prof -profession -professional -profit -profits -profound -program -programs -progress -project -projects -prominent -promise -promised -promises -promising -promote -promotion promptly -proof -propaganda -proper -properly -properties -property -proportion -proposal -proposals -proposed -prospect -prospective -prospects -protect -protected -protection -protein -protest -proud -prove -proved -provide provided provides -providing -provision -provisions -pst -psychological -public -publication -publicity -publicly -published -publisher -pull -pulled -pulling -pulmonary -punishment -pupil -pupils -purchase -purchased -pure -purely -purpose -purposes -pursuant -pursue -push -pushed put puts -putting -pvt q -qualified -qualities -quality -quantity -quarrel -quart -quarter -quarters que -question -questioned -questioning -questionnaire -questions -quick -quickly -quiet -quietly quite -quoted -quotient -qv r -race -races -racial -racing -radar -radiation -radio -rail -railroad -rain -raise -raised -raising ran -ranch -rang -range -ranging -rank -ranks -rapid -rapidly -rare -rarely -rate -rates rather -ratio -rational -raw -rd -re -reach -reached -reaches -reaching -reaction -reactionary -reactions -read -reader -readers readily -reading -ready -real -realism -realistic -reality -realization -realize -realized really -rear -reason -reasonable reasonably -reasons -recall -recalled -receive -received -receives -receiving recent recently -reception -recognize -recognized -recommend -recommendation -recommendations -recommended -record 
-recorded -recording -records -recovery -recreation -rector -red -reduce -reduced -reducing -reduction -ref -refer -reference -referred -reflect -reflected -reflection -reflects -reform -refrigerator -refund -refused -regard -regarded regarding regardless regards -regime -regiment -region -regional -regions -register -registered -registration -regular -regularly -regulations -rehabilitation -rejected related -relating -relation -relations -relationship -relationships -relative relatively -relatives -release -released -relevant -reliable -relief -relieved -religion -religious -remain -remainder -remained -remaining -remains -remark -remarkable -remarked -remarks -remember -remembered -reminded -remote -removal -remove -removed -rendered -rent -reorganization -rep -repair -repeat -repeated -replace -replaced -replacement -replied -reply -report -reported -reporter -reporters -reports -represent -representative -representatives -represented -representing -represents -reprint -reps -republic -reputation -request -require -required -requirement -requirements -requires -res -research -reserve -reserved -residence -residential -residents -resist -resistance -resolution -resolved -resources -respect -respectable -respective respectively -respects -respond -responded -response -responses -responsibilities -responsibility -responsible -rest -restaurant -restrictions -result -resulted -resulting -results -resumed -retained -retired -retirement -return -returned -returning -returns -rev -reveal -revealed -reveals -revenues -review -revolution -revolutionary -rhythm -rich -rid -ride -riding -rifle -rifles -right -rights -rigid -ring -rise -rises -rising -risk -ritual -river -road -roads -rock -rocks -rode -role -roles -roll -rolled -romantic -roof -room -rooms -root -roots -rope -rose -rough -roughly -round -route -routine -row -rt -rub -rule -ruled -rules -ruling run -running -runs -rural -rush -rushed s -sacred -sacrifice -sad -saddle -safe -safety said -sail -sailing -sake 
-salary -sale -sales -saline -salt same -sample -sampling -sand -sang -sat -satisfaction -satisfactory -satisfied -saturday -sauce -save -saved -saving -savings saw say saying says -scale -scarcely -scared -scattered -scene -scenes -schedule -scheduled -scheme -scholars -scholarship -school -schools -science -scope -score -screen -sea -search -searching -season -seat -seated second -secondary secondly seconds -secret -secretary -secrets section -sections -secure -security see -seed -seeds seeing -seek -seeking seem seemed seeming seems seen sees -segment -seized -seldom -select -selected -selection self -sell -selling selves -sen -senator -send -sending -senior -sens -sense -sensible -sensitive -sensitivity sent -sentence -sentiment -sep -separate -separated -sept -september -sequence -sergeant -series -serious -seriously -servants -serve -served -serves -service -services -serving -session -sessions -set -sets -setting -settle -settled -settlement seven -seventh +seventy several -severe -sewage -sex -sexual -sfc -sgt -shade -shadows -shaking shall -shame shan't -shape -shapes -share -shared -shares -sharing -sharp -sharply +shant she she'd she'll she's -shear -sheep -sheet +shed shell -shelter -shelters -shift -shine -shining -ship -shipping -ships -shirt -shock -shoe -shoes -shook -shoot -shooting -shop -shopping -shore -short -shortly -shorts -shot -shots +shes should -shoulder -shoulders +should've shouldn shouldn't -shout -shouted -shouting +shouldnt show showed showing shown +showns shows -shut -sick side sides -sidewalk -sighed -sight -sign -signal -signals -signed -significance significant -signs -silence -silent -silver +significantly similar similarly -simple -simply -sin since -sing -singing -single -sink -sister -sit site -sitter -sitting -situation -situations six -sixteen -sixties sixty -size -skill -skilled -skills -skin -skirt -sky -skywave -slave -sleep -slender -slept -slid -slide -slight slightly -slim -slip -slipped -slow -slowly -small -smaller 
-smallest -smart -smell -smile -smiled -smoke -smooth -snake -snakes -snapped -snow -so -so-called -soap -social -socialism -societies -society -soft -softly -soil -sold -soldier -solely -solid -solution -solve -solved some somebody someday somehow someone +somethan something sometime sometimes somewhat somewhere -son -song -songs soon -sophisticated -sorry -sort -sought -soul -souls -sound -sounded -sounds -source -sources -south -sovereign -sovereignty -space -span -spare -speak -speaker -speaking -special -specialists -species -specific specifically specified specify specifying -specimen -spectacular -speech -speeches -speed -spell -spend -spending -spent -sphere -spirit -spirits -spiritual -spite -splendid -spoke -spoken -sponsor -sponsored -spot -spots -spread -spring -square -sr -st -stable -staff -stage -stages -staining -stairs -stake -stand -standard -standards -standing -stands -star -stared -staring -stars -start -started -starting -startled -starts state -stated -statement -statements states -station -stations -statistics -status -stay -stayed -stead -steadily -steady -steam -steel -stem -stems -step -stepped -steps -stick -sticks -stiff still -stock -stockholders -stomach -stone -stood stop -stopped -storage -store -stored -stores -stories -storm -story -straight -strain -strange -stranger -strategic -strategy -stream -street -streets -stress -stressed -stresses -stretch -stretched -strictly -strike -strikes -striking -string -strip -stroke -strong -stronger -strongest strongly -struck -structural -structure -structures -struggle -struggling -stuck -student -studio -study -stuff -stumbled -stupid -style -styles -sub -subject -subjected -subjects -submitted -substance -substances -substantial substantially -substitute -substrate -subtle -subtract -suburban -succeeded -success -successes -successful successfully -succession such -sudden -suddenly -suffer -suffered -suffering -sufficient sufficiently -suffix -sugar suggest -suggested -suggestion 
-suggestions -suggests -suit -suitable -suitcase -suite -suited -suits -sum -summary -summer -sun -sunday sup -supervision -supper -supplement -supplied -supplies -supply -support -supported -supporting -suppose -supposed -supt sure -surely -surface -surfaces -surg -surplus -surprise -surprised -surprising -surrender -surrounded -survey -survival -survive -suspect -suspected -suspended -suspicion -sweat -sweet -swept -swift -swim -swimming -swing -switch -switches -swung -syllable -symbol -symbolic -symbols -sympathetic -sympathy -system -systems t t's -table -tables -tactics -tag -tagged -tags -tail take taken -takes taking -tale -talent -talents -talk -talked -talking -tall -tangent -tangible -tape -target -task -tasks -taste -taught -teach -team -teams -tears -technical -technique -techniques -technology -teeth -telephone -television tell -telling -tells -temperature -temperatures -temporarily -temporary ten -tend -tended -tendency tends -tension -tent -term -terms -terrible -test -tested -testimony -testing -tests -text -textile -th than -thank -thanks -thanx that that'll that's that've +thatll thats +thatve the -theater their theirs them -theme themselves then thence -theological -theoretical -theories -theory there there'd there'll @@ -4178,332 +774,96 @@ there's there've thereafter thereby +thered therefore therein +therell +thereof +therere theres +thereto thereupon -thermal +thereve these they they'd they'll they're they've -thick -thickness +theyd +theyll +theyre +theyve thin thing things -think -thinking -thinks third thirty this thorough thoroughly those +thou though -thought -thoughts +thoughh thousand -thousands -threat -threatened -threatening three -threw -throat +throug through throughout -throw -thrown thru -thrust -thursday thus -thyroid -tie -tied -tight +til till -time -times -tiny tip -tire -tired -tissue -title +tis to -toast today -toes together -told -tomorrow -tone -tones -tongue -tonight -tons too took -tool -tools -tooth top -torn 
-tossed -total -totally -touch -touched -tough -tour -tournament toward towards -town -towns -trace -track -tractor -trade -traders -trading -tradition -traditional -traditions -traffic -tragedy -tragic -train -trained -training -transfer -transferred -transformation -transformed -transition -transportation -trap -travel -traveled -treat -treated -treatment -tree -trees -trembling -tremendous -trend -trends -trial -trials -triangle -tribute tried tries -trim -trip -trips -triumph -troops -trouble -troubled -troubles -truck -trucks -true -truly -trust -truth try trying -tsunami -tube -tubes -tuesday -turn -turned -turning -turns +twas +twelve twenty -twenty-five twice two -type -types -typical u -ugly -ultimate -un -unable -uncertain -uncle -unconscious under -underground -underlying underneath -understand -understanding -understood -undoing -undoubtedly -uneasy -unexpected -unfortunate -unfortunately -unhappy -uniform -union -unions -unique -unit -units -unity -universal -universe -universities -unknown unless unlike -unlikely until unto -unusual up -update upon -upper -upstairs -upward -upwards -urban -urge -urged -urgent -url us use used -useful -user -username uses using -usual usually -utc -utility -utopian -utterly v -vacation -vacuum -valid -valign -valley -valuable -value -variable -variables -variation -variations -varied -variety various -vary -varying -vast -vehicle -vehicles -vein -velocity -venture -verb -verbal -verse -version versus very -veteran via -vice -video -view -viewed -views -vigorous -village -virtually -virtue -visible -vision -visit -visited -visiting -visitors -visual -vital -vivid viz -vocational -voice -voices -vol -volume -volumes -voluntary -volunteers -vote -voted -voters -votes -voting -vowel -vs w -wage -wages -wagon -wait -waited -waiting -wake -walk -walked -walking -wall -walls want wanted wanting wants -war -ward -warfare -warm -warmth -warned -warning -warrant was -wash -washed -washing +wasn wasn't -waste -watch -watched 
-watching -water -waters -wave -waves +wasnt way ways we @@ -4511,44 +871,38 @@ we'd we'll we're we've -weak -weakness -wear -wearing -weather -web -wedding -wednesday -week -weekend -weekly -weeks -weight -welcome well wells went were +weren weren't -west -wet +werent +weve what +what'd what'll what's what've whatever -wheel -wheels +whatll +whats +whatve when +when'd +when'll when's whence whenever where +where'd +where'll where's whereafter whereas whereby wherein +wheres whereupon wherever whether @@ -4556,128 +910,58 @@ which whichever while whilst -whip -whisky -whispered -white +whim whither who who'd who'll who's +whod whoever whole -wholly +wholl whom whomever +whos whose why +why'd +why'll why's -wide widely -widespread -widow width -wife -wild -wildlife -wildly will -willing -win -wind -window -windows -winds -wine -wines -wing -wings -winning -winter -wiped -wire -wisdom -wish -wished -wishes -wit with within without -witness -witnesses -wives -woman -women -won won't -wonder -wondered -wonderful -wondering -wood -wooden -woods -word -words -wore -work -worked -worker -workers -working -works -workshop -world -worn -worried -worries -worry -worse -worst -worth -worthy +wont would +would've wouldn wouldn't -wound -wounded -write -writers -writes -writing -written -wrong -wrote +wouldnt +ws www x -xml y -yard -yards +ye year -year-old years -yelled -yellow yes -yesterday yet -yield -york you you'd you'll you're you've -young -younger -youngest -youngsters +youd +youll your +youre yours yourself yourselves +youve z -zero +zero \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/es/es_stop_words.txt b/apps/common/src/python/mediawords/languages/es/es_stop_words.txt index 4f08f76cb8..91e465d8f7 100644 --- a/apps/common/src/python/mediawords/languages/es/es_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/es/es_stop_words.txt @@ -1,69 +1,199 @@ -# -# This is a stop word list for the Spanish language. 
-# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-es/blob/master/stopwords-es.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) a acerca +actualmente +adelante +ademas además adónde +afirmó +agregó +ahi +ahí +ahora al algo +algún +alguna algunas +alguno algunos +alli +allí +alrededor +ambos +ampleamos +añadió +antano +antaño ante +anterior antes +apenas +aproximadamente aquel +aquél aquella +aquélla aquellas +aquéllas +aquello aquellos +aquéllos +aqui +aquí +arriba +arribaabajo +aseguró aseveró +asi así +atras +aun +aún aunque +ayer +b +bajo +bastante +bien +breve +buen +buena +buenas +bueno +buenos +c cada +casi +cerca +cierta +ciertos +cinco +claro +comentó como +cómo con +conmigo +conocer +conseguimos +conseguir +considera +consideró +consigo +consigue +consiguen +consigues +contigo contra +cosas +creo cual +cuál cuales +cuáles +cualquier cualquiera cuando -cuál -cuáles +cuándo +cuanta +cuánta +cuantas +cuántas +cuanto cuánto +cuantos +cuántos +cuatro +cuenta +d +da +dado +dan +dar de +debajo debe +deben +debido +decir +dejó del +delante +demás +demasiado +dentro desde +despues después destacó +detras +detrás +dia +día +dias +días +dice +dicen dicho +dieron +diferente +diferentes +dijeron dijo +dio donde +dónde +dos durante e +ejemplo el +él ella ellas +ello ellos +empleais +emplean +emplear +empleas +empleo en +encima +encuentra +enfrente +entonces entre era erais +eramos +éramos eran eras eres es esa +ésa esas +ésas ese +ése eso esos +ésos esta +está +ésta estaba estabais +estábamos estaban estabas estad @@ -71,154 +201,233 @@ estada estadas estado estados +estais +estáis estamos +estan +están estando estar -estaremos estará estarán estarás estaré estaréis +estaremos estaría estaríais estaríamos estarían estarías estas +estás +éstas este +esté +éste +estéis estemos +estén +estés esto estos 
+éstos estoy estuve estuviera estuvierais +estuviéramos estuvieran estuvieras estuvieron estuviese estuvieseis +estuviésemos estuviesen estuvieses estuvimos estuviste estuvisteis -estuviéramos -estuviésemos estuvo -está -estábamos -estáis -están -estás -esté -estéis -estén -estés +ex excepto +existe +existen +explicó expresó +f fue fuera fuerais +fuéramos fueran fueras fueron fuese fueseis +fuésemos fuesen fueses fui fuimos fuiste fuisteis -fuéramos -fuésemos +g +general +gran +grandes +gueno +h ha +habéis +haber +habia +había +habíais +habíamos +habían +habías habida habidas habido habidos habiendo -habremos +habla +hablan habrá habrán habrás habré habréis +habremos habría habríais habríamos habrían habrías -habéis -había -habíais -habíamos -habían -habías hace +haceis +hacemos +hacen hacer +hacerlo +haces hacia hacía +haciendo +hago han has hasta hay haya +hayáis hayamos hayan hayas -hayáis he +hecho hemos hicieron hicimos +hizo +horas +hoy hube hubiera hubierais +hubiéramos hubieran hubieras hubieron hubiese hubieseis +hubiésemos hubiesen hubieses hubimos hubiste hubisteis -hubiéramos -hubiésemos hubo +i +igual +incluso indicó +informo informó +intenta +intentais +intentamos +intentan +intentar +intentas +intento +ir +j +junto +k +l la lado lados +largo las le +lejos les +llegó lleva +llevar lo los luego +lugar +m +mal +manera +manifestó +mas +más +mayor me mediante +medio +mejor +mencionó +menos +menudo mi +mí +mia +mía +mias +mías +mientras +mio +mío +mios +míos mis misma +mismas mismo +mismos +modo +momento +mucha +muchas mucho muchos muy -más -mí -mía -mías -mío -míos +n nada +nadie ni +ningún +ninguna +ningunas +ninguno +ningunos no nos nosotras @@ -227,147 +436,291 @@ nuestra nuestras nuestro nuestros +nueva +nuevas +nuevo +nuevos +nunca o obstante +ocho os otra otras otro otros +p +pais +paìs para +parece parte +partir +pasada +pasado +peor pero +pesar +poca +pocas poco +pocos +podeis +podemos +poder +podrá +podrán +podria +podría +podriais +podriamos 
+podrian +podrían +podrias +poner por +por qué porque porqué +posible +primer +primera +primero +primeros +principalmente +pronto +propia +propias +propio +propios +proximo +próximo +próximos pudieron pudiese pudimos +pudo +pueda puede +pueden +puedo +pues +q +qeu que +qué +quedó +queremos quien +quién quienes -qué +quiénes +quiere +quiza +quizá +quizas +quizás +r +s +sabe +sabeis +sabemos +saben +saber +sabes +sal +salvo se +sé sea +seáis seamos sean seas +segun según -seremos +segunda +segundo +seis +señaló +ser +sera será serán serás seré seréis +seremos sería seríais seríamos serían serías -seáis -señaló si +sí sido +siempre siendo +siete +sigue +siguiente sin +sino sobre sois -solo +sola +solamente +solas solía +solo +sólo +solos somos son soy +soyos su suele +supuesto sus suya suyas suyo suyos -sí -sólo +t +tal +tambien también +tampoco +tan tanto +tarde te -tendremos +temprano tendrá tendrán tendrás tendré tendréis +tendremos tendría tendríais tendríamos tendrían tendrías tened +teneis +tenéis tenemos +tener tenga +tengáis tengamos tengan tengas tengo -tengáis -tenida -tenidas -tenido -tenidos -teniendo -tenéis tenía teníais teníamos tenían tenías +tenida +tenidas +tenido +tenidos +teniendo +tercera ti +tiempo tiene tienen tienes toda todas +todavia +todavía todo todos +total +trabaja +trabajais +trabajamos +trabajan +trabajar +trabajas +trabajo tras +trata través +tres tu +tú tus tuve tuviera tuvierais +tuviéramos tuvieran tuvieras tuvieron tuviese tuvieseis +tuviésemos tuviesen tuvieses tuvimos tuviste tuvisteis -tuviéramos -tuviésemos tuvo tuya tuyas tuyo tuyos -tú +u +última +últimas +ultimo +último +últimos un una unas uno unos +usa +usais +usamos +usan +usar +usas +uso +usted +ustedes +v +va +vais +valor +vamos +van +varias +varios +vaya +veces +ver +verdad +verdadera +verdadero vez vosotras vosotros +voy vuestra vuestras vuestro vuestros +w +x y ya yo -él -éramos +z \ No newline at end of file diff --git 
a/apps/common/src/python/mediawords/languages/es/es_stop_words_old.txt b/apps/common/src/python/mediawords/languages/es/es_stop_words_old.txt new file mode 100644 index 0000000000..4f08f76cb8 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/es/es_stop_words_old.txt @@ -0,0 +1,373 @@ +# +# This is a stop word list for the Spanish language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +acerca +además +adónde +al +algo +algunas +algunos +ante +antes +aquel +aquella +aquellas +aquellos +aseveró +así +aunque +cada +como +con +contra +cual +cuales +cualquiera +cuando +cuál +cuáles +cuánto +de +debe +del +desde +después +destacó +dicho +dijo +donde +durante +e +el +ella +ellas +ellos +en +entre +era +erais +eran +eras +eres +es +esa +esas +ese +eso +esos +esta +estaba +estabais +estaban +estabas +estad +estada +estadas +estado +estados +estamos +estando +estar +estaremos +estará +estarán +estarás +estaré +estaréis +estaría +estaríais +estaríamos +estarían +estarías +estas +este +estemos +esto +estos +estoy +estuve +estuviera +estuvierais +estuvieran +estuvieras +estuvieron +estuviese +estuvieseis +estuviesen +estuvieses +estuvimos +estuviste +estuvisteis +estuviéramos +estuviésemos +estuvo +está +estábamos +estáis +están +estás +esté +estéis +estén +estés +excepto +expresó +fue +fuera +fuerais +fueran +fueras +fueron +fuese +fueseis +fuesen +fueses +fui +fuimos +fuiste +fuisteis +fuéramos +fuésemos +ha +habida +habidas +habido +habidos +habiendo +habremos +habrá +habrán +habrás +habré +habréis +habría +habríais +habríamos +habrían +habrías +habéis +había +habíais +habíamos +habían +habías +hace +hacer +hacia +hacía +han +has +hasta +hay +haya +hayamos +hayan +hayas +hayáis +he +hemos +hicieron +hicimos +hube +hubiera +hubierais +hubieran +hubieras +hubieron +hubiese +hubieseis +hubiesen +hubieses +hubimos +hubiste +hubisteis +hubiéramos +hubiésemos +hubo +indicó +informó +la +lado +lados +las +le +les +lleva +lo +los 
+luego +me +mediante +mi +mis +misma +mismo +mucho +muchos +muy +más +mí +mía +mías +mío +míos +nada +ni +no +nos +nosotras +nosotros +nuestra +nuestras +nuestro +nuestros +o +obstante +os +otra +otras +otro +otros +para +parte +pero +poco +por +porque +porqué +pudieron +pudiese +pudimos +puede +que +quien +quienes +qué +se +sea +seamos +sean +seas +según +seremos +será +serán +serás +seré +seréis +sería +seríais +seríamos +serían +serías +seáis +señaló +si +sido +siendo +sin +sobre +sois +solo +solía +somos +son +soy +su +suele +sus +suya +suyas +suyo +suyos +sí +sólo +también +tanto +te +tendremos +tendrá +tendrán +tendrás +tendré +tendréis +tendría +tendríais +tendríamos +tendrían +tendrías +tened +tenemos +tenga +tengamos +tengan +tengas +tengo +tengáis +tenida +tenidas +tenido +tenidos +teniendo +tenéis +tenía +teníais +teníamos +tenían +tenías +ti +tiene +tienen +tienes +toda +todas +todo +todos +tras +través +tu +tus +tuve +tuviera +tuvierais +tuvieran +tuvieras +tuvieron +tuviese +tuvieseis +tuviesen +tuvieses +tuvimos +tuviste +tuvisteis +tuviéramos +tuviésemos +tuvo +tuya +tuyas +tuyo +tuyos +tú +un +una +unas +uno +unos +vez +vosotras +vosotros +vuestra +vuestras +vuestro +vuestros +y +ya +yo +él +éramos diff --git a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt index aa2cb4cdf7..d1457203fe 100644 --- a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt @@ -1,18 +1,158 @@ -# -# This is a stop word list for the Finnish language. 
-# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) +aiemmin +aika +aikaa +aikaan +aikaisemmin +aikaisin +aikajen +aikana +aikoina +aikoo +aikovat +aina +ainakaan +ainakin +ainoa +ainoat +aiomme +aion +aiotte +aist +aivan +ajan +alas +alemmas +alkuisin +alkuun +alla +alle +aloitamme +aloitan +aloitat +aloitatte +aloitattivat +aloitettava +aloitettevaksi +aloitettu +aloitimme +aloitin +aloitit +aloititte +aloittaa +aloittamatta +aloitti +aloittivat +alta +aluksi +alussa +alusta +annettavaksi +annetteva +annettu +ansiosta +antaa +antamatta +antoi +aoua +apu +asia +asiaa +asian +asiasta +asiat +asioiden +asioihin +asioita +asti +avuksi +avulla +avun +avutta +edelle +edelleen +edellä +edeltä +edemmäs +edes +edessä +edestä +ehkä ei +eikä +eilen eivät +eli +ellei +elleivät +ellemme +ellen +ellet +ellette emme en +enemmän +eniten +ennen +ensi +ensimmäinen +ensimmäiseksi +ensimmäisen +ensimmäisenä +ensimmäiset +ensimmäisiksi +ensimmäisinä +ensimmäisiä +ensimmäistä +ensin +entinen +entisen +entisiä +entisten +entistä +enää +eri +erittäin +erityisesti +eräiden +eräs +eräät +esi +esiin +esillä +esimerkiksi et +eteen +etenkin +etessa ette +ettei että +haikki +halua +haluaa +haluamatta +haluamme +haluan +haluat +haluatte +haluavat +halunnut +halusi +halusimme +halusin +halusit +halusitte +halusivat +halutessa +haluton he +hei heidän heidät heihin @@ -22,6 +162,27 @@ heiltä heissä heistä heitä +helposti +heti +hetkellä +hieman +hitaasti +hoikein +huolimatta +huomenna +hyvien +hyviin +hyviksi +hyville +hyviltä +hyvin +hyvinä +hyvissä +hyvistä +hyviä +hyvä +hyvät +hyvää hän häneen hänelle @@ -32,8 +193,13 @@ hänessä hänestä hänet häntä +ihan +ilmeisesti itse +itsensä +itseään ja +jo johon joiden joihin @@ -46,18 +212,90 @@ joissa joista 
joita joka +jokainen +jokin +joko joksi +joku jolla jolle +jolloin jolta +jompikumpi jona jonka +jonkin +jonne +joo +jopa jos +joskus jossa josta jota +jotain +joten +jotenkin +jotenkuten jotka +jotta +jouduimme +jouduin +jouduit +jouduitte +joudumme +joudun +joudutte +joukkoon +joukossa +joukosta +joutua +joutui +joutuivat +joutumaan +joutuu +joutuvat +juuri +jälkeen +jälleen +jää +kahdeksan +kahdeksannen +kahdella +kahdelle +kahdelta +kahden +kahdessa +kahdesta +kahta +kahteen +kai +kaiken +kaikille +kaikilta +kaikkea +kaikki +kaikkia +kaikkiaan +kaikkialla +kaikkialle +kaikkialta +kaikkien +kaikkin +kaksi +kannalta +kannattaa kanssa +kanssaan +kanssamme +kanssani +kanssanne +kanssasi +kauan +kauemmas +kaukana +kautta +kehen keiden keihin keiksi @@ -67,6 +305,7 @@ keiltä keinä keissä keistä +keitten keitä keneen keneksi @@ -78,13 +317,68 @@ kenenä kenessä kenestä kenet -ketkä +kenettä +kennessästä +kenties +kerran +kerta +kertaa +keskellä +kesken ketkä ketä +kiitos +kohti +koko +kokonaan +kolmas +kolme +kolmen +kolmesti koska +koskaan +kovin kuin +kuinka +kuinkan +kuitenkaan +kuitenkin kuka +kukaan +kukin +kukka +kumpainen +kumpainenkaan +kumpi +kumpikaan +kumpikin kun +kuten +kuuden +kuusi +kuutta +kylliksi +kyllä +kymmenen +kyse +liian +liki +lisäksi +lisää +lla +luo +luona +lähekkäin +lähelle +lähellä +läheltä +lähemmäs +lähes +lähinnä +lähtien +läpi +mahdollisimman +mahdollista me meidän meidät @@ -95,14 +389,36 @@ meiltä meissä meistä meitä +melkein +melko +menee +meneet +menemme +menen +menet +menette +menevät +meni +menimme +menin +menit +menivät +mennessä +mennyt +menossa mihin +mikin miksi mikä +mikäli +mikään mille +milloin +milloinkan millä miltä minkä -minkä +minne minua minulla minulle @@ -113,14 +429,48 @@ minusta minut minuun minä -minä missä mistä +miten mitkä mitä +mitään +moi +molemmat +mones +monesti +monet +moni +monta +muassa +muiden +muita +muka mukaan +mukaansa +mukana mutta +muu +muualla +muualle +muualta +muuanne +muulloin +muun +muut 
+muuta +muutama +muutaman +muuten +myöhemmin +myös +myöskin +myöskään +myötä ne +neljä +neljän +neljää niiden niihin niiksi @@ -128,7 +478,6 @@ niille niillä niiltä niin -niin niinä niissä niistä @@ -144,6 +493,7 @@ noina noissa noista noita +nro nuo nyt näiden @@ -152,16 +502,28 @@ näiksi näille näillä näiltä +näin näinä näissä +näissähin +näissälle +näissältä +näissästä näistä näitä nämä +ohi +oikea +oikealla +oikein ole olemme olen olet olette +oleva +olevan +olevat oli olimme olin @@ -176,21 +538,84 @@ olitte olivat olla olleet +olli ollut +oma +omaa +omaan +omaksi +omalle +omalta +oman +omassa +omat +omien +omiin +omiksi +omille +omilta +omissa +omista on +onkin +onko ovat +paikoittain +paitsi +pakosti +paljon +paremmin +parempi +parhaillaan +parhaiten +perusteella +peräti +pian +pieneen +pieneksi +pienelle +pienellä +pieneltä +pienempi +pienestä +pienin poikki +puolesta +puolestaan +päälle +saakka +sadam +sama +samaa +samaan +samalla +samallalta +samallassa +samallasta +saman +samat +samoin +sata +satojen se +seitsemän sekä sen +seuraavat +siellä +sieltä siihen siinä +siis siitä +sijaan siksi sille +silloin sillä -sillä +silti siltä +sinne sinua sinulla sinulle @@ -201,10 +626,32 @@ sinusta sinut sinuun sinä -sinä +siten +sitten sitä +ssa +sta +suoraan +suuntaan +suuret +suuri +suuria +suurin +suurten +taa +taas +taemmas +tahansa tai +takaa +takaisin +takana +takia tallä +tapauksessa +tarpeeksi +tavalla +tavoitteena te teidän teidät @@ -215,6 +662,19 @@ teiltä teissä teistä teitä +tietysti +todella +toinen +tois +toisaalla +toisaalle +toisaalta +toiseen +toiseksi +toisella +toiselle +toiselta +toisemme tuo tuohon tuoksi @@ -239,4 +699,4 @@ tätä vaan vai vaikka -yli +yli \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/fi/fi_stop_words_old.txt b/apps/common/src/python/mediawords/languages/fi/fi_stop_words_old.txt new file mode 100644 index 0000000000..aa2cb4cdf7 --- /dev/null +++ 
b/apps/common/src/python/mediawords/languages/fi/fi_stop_words_old.txt @@ -0,0 +1,242 @@ +# +# This is a stop word list for the Finnish language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +ei +eivät +emme +en +et +ette +että +he +heidän +heidät +heihin +heille +heillä +heiltä +heissä +heistä +heitä +hän +häneen +hänelle +hänellä +häneltä +hänen +hänessä +hänestä +hänet +häntä +itse +ja +johon +joiden +joihin +joiksi +joilla +joille +joilta +joina +joissa +joista +joita +joka +joksi +jolla +jolle +jolta +jona +jonka +jos +jossa +josta +jota +jotka +kanssa +keiden +keihin +keiksi +keille +keillä +keiltä +keinä +keissä +keistä +keitä +keneen +keneksi +kenelle +kenellä +keneltä +kenen +kenenä +kenessä +kenestä +kenet +ketkä +ketkä +ketä +koska +kuin +kuka +kun +me +meidän +meidät +meihin +meille +meillä +meiltä +meissä +meistä +meitä +mihin +miksi +mikä +mille +millä +miltä +minkä +minkä +minua +minulla +minulle +minulta +minun +minussa +minusta +minut +minuun +minä +minä +missä +mistä +mitkä +mitä +mukaan +mutta +ne +niiden +niihin +niiksi +niille +niillä +niiltä +niin +niin +niinä +niissä +niistä +niitä +noiden +noihin +noiksi +noilla +noille +noilta +noin +noina +noissa +noista +noita +nuo +nyt +näiden +näihin +näiksi +näille +näillä +näiltä +näinä +näissä +näistä +näitä +nämä +ole +olemme +olen +olet +olette +oli +olimme +olin +olisi +olisimme +olisin +olisit +olisitte +olisivat +olit +olitte +olivat +olla +olleet +ollut +on +ovat +poikki +se +sekä +sen +siihen +siinä +siitä +siksi +sille +sillä +sillä +siltä +sinua +sinulla +sinulle +sinulta +sinun +sinussa +sinusta +sinut +sinuun +sinä +sinä +sitä +tai +tallä +te +teidän +teidät +teihin +teille +teillä +teiltä +teissä +teistä +teitä +tuo +tuohon +tuoksi +tuolla +tuolle +tuolta +tuon +tuona +tuossa +tuosta +tuotä +tähän +täksi +tälle +tältä +tämä +tämän +tänä +tässä +tästä +tätä +vaan +vai +vaikka +yli diff --git a/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt 
b/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt index 2f7ed427ca..291bd4a78d 100644 --- a/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt @@ -1,18 +1,44 @@ -# -# This is a stop word list for the French language. -# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-fr/blob/master/stopwords-fr.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) +a +à +â a-t-on +abord +absolument +afin +ah ai -aie aient aies +ailleurs +ainsi ait +allaient +allo +allô +allons +alors +anterieur +anterieure +anterieures +apres +après as +assez +attendu au +aucun +aucune +aucuns +aujourd +aujourd'hui +aupres +auquel aura aurai auraient @@ -24,41 +50,190 @@ auriez aurions aurons auront +aussi +autant +autre +autrement +autres +autrui aux +auxquelles +auxquels avaient avais avait +avant avec avez aviez -avions +avoir avons ayant ayante ayantes -ayants ayez ayons +b +bah +bas +basee +beaucoup +bien +bigre +bon +boum +brrr c +ça +car ce +ceci +cela +celà celle +celle-ci +celle-là +celles +celles-ci +celles-là +celui +celui-ci +celui-là +cent +cependant +certain +certaine +certaines +certains +certes ces +cet +cette +ceux +ceux-ci +ceux-là +chacun +chacune +chaque +chère +chères +chers +chez +ci +cinq +cinquantaine +cinquante +cinquantième +cinquième +clac +clic +combien +comme +comment +comparable +comparables +compris +concernant +contre d d'une +da dans de +debout +début +dedans +dehors +deja +delà +depuis +dernier +derniere +derriere +derrière des +dès +desormais +désormais +desquelles +desquels +dessous +dessus +deux +deuxième +deuxièmement +devant +devers +devra +devrait +different +différent +différente +differentes +différentes +differents +différents +dire +dit +dite +dits +dix +dix-huit +dix-neuf +dix-sept +dixième 
+doit +doivent donc dont +dos +douze +douzième +dring +droite du +duquel +durant +e +effet +egalement +egales +eh elle +elle-même +elles +elles-mêmes en +encore +enfin +entre +envers +environ es +ès +essai est et +étaient +étais +était +etant +étant +étante +étantes +étants +état +etc +êtes +étiez +étions +etre +être eu eue eues +euh +eûmes eurent eus eusse @@ -67,10 +242,27 @@ eusses eussiez eussions eut -eux -eûmes eût eûtes +eux +eux-mêmes +exactement +excepté +extenso +exterieur +f +façon +fais +faisaient +faisant +fait +faites +feront +fi +flac +fois +font +fûmes furent fus fusse @@ -79,56 +271,270 @@ fusses fussiez fussions fut -fûmes fût fûtes +g +gens +h +ha +haut +hé +hein +hélas +hem +hep +hi +ho +holà +hop +hormis +hors +hou +houp +hue +hui +huit +huitième +hum +i +ici il ils +importe j je +jusqu +jusque +juste +k l l' la +là +laisser +laquelle +las le +lequel les +lès +lesquelles +lesquels leur leurs +longtemps +lors +lorsque lui +lui-meme +lui-même m ma +maint +maintenant mais +malgre +malgré me +meme +même +memes +mêmes +merci mes +mien +mienne +miennes +miens +mille +mince +mine +minimale moi +moi-meme +moi-même +moindres +moins mon -même +mot +moyennant +multiple +multiples n n'a n'est +na +naturelles ne +neanmoins +néanmoins +necessaire +necessairement +neuf +neuvième ni +nombreuses +nommés +non nos +notamment notre +nôtre +nôtres nous +nous-mêmes +nouveau +nouveaux +nul +o +ô +oh +ohé +olé +ollé on ont +onze +onzième +ore ou où +ouf +ouias +oust +ouste +ouvert +ouverte +ouverts +o| +p +paf +pan par +parce +parfois +parle +parlent +parler +parmi +parole +parseme +partant +particulier +particulière +particulièrement pas +passé +pendant +pense +permet +personne +personnes +peu +peut +peuvent +peux +pff +pfft +pfut +pièce +pif +pire +plein +plupart +plus +plusieurs +plutôt +possessif +possessifs +possible +possibles +pouah pour +pourquoi +pourrais +pourrait +pouvait +prealable +precisement +premier +première +premièrement +pres +près +probable 
+probante +procedant +proche +psitt +pu +puis +puisque +pur +pure +q qu qu'elle qu'il qu'on qu'une quand +quant +quant-à-soi +quanta +quarante +quatorze +quatre +quatre-vingt +quatrième +quatrièmement que +quel +quelconque +quelle +quelles +quelqu'un +quelque +quelques +quels qui +quiconque +quinze +quoi +quoique +r +rare +rarement +relative +relativement +rend +rendre +restant +reste +restent +retour +revoici +revoilà +rien s s'est sa +sacrebleu +sait +sans +sapristi +sauf se +sein +seize +selon +semblable +semblaient +semble +semblent +sent +sept +septième sera serai seraient @@ -141,45 +547,125 @@ serions serons seront ses +seul +seule +seulement si +sien +sienne +siennes +siens +sinon +six +sixième +soi +soi-même soient sois soit +soixante sommes son sont +sous +souvent soyez soyons +stop +strictement +suffit suis +suit +suivant +suivante +suivantes +suivants +suivre +sujet sur +surtout t ta +tac +tandis +tant +tardive te +té +telle +tellement +telles +tels +tenant +tend +tenir +tente tes +tic +tien +tienne +tiennes +tiens +toc toi +toi-même ton +touchant +toujours +tous +tout +toute +toutefois +toutes +treize +trente +tres +très +trois +troisième +troisièmement +trop +tsoin +tsouin tu +u un une +unes +uniformement +unique +uniques +uns +v va vais +valeur +vas +vé +vers +via +vif +vifs +vingt +vivat +vive +vives +vlan +voici +voie +voient +voilà +voire +vont vos votre +vôtre +vôtres vous +vous-mêmes +vu +w +x y -à -étaient -étais -était -étant -étante -étantes -étants -étiez -étions -été -étée -étées -étés -êtes -être +z +zut \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/fr/fr_stop_words_old.txt b/apps/common/src/python/mediawords/languages/fr/fr_stop_words_old.txt new file mode 100644 index 0000000000..2f7ed427ca --- /dev/null +++ b/apps/common/src/python/mediawords/languages/fr/fr_stop_words_old.txt @@ -0,0 +1,185 @@ +# +# This is a stop word list for the French language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a-t-on +ai +aie +aient +aies +ait +as +au +aura +aurai +auraient +aurais +aurait +auras +aurez +auriez +aurions +aurons +auront +aux +avaient +avais +avait +avec +avez +aviez +avions +avons +ayant +ayante +ayantes +ayants +ayez +ayons +c +ce +celle +ces +d +d'une +dans +de +des +donc +dont +du +elle +en +es +est +et +eu +eue +eues +eurent +eus +eusse +eussent +eusses +eussiez +eussions +eut +eux +eûmes +eût +eûtes +furent +fus +fusse +fussent +fusses +fussiez +fussions +fut +fûmes +fût +fûtes +il +ils +j +je +l +l' +la +le +les +leur +leurs +lui +m +ma +mais +me +mes +moi +mon +même +n +n'a +n'est +ne +ni +nos +notre +nous +on +ont +ou +où +par +pas +pour +qu +qu'elle +qu'il +qu'on +qu'une +quand +que +qui +s +s'est +sa +se +sera +serai +seraient +serais +serait +seras +serez +seriez +serions +serons +seront +ses +si +soient +sois +soit +sommes +son +sont +soyez +soyons +suis +sur +t +ta +te +tes +toi +ton +tu +un +une +va +vais +vos +votre +vous +y +à +étaient +étais +était +étant +étante +étantes +étants +étiez +étions +été +étée +étées +étés +êtes +être diff --git a/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt b/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt index 07c7723d36..cc7896d042 100644 --- a/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt @@ -1,9 +1,8 @@ # # This is a stop word list for the Hausa language. 
-# # Sources: # https://github.com/stopwords-iso/stopwords-ha/blob/master/raw/gh-stopwords-json-ha.txt -# +# (Lightly edited to remove words in the original lists that are actually meaningful) a amma @@ -43,4 +42,4 @@ ya yake yana yi -za +za \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ha/ha_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ha/ha_stop_words_old.txt new file mode 100644 index 0000000000..07c7723d36 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ha/ha_stop_words_old.txt @@ -0,0 +1,46 @@ +# +# This is a stop word list for the Hausa language. +# +# Sources: +# https://github.com/stopwords-iso/stopwords-ha/blob/master/raw/gh-stopwords-json-ha.txt +# + +a +amma +ba +ban +ce +cikin +da +don +ga +in +ina +ita +ji +ka +ko +kuma +lokacin +ma +mai +na +ne +ni +sai +shi +su +suka +sun +ta +tafi +take +tana +wani +wannan +wata +ya +yake +yana +yi +za diff --git a/apps/common/src/python/mediawords/languages/hi/__init__.py b/apps/common/src/python/mediawords/languages/hi/__init__.py index 4a98351b3b..12f6e60b8f 100644 --- a/apps/common/src/python/mediawords/languages/hi/__init__.py +++ b/apps/common/src/python/mediawords/languages/hi/__init__.py @@ -21,6 +21,9 @@ class HindiLanguage(StopWordsFromFileMixIn): # Stop words map '__stop_words_map', + # FIXME remove once stopword comparison is over + '__stop_words_old_map', + # Hunspell instance '__hindi_hunspell', diff --git a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt index 27440bfb15..0682f8985c 100644 --- a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt @@ -1,28 +1,34 @@ -# -# This is a stop word list for the Hindi language. 
-# # Sources: -# http://www.ranks.nl/stopwords/hindi +# # http://members.unine.ch/jacques.savoy/clef/hindiST.txt -# https://sites.google.com/site/kevinbouge/stopwords-lists # http://resgtholpadi.blogspot.com/2012/07/hindi-stop-words-list.html -# +# http://www.ranks.nl/stopwords/hindi +# https://github.com/stopwords-iso/stopwords-hi/blob/master/stopwords-hi.txt +# https://sites.google.com/site/kevinbouge/stopwords-lists +# (Lightly edited to remove words in the original lists that are actually meaningful) अंदर अत अथवा +अदि अन्य +अप अपना +अपनि अपनी अपने अब +अभि अभी आज आदि आप +इंहिं +इंहें +इंहों +इतयादि इत्यादि इन -इन इनका इनके इन्हीं @@ -30,38 +36,46 @@ इन्हों इस इसका +इसकि इसकी इसके इसमें +इसि इसी इसे +उंहिं +उंहें +उंहों उच्च उत्तर उन उनका +उनकि उनकी उनके उनको उन्हीं उन्हें -उन्हें उन्हों उस उसकी उसके +उसि उसी उसे ऊपर एक एवं एस +एसे ऐसा ऐसे +ओर और +कइ कई कभी कम -कर करता करते करना @@ -71,15 +85,16 @@ कहते कहा का +काफि काफ़ी कि +किंहें +किंहों किए कितना किन्हें किन्हों किया -किर -किस किसी किसे की @@ -87,7 +102,10 @@ कुल के को +कोइ कोई +कोन +कोनसा कौन कौनसा गई @@ -95,15 +113,18 @@ गया गयी गये -घर जब जहाँ +जहां जा जाता जाती जाते जाने +जिंहें +जिंहों जितना +जिधर जिन जिन्हें जिन्हों @@ -112,6 +133,8 @@ जिससे जिसे जीधर +जेसा +जेसे जैसा जैसे जो @@ -119,6 +142,8 @@ तथा तब तरह +तिंहें +तिंहों तिन तिन्हें तिन्हों @@ -127,26 +152,31 @@ तुम तो था +थि थी थे दबारा +दवारा दिया दुसरा +दुसरे दूर दूसरे दो दोनों द्वारा न +नहिं नहीं ना +निचे निहायत नीचे ने पर -पर परंतु पहले +पुरा पूरा पूरे पे @@ -154,14 +184,17 @@ फिर बड़ा बड़े +बनि बनी -बही +बहि बहुत बाद बाला बाहर बिलकुल बीच +भि +भितर भी भीतर मगर @@ -174,11 +207,13 @@ यह यहाँ यहां +यहि यही या यिह ये रखें +रवासा रहती रहा रहे @@ -189,10 +224,12 @@ लेकर लेकिन व +वगेरह वर्ग वह -वह वहाँ +वहां +वहिं वहीं वाले वुह @@ -203,26 +240,32 @@ सकती सकते सबसे +सभि सभी समय साथ साबुत -साभ सारा से सो स्थान +हि ही +हुअ हुआ +हुइ हुई हुए हुये +हे +हें है हैं हो होता +होति होती होते होना होने -के +के \ No newline at end of file diff --git 
a/apps/common/src/python/mediawords/languages/hi/hi_stop_words_old.txt b/apps/common/src/python/mediawords/languages/hi/hi_stop_words_old.txt new file mode 100644 index 0000000000..27440bfb15 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/hi/hi_stop_words_old.txt @@ -0,0 +1,228 @@ +# +# This is a stop word list for the Hindi language. +# +# Sources: +# http://www.ranks.nl/stopwords/hindi +# http://members.unine.ch/jacques.savoy/clef/hindiST.txt +# https://sites.google.com/site/kevinbouge/stopwords-lists +# http://resgtholpadi.blogspot.com/2012/07/hindi-stop-words-list.html +# + +अंदर +अत +अथवा +अन्य +अपना +अपनी +अपने +अब +अभी +आज +आदि +आप +इत्यादि +इन +इन +इनका +इनके +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उच्च +उत्तर +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हें +उन्हों +उस +उसकी +उसके +उसी +उसे +ऊपर +एक +एवं +एस +ऐसा +ऐसे +और +कई +कभी +कम +कर +करता +करते +करना +करने +करें +कल +कहते +कहा +का +काफ़ी +कि +किए +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गई +गए +गया +गयी +गये +घर +जब +जहाँ +जा +जाता +जाती +जाते +जाने +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसमें +जिससे +जिसे +जीधर +जैसा +जैसे +जो +तक +तथा +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तुम +तो +था +थी +थे +दबारा +दिया +दुसरा +दूर +दूसरे +दो +दोनों +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +परंतु +पहले +पूरा +पूरे +पे +प्रति +फिर +बड़ा +बड़े +बनी +बही +बहुत +बाद +बाला +बाहर +बिलकुल +बीच +भी +भीतर +मगर +मध्य +मानो +मे +में +मै +यदि +यह +यहाँ +यहां +यही +या +यिह +ये +रखें +रहती +रहा +रहे +ऱ्वासा +लिए +लिया +लिये +लेकर +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकती +सकते +सबसे +सभी +समय +साथ +साबुत +साभ +सारा +से +सो +स्थान +ही +हुआ +हुई +हुए +हुये +है +हैं +हो +होता +होती +होते +होना +होने +के diff --git a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt index 13c70d9d6f..da87882c7a 100644 --- 
a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt @@ -1,206 +1,792 @@ -# -# This is a stop word list for the Hungarian language. -# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-hu/blob/master/stopwords-hu.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) a +abba abban +abból +addig ahhoz ahogy ahol aki akik akkor +akár +alapján alatt +alatta +alattad +alattam +alattatok +alattuk +alattunk +alá +alád +alájuk +alám +alánk +alátok +alól +alóla +alólad +alólam +alólatok +alóluk +alólunk amely +amelybol amelyek amelyekben amelyeket amelyet +amelyik amelynek ami amikor amit amolyan +amott amíg annak +annál arra arról +attól az +aznap azok +azokat +azokba +azokban +azokból +azokhoz +azokig +azokkal +azokká +azoknak +azoknál +azokon +azokra +azokról +azoktól +azokért azon azonban +azonnal azt aztán azután azzal +azzá azért +bal +balra +ban be +belé +beléd +beléjük +belém +belénk +belétek belül +belőle +belőled +belőlem +belőletek +belőlük +belőlünk +ben benne +benned +bennem +bennetek +bennük +bennünk bár +bárcsak +bármilyen +búcsú cikk cikkek cikkeket csak +csakhogy +csupán de +dehogy e +ebbe ebben +ebből eddig egy +egyebek +egyebet +egyedül +egyelőre egyes +egyet egyetlen egyik +egymás egyre +egyszerre egyéb +együtt egész +egészen ehhez ekkor el +eleinte ellen +ellenes +elleni +ellenére +elmondta elsõ +első +elsők +elsősorban +elsőt +elé +eléd elég +eléjük +elém +elénk +elétek elõ elõször elõtt +elő +előbb +elől +előle +előled +előlem +előletek +előlük +előlünk +először +előtt +előtte +előtted +előttem +előttetek +előttük +előttünk +előző emilyen +engem ennek +ennyi +ennél +enyém erre +erről +esetben +ettől ez ezek +ezekbe +ezekben +ezekből +ezeken +ezeket +ezekhez +ezekig +ezekkel +ezekké +ezeknek +ezeknél +ezekre 
+ezekről +ezektől +ezekért ezen +ezentúl +ezer +ezret ezt +ezután ezzel +ezzé ezért fel +fele +felek +felet +felett felé +fent +fenti +fél +fölé +gyakran +ha +halló +hamar hanem +harmadik +harmadikat +harminc +hat +hatodik +hatodikat +hatot +hatvan +helyett +hetedik +hetediket +hetet +hetven +hirtelen hiszen +hiába hogy hogyan +hol +holnap +holnapot +honnan +hova +hozzá +hozzád +hozzájuk +hozzám +hozzánk +hozzátok +hurrá +huszadik +hány +hányszor +hármat +három +hát +hátha +hátulsó +hét +húsz +ide +ide-оda +idén +igazán igen ill ill. illetve ilyen ilyenkor +immár +inkább +is ismét ison itt +jelenleg jobban +jobbra jó jól +jólesik +jóval +jövőre kell +kellene kellett +kelljen keressünk keresztül +ketten +kettő +kettőt +kevés ki +kiben +kiből +kicsit +kicsoda +kihez +kik +kikbe +kikben +kikből +kiken +kiket +kikhez +kikkel +kikké +kiknek +kiknél +kikre +kikről +kiktől +kikért +kilenc +kilencedik +kilencediket +kilencet +kilencven +kin +kinek +kinél +kire +kiről +kit +kitől +kivel +kivé +kié +kiért +korábban +képest +kérem +kérlek +kész +késő +később +későn +két +kétszer kívül +körül +köszönhetően +köszönöm +közben +közel +közepesen +közepén +közé között közül +külön +különben +különböző +különbözőbb +különbözőek +lassan +le legalább legyen lehet +lehetetlen lehetett +lehetőleg +lehetőség lenne lenni +lennék +lennének lesz +leszek +lesznek +leszünk lett +lettek +lettem +lettünk +lévő +ma maga +magad +magam +magatokat +magukat +magunkat magát +mai majd -majd +majdnem +manapság meg +megcsinál +megcsinálnak +megint +megvan mellett +mellette +melletted +mellettem +mellettetek +mellettük +mellettünk +mellé +melléd +melléjük +mellém +mellénk +mellétek +mellől +mellőle +mellőled +mellőlem +mellőletek +mellőlük +mellőlünk mely melyek +melyik +mennyi mert mi +miatt +miatta +miattad +miattam +miattatok +miattuk +miattunk +mibe +miben +miből +mihez +mik +mikbe +mikben +mikből +miken +miket +mikhez +mikkel +mikké +miknek +miknél mikor +mikre +mikről +miktől +mikért milyen +min 
+mind +mindegyik +mindegyiket minden +mindenesetre mindenki mindent +mindenütt mindig +mindketten +minek mint mintha +minél +mire +miről mit +mitől mivel +mivé miért +mondta most +mostanáig már más másik +másikat +másnap +második +másodszor +mások +másokat +mást még +mégis míg +mögé +mögéd +mögéjük +mögém +mögénk +mögétek +mögött +mögötte +mögötted +mögöttem +mögöttetek +mögöttük +mögöttünk +mögüle +mögüled +mögülem +mögületek +mögülük +mögülünk +múltkor +múlva +na nagy nagyobb nagyon +naponta +napot ne +negyedik +negyediket +negyven +neked nekem neki +nekik +nektek +nekünk nem +nemcsak +nemrég nincs +nyolc +nyolcadik +nyolcadikat +nyolcat +nyolcvan +nála +nálad +nálam +nálatok +náluk +nálunk +négy +négyet néha néhány nélkül +o +oda +ok olyan +onnan ott pedig persze +pár +például +rajta +rajtad +rajtam +rajtatok +rajtuk +rajtunk +rendben +rosszul rá +rád +rájuk +rám +ránk +rátok +régen +régóta +részére +róla +rólad +rólam +rólatok +róluk +rólunk +rögtön s saját +se sem semmi +semmilyen +semmiség +senki +soha sok +sokan sokat sokkal +sokszor +sokáig +során +stb. 
szemben szerint +szerinte +szerinted +szerintem +szerintetek +szerintük +szerintünk +szervusz szinte számára +száz +századik +százat +szépen +szét +szíves +szívesen +szíveskedjék +sőt talán +tavaly +te +tegnap +tegnapelőtt tehát +tele teljes +tessék +ti +tied +titeket +tizedik +tizediket +tizenegy +tizenegyedik +tizenhat +tizenhárom +tizenhét +tizenkettedik +tizenkettő +tizenkilenc +tizenkét +tizennyolc +tizennégy +tizenöt +tizet tovább +további továbbá +távol +téged +tényleg +tíz több -ugyanis +többi +többször +túl +tőle +tőled +tőlem +tőletek +tőlük +tőlünk +ugyanakkor +ugyanez +ugyani +ugye +urak +uram +urat +utoljára utolsó után utána vagy vagyis vagyok +vagytok +vagyunk +vajon +valahol valaki +valakit +valamelyik valami valamint való van vannak vele +veled +velem +veletek +velük +velünk vissza +viszlát viszont +viszontlátásra volna +volnának +volnék volt voltak voltam voltunk +végre +végén +végül által általában +ám át +éljen én éppen +érte +érted +értem +értetek +értük +értünk és +év +évben +éve +évek +éves +évi +évvel így +óta õ õk õket +ön +önbe +önben +önből +önhöz +önnek +önnel +önnél +önre +önről +önt +öntől +önért +önök +önökbe +önökben +önökből +önöket +önökhöz +önökkel +önöknek +önöknél +önökre +önökről +önöktől +önökért +önökön +önön össze +öt +ötven +ötödik +ötödiket +ötöt úgy +úgyis +úgynevezett új újabb újra +úr +ő +ők +őket +őt \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/hu/hu_stop_words_old.txt b/apps/common/src/python/mediawords/languages/hu/hu_stop_words_old.txt new file mode 100644 index 0000000000..13c70d9d6f --- /dev/null +++ b/apps/common/src/python/mediawords/languages/hu/hu_stop_words_old.txt @@ -0,0 +1,206 @@ +# +# This is a stop word list for the Hungarian language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +abban +ahhoz +ahogy +ahol +aki +akik +akkor +alatt +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amikor +amit +amolyan +amíg +annak +arra +arról +az +azok +azon +azonban +azt +aztán +azután +azzal +azért +be +belül +benne +bár +cikk +cikkek +cikkeket +csak +de +e +ebben +eddig +egy +egyes +egyetlen +egyik +egyre +egyéb +egész +ehhez +ekkor +el +ellen +elsõ +elég +elõ +elõször +elõtt +emilyen +ennek +erre +ez +ezek +ezen +ezt +ezzel +ezért +fel +felé +hanem +hiszen +hogy +hogyan +igen +ill +ill. +illetve +ilyen +ilyenkor +ismét +ison +itt +jobban +jó +jól +kell +kellett +keressünk +keresztül +ki +kívül +között +közül +legalább +legyen +lehet +lehetett +lenne +lenni +lesz +lett +maga +magát +majd +majd +meg +mellett +mely +melyek +mert +mi +mikor +milyen +minden +mindenki +mindent +mindig +mint +mintha +mit +mivel +miért +most +már +más +másik +még +míg +nagy +nagyobb +nagyon +ne +nekem +neki +nem +nincs +néha +néhány +nélkül +olyan +ott +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +szemben +szerint +szinte +számára +talán +tehát +teljes +tovább +továbbá +több +ugyanis +utolsó +után +utána +vagy +vagyis +vagyok +valaki +valami +valamint +való +van +vannak +vele +vissza +viszont +volna +volt +voltak +voltam +voltunk +által +általában +át +én +éppen +és +így +õ +õk +õket +össze +úgy +új +újabb +újra diff --git a/apps/common/src/python/mediawords/languages/it/it_stop_words.txt b/apps/common/src/python/mediawords/languages/it/it_stop_words.txt index 4448e81c70..85e66e3ba8 100644 --- a/apps/common/src/python/mediawords/languages/it/it_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/it/it_stop_words.txt @@ -1,27 +1,55 @@ -# -# This is a stop word list for the Italian language. 
-# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-it/blob/master/stopwords-it.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) a +abbastanza abbia abbiamo abbiano abbiate +accidenti ad +adesso +affinché agl agli +ahime +ahimè ai al +alcuna +alcuni +alcuno all alla alle allo +allora +altre +altri +altrimenti +altro +altrove +altrui anche +ancora +anni +anno +ansa +anticipo +assai +attesa +attraverso +avanti avemmo avendo +avente +aver +avere +averlo avesse avessero avessi @@ -35,6 +63,7 @@ avevano avevate avevi avevo +avrà avrai avranno avrebbe @@ -45,22 +74,67 @@ avremo avreste avresti avrete -avrà avrò avuta avute avuti avuto +basta +ben +bene +benissimo +brava +bravo +buono c +caso +cento +certa +certe +certi +certo che chi +chicchessia +chiunque ci +ciascuna +ciascuno +cima +cinque +cio +ciò +cioe +cioè +circa +citta +città +co +codesta +codesti +codesto coi col +colei +coll +coloro +colui come +cominci +comprare +comunque con +concernente +conclusione +consecutivi +consecutivo contro +cos +cosa +cosi +così cui +d da dagl dagli @@ -70,6 +144,8 @@ dall dalla dalle dallo +dappertutto +davanti degl degli dei @@ -78,25 +154,58 @@ dell della delle dello +dentro +detto +deve +devo di +dice +dietro +dire +dirimpetto +diventa +diventare +diventato +dopo +doppio dov dove +dovra +dovrà +dovunque +due +dunque +durante e +è ebbe ebbero ebbi +ecc +ecco ed +effettivamente +egli +ella +entrambi +eppure era erano eravamo eravate eri ero +esempio +esse essendo +esser +essere +essi +ex +fa faccia facciamo facciano -facciate faccio facemmo facendo @@ -114,8 +223,10 @@ facevi facevo fai fanno +farà farai faranno +fare farebbe farebbero farei @@ -124,44 +235,112 @@ faremo fareste faresti farete -farà -farò +fatto +favore fece fecero feci +fin +finalmente +finche +fine +fino +forse +forza fosse fossero fossi fossimo 
foste fosti +fra +frattempo fu fui fummo +fuori furono +futuro +generale +gente +gia +già +giorni +giorno +giu gli +gliela +gliele +glieli +glielo +gliene +grande +grazie +gruppo ha +haha hai hanno ho i +ie +ieri il in +inc +indietro +infatti +inoltre +insieme +intanto +intorno +invece io l la +là +lasciato +lato le lei li lo +lontano loro lui +lungo +luogo ma +macche +magari +mai +male +malgrado +malissimo +me +medesimo +mediante +meglio +meno +mentre +mesi +mezzo mi mia mie miei +mila +miliardi +milioni +minimi mio +modo +molta +molti +moltissimo +molto +momento +mondo ne negl negli @@ -171,29 +350,127 @@ nell nella nelle nello +nemmeno +neppure +nessun +nessuna +nessuno +niente +no noi +nome non +nondimeno +nonostante +nonsia nostra nostre nostri nostro +novanta +nove +nulla +nuovi +nuovo o +od +oggi +ogni +ognuna +ognuno +oltre +oppure +ora +ore +osi +ossia +ottanta +otto +paese +parecchi +parecchie +parecchio +parte +partendo +peccato +peggio per +perche perché +perchè +percio +perciò +perfino +pero +però +persino +persone +piedi +pieno +piglia +piu più +piuttosto +po +pochissimo +poco +poi +poiche +possa +possedere +posteriore +posto +potrebbe +preferibilmente +presa +press +prima +primo +principalmente +probabilmente +promesso +proprio +puo +può +pure +purtroppo +qua +qualche +qualcosa +qualcuna +qualcuno quale +quali +qualunque +quando quanta quante quanti quanto +quantunque +quarto +quasi +quattro +quel quella quelle quelli quello +quest questa queste questi questo +qui +quindi +quinto +realmente +recente +recentemente +riecco +salvo +sara +sarà sarai saranno sarebbe @@ -204,21 +481,41 @@ saremo sareste saresti sarete -sarà sarò +scorso se +secondo +seguente +seguito sei +sembra +sembrare +sembrato +sembrava +sembri +sempre +senza +sette si sia siamo siano siate siete +sig +solito +solo +soltanto sono +sopra +soprattutto +sotto +spesso sta stai stando stanno +starà starai staranno starebbe @@ -229,8 +526,11 @@ staremo stareste staresti starete -starà starò 
+stata +state +stati +stato stava stavamo stavano @@ -238,10 +538,12 @@ stavate stavi stavo stemmo +stessa stesse stessero stessi stessimo +stesso steste stesti stette @@ -254,6 +556,9 @@ stiate sto su sua +subito +successivamente +successivo sue sugl sugli @@ -265,22 +570,56 @@ sulle sullo suo suoi +tale +tali +talvolta +tanto +te +tempo +terzo +th ti +titolo tra +tranne +tre +trenta +triplo +troppo +trovato tu tua tue tuo tuoi +tutta +tuttavia +tutte tutti tutto +uguali +ulteriore +ultimo un una uno +uomo +va +vai +vale +vari +varia +varie +vario +verso vi +vicino +visto +vita voi +volta +volte vostra vostre vostri -vostro -è +vostro \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/it/it_stop_words_old.txt b/apps/common/src/python/mediawords/languages/it/it_stop_words_old.txt new file mode 100644 index 0000000000..4448e81c70 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/it/it_stop_words_old.txt @@ -0,0 +1,286 @@ +# +# This is a stop word list for the Italian language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +abbia +abbiamo +abbiano +abbiate +ad +agl +agli +ai +al +all +alla +alle +allo +anche +avemmo +avendo +avesse +avessero +avessi +avessimo +aveste +avesti +avete +aveva +avevamo +avevano +avevate +avevi +avevo +avrai +avranno +avrebbe +avrebbero +avrei +avremmo +avremo +avreste +avresti +avrete +avrà +avrò +avuta +avute +avuti +avuto +c +che +chi +ci +coi +col +come +con +contro +cui +da +dagl +dagli +dai +dal +dall +dalla +dalle +dallo +degl +degli +dei +del +dell +della +delle +dello +di +dov +dove +e +ebbe +ebbero +ebbi +ed +era +erano +eravamo +eravate +eri +ero +essendo +faccia +facciamo +facciano +facciate +faccio +facemmo +facendo +facesse +facessero +facessi +facessimo +faceste +facesti +faceva +facevamo +facevano +facevate +facevi +facevo +fai +fanno +farai +faranno +farebbe +farebbero +farei +faremmo +faremo +fareste +faresti +farete +farà +farò +fece +fecero +feci +fosse +fossero +fossi +fossimo +foste +fosti +fu +fui +fummo +furono +gli +ha +hai +hanno +ho +i +il +in +io +l +la +le +lei +li +lo +loro +lui +ma +mi +mia +mie +miei +mio +ne +negl +negli +nei +nel +nell +nella +nelle +nello +noi +non +nostra +nostre +nostri +nostro +o +per +perché +più +quale +quanta +quante +quanti +quanto +quella +quelle +quelli +quello +questa +queste +questi +questo +sarai +saranno +sarebbe +sarebbero +sarei +saremmo +saremo +sareste +saresti +sarete +sarà +sarò +se +sei +si +sia +siamo +siano +siate +siete +sono +sta +stai +stando +stanno +starai +staranno +starebbe +starebbero +starei +staremmo +staremo +stareste +staresti +starete +starà +starò +stava +stavamo +stavano +stavate +stavi +stavo +stemmo +stesse +stessero +stessi +stessimo +steste +stesti +stette +stettero +stetti +stia +stiamo +stiano +stiate +sto +su +sua +sue +sugl +sugli +sui +sul +sull +sulla +sulle +sullo +suo +suoi +ti +tra +tu +tua +tue +tuo +tuoi +tutti +tutto +un +una +uno +vi +voi +vostra +vostre +vostri +vostro +è 
diff --git a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt index bfff6d32ff..6ec40c9b08 100755 --- a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt @@ -1,10 +1,9 @@ -# # This is a stop word list for the Japanese language. -# # Sources: +# # https://github.com/stopwords/japanese-stopwords/blob/master/data/japanese-stopwords.txt # Lucene's stopwords_ja.txt -# +# (Lightly edited to remove words in the original lists that are actually meaningful) $ % @@ -50,7 +49,6 @@ url いつ いま います -いや いる いろいろ う @@ -92,13 +90,11 @@ url これから これら ご -ごっちゃ ごと ごろ さ さま さまざま -さらい さらに される さん @@ -252,7 +248,6 @@ url ひと ひとつ ふく -ぶり へ への へん @@ -360,7 +355,6 @@ url 作 作ら 例 -係 俺 個 億 @@ -400,38 +394,15 @@ url 向け 向こう 和 -哀 -品 -員 -喜 -器 四 回 -国 -土 在 -地 -報じ -場 -場合 -境 -士 -夏 -外 多く 大 女 奴 婦 子 -字 -安 -官 -室 -家 -対 -小 -屋 巡る 左 市 @@ -442,79 +413,49 @@ url 店 府 度 -式 形 役 彼 彼女 後 -怒 -思わ -性 -情 -感 -感じ 我々 所 手 手段 -扱い 数 文 新た -新着 方 方法 日 -春 時 時点 時間 -更新 -書 月 期 -木 未満 -末 -本 本当 -村 -束 -枚 -校 -楽 様 様々 次 歳 -歴 段 毎 毎日 -気 -水 求め -法 -派 -火 点 -版 特に 玉 用 男 町 -界 略 百 -的 目 相 県 確か 示し -社 私 私達 秋 @@ -530,7 +471,6 @@ url 結局 続き 線 -署 考え 者 自体 @@ -544,34 +484,23 @@ url 計 話 話し -誌 語っ 読む 誰 課 -調べ -論 貴方 貴方方 輪 近く 述べ -通 -速報 -連 週 道 達 違い 選 -部 -都 -金 -銭 開か 間 関 -関係 関連 際 集 @@ -633,4 +562,4 @@ url → ↓ ■ -○ +○ \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ja/ja_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ja/ja_stop_words_old.txt new file mode 100755 index 0000000000..bfff6d32ff --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ja/ja_stop_words_old.txt @@ -0,0 +1,636 @@ +# +# This is a stop word list for the Japanese language. 
+# +# Sources: +# https://github.com/stopwords/japanese-stopwords/blob/master/data/japanese-stopwords.txt +# Lucene's stopwords_ja.txt +# + +$ +% +& +@ +lwa +posted +ref +url +” +… +▽ +、 +。 +「 +」 +『 +』 +【 +】 +あそこ +あたり +あちら +あっ +あった +あっち +あと +あな +あなた +あの +あのかた +あの人 +あり +ありました +あります +ある +あれ +い +いい +いう +いく +いくつ +いつ +いま +います +いや +いる +いろいろ +う +うち +え +お +おおまか +おまえ +およそ +および +おり +おります +おれ +か +かく +かたち +かつて +かも +かやの +から +が +がい +がら +き +きき +きた +くせ +ください +くれ +くれる +けど +こうした +ここ +こちら +こっち +こと +この +これ +これから +これら +ご +ごっちゃ +ごと +ごろ +さ +さま +さまざま +さらい +さらに +される +さん +し +しか +しかし +しかた +した +したい +して +しまう +します +しまっ +しよう +しれ +しろ +じゃ +す +すか +すでに +すね +すべて +する +すれ +ず +ずつ +せ +せる +ぜんぶ +そう +そこ +そして +そちら +そっち +そで +その +その他 +その後 +それ +それから +それぞれ +それで +それと +それなり +それに +そんな +た +たい +たくさん +ただ +ただし +たち +たび +ため +たら +たり +だ +だけ +だっ +だめ +だれ +だろ +ちゃ +ちゃん +った +って +つ +ついに +つつ +て +てる +てん +で +でき +できる +でし +でしょ +です +では +でも +と +という +といった +とおり +とか +とき +ところ +として +とって +とともに +となる +とは +とも +と共に +どう +どういう +どこ +どこか +どちら +どっか +どっち +どの +どれ +な +ない +なお +なか +なかっ +なかば +ながら +なく +なけれ +なし +なぜ +なっ +なった +など +なに +なのか +なのに +なら +なり +なる +なん +なんか +に +において +における +について +にて +にとって +によって +により +による +に対し +に対して +に対する +に関して +に関する +ね +の +ので +のに +のみ +は +はじめ +はず +はるか +ば +ばかり +ひと +ひとつ +ふく +ぶり +へ +への +へん +べき +べつ +ぺん +ほう +ほか +ほとんど +ほど +ま +まさ +まし +ましょ +ます +ませ +また +または +まで +まとも +まま +み +みたい +みつ +みなさん +みんな +も +もし +もしくは +もっと +もと +もの +ものの +もん +や +やすい +やっ +やつ +よ +よう +ような +よく +よそ +より +よる +よると +ら +られ +られる +れ +れる +ろ +わ +わけ +わたし +を +を通じて +ん +エラー +カ所 +カ月 +キロ +センチ +ページ +メートル +レ +ヵ所 +ヵ月 +ヶ所 +ヶ月 +・ +ー +一 +一つ +一方 +一覧 +七 +万 +三 +上 +上記 +下 +下記 +中 +九 +事 +二 +五 +人 +今 +今回 +他 +代 +以上 +以下 +以前 +以後 +以降 +会 +伸 +位 +体 +何 +何人 +作 +作ら +例 +係 +俺 +個 +億 +元 +兆 +先 +全部 +八 +六 +内 +円 +再 +冬 +出 +分 +列 +別 +前 +前回 +力 +化 +匹 +区 +十 +千 +半ば +及び +受け +口 +台 +右 +各 +同 +同じ +名 +名前 +向け +向こう +和 +哀 +品 +員 +喜 +器 +四 +回 +国 +土 +在 +地 +報じ +場 +場合 +境 +士 +夏 +外 +多く +大 +女 +奴 +婦 +子 +字 +安 +官 +室 +家 +対 +小 +屋 +巡る +左 +市 +席 +年 +年生 +幾つ +店 +府 +度 +式 +形 +役 +彼 +彼女 +後 +怒 +思わ +性 +情 +感 +感じ +我々 +所 +手 +手段 +扱い +数 +文 +新た +新着 +方 +方法 +日 +春 +時 +時点 +時間 +更新 
+書 +月 +期 +木 +未満 +末 +本 +本当 +村 +束 +枚 +校 +楽 +様 +様々 +次 +歳 +歴 +段 +毎 +毎日 +気 +水 +求め +法 +派 +火 +点 +版 +特に +玉 +用 +男 +町 +界 +略 +百 +的 +目 +相 +県 +確か +示し +社 +私 +私達 +秋 +秒 +第 +等 +箇所 +箇月 +簿 +系 +紀 +約 +結局 +続き +線 +署 +考え +者 +自体 +自分 +行 +行わ +見 +見る +観 +言わ +計 +話 +話し +誌 +語っ +読む +誰 +課 +調べ +論 +貴方 +貴方方 +輪 +近く +述べ +通 +速報 +連 +週 +道 +達 +違い +選 +部 +都 +金 +銭 +開か +間 +関 +関係 +関連 +際 +集 +面 +頃 +類 +首 +高 +! +!? +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +{ +| +} +~ +⦅ +⦆ +。 +「 +」 +、 +・ +¢ +£ +¬ + ̄ +¦ +¥ +₩ +│ +← +↑ +→ +↓ +■ +○ diff --git a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt old mode 100755 new mode 100644 index 69707d4e8c..5db1a5f6ef --- a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt @@ -1,19 +1,74 @@ -# # This is a stop word list for the Lithuanian language. -# # Sources: -# http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html -# auto-generated sources # +# auto-generated sources +# http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html +# https://github.com/stopwords-iso/stopwords-lt/blob/master/stopwords-lt.txt + a +abi +abidvi +abiejose +abiejuose +abiejų +abiem +abigaliai +abipus +abu +abudu +ai +ana +anaiptol +anaisiais +anajai +anajam +anajame +anapus +anas +anasai +anasis +anei +aniedvi +anieji +aniesiems +anoji +anojo +anojoje +anokia +anoks +anosiomis +anosioms +anosios +anosiose +anot +ant +antai +anuodu +anuoju +anuosiuose +anuosius +anąja +anąją +anąjį +anąsias +anųjų apie +aplink ar arba +argi +arti +aukščiau aš be bei +beje +bemaž +bent bet +betgi +beveik bus buvo būti @@ -22,112 +77,474 @@ d dabar dar darbo +dargi daryti daug daugiau daugiausia +daugmaž dažnai +deja dieną +dėka dėl +dėlei +dėlto +ech +et +gal +galbūt +galgi gali +gan +gana gauna gauti +gi +greta +idant iki ir +irgi +it 
+itin iš +išilgai +išvis +jaisiais +jajai +jajam +jajame jam jau jei +jeigu ji jie +jiedu +jiedvi +jieji +jiesiems +jinai jis +jisai jo jog +joji +jojo +jojoje +jokia +joks jos +josiomis +josioms +josios +josiose +judu +judvi +juk +jumis +jums +jumyse +juodu +juoju +juosiuose +juosius +jus +jąja +jąją +jąsias jį +jįjį +jūs +jūsiškis +jūsiškė +jūsų jų +jųjų kad kada +kadangi kai kaip +kaipgi kam kartą kas +katra +katras +katriedvi +katruodu +kažin +kažkas +kažkatra +kažkatras +kažkokia +kažkoks +kažkuri +kažkuris +kiaurai +kiek kiekvienas +kieno +kita kitas +kitokia +kitoks klausimas klausti +kodėl +kokia +koks +kol +kolei +kone kovo +kuomet kur +kurgi +kuri kurie +kuriedvi kurios kuris +kuriuodu kurių labai +lai lietuva lietuvoje lietuvos +lig +ligi +link +lyg m man +manaisiais +manajai +manajam +manajame +manas +manasai +manasis +mane +manieji +maniesiems +manim +manimi +maniškis +maniškė mano +manoji +manojo +manojoje +manosiomis +manosioms +manosios +manosiose +manuoju +manuosiuose +manuosius +manyje +manąja +manąją +manąjį +manąsias +manęs +manųjų +mat mažai mažas +maždaug mažiau +mažne mes metais metu metus metų +mudu +mudvi +mumis +mums +mumyse +mus +mūsiškis +mūsiškė mūsų +na +nagi ne +nebe +nebent negali +negi negu nei +nejau +nejaugi +nekaip +nelyginant nes net +netgi +netoli +neva niekada niekas nors nuo +nė nėra o +ogi +oi +paeiliui pagal +pakeliui +palaipsniui +palei +pas pasak pasakė +paskos +paskui +paskum pat +pati +patiems +paties +pats +patys +patį +pačiais +pačiam +pačiame +pačiu +pačiuose +pačius +pačių per +pernelyg +pirm +pirma +pirmiau po prašau prie prieš +priešais +pro +pusiau r +rasi reikia +rodos sakyti sakė +sau +savaisiais +savajai +savajam +savajame +savas +savasai +savasis +save +savieji +saviesiems +savimi +saviškis +saviškė savo +savoji +savojo +savojoje +savosiomis +savosioms +savosios +savosiose +savuoju +savuosiuose +savuosius +savyje +savąja +savąją +savąjį +savąsias +savęs +savųjų +skersai +skradžiai +stačiai su +sulig +ta +tad 
tai +taigi taip taip pat +taipogi +taisiais +tajai +tajam +tajame +tamsta tarp +tarsi +tartum +tarytum tas +tasai +tau +tavaisiais +tavajai +tavajam +tavajame +tavas +tavasai +tavasis +tave +tavieji +taviesiems +tavimi +taviškis +taviškė tavo +tavoji +tavojo +tavojoje +tavosiomis +tavosioms +tavosios +tavosiose +tavuoju +tavuosiuose +tavuosius +tavyje +tavąja +tavąją +tavąjį +tavąsias +tavęs +tavųjų tačiau +te +tegu +tegul +tiedvi +tieji +ties +tiesiems +tiesiog tik tikrai +tikriausiai +tiktai to todėl +toji +tojo +tojoje +tokia +toks +tol +tolei +toliau +tosiomis +tosioms +tosios +tosiose +tu tuo +tuodu +tuoju +tuosiuose +tuosius +turbūt turi turėjo +tąja +tąją +tąjį +tąsias +tūlas +tųjų už +užtat +užvis +va +vai val +viduj +vidury +vien vienas +vienokia +vienoks +vietoj +virš +viršuj +viršum +vis +vis dėlto +visa +visas +visgi visi +visokia +visoks +vos +vėl +vėlgi +ypač yra čia į +įkypai +įstrižai šalia šalies +še +ši +šiaisiais +šiajai +šiajam +šiajame +šiapus +šiedvi +šieji +šiesiems +šioji +šiojo +šiojoje +šiokia +šioks šios -žmonių +šiosiomis +šiosioms +šiosios +šiosiose +šis +šisai +šit +šita +šitas +šitiedvi +šitokia +šitoks +šituodu +šiuodu +šiuoju +šiuosiuose +šiuosius +šiąja +šiąją +šiąsias +šiųjų +štai +šįjį +žemiau +žmonių \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/lt/lt_stop_words_old.txt b/apps/common/src/python/mediawords/languages/lt/lt_stop_words_old.txt new file mode 100755 index 0000000000..69707d4e8c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/lt/lt_stop_words_old.txt @@ -0,0 +1,133 @@ +# +# This is a stop word list for the Lithuanian language. 
+# +# Sources: +# http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html +# auto-generated sources +# + +a +apie +ar +arba +aš +be +bei +bet +bus +buvo +būti +būtų +d +dabar +dar +darbo +daryti +daug +daugiau +daugiausia +dažnai +dieną +dėl +gali +gauna +gauti +iki +ir +iš +jam +jau +jei +ji +jie +jis +jo +jog +jos +jį +jų +kad +kada +kai +kaip +kam +kartą +kas +kiekvienas +kitas +klausimas +klausti +kovo +kur +kurie +kurios +kuris +kurių +labai +lietuva +lietuvoje +lietuvos +m +man +mano +mažai +mažas +mažiau +mes +metais +metu +metus +metų +mūsų +ne +negali +negu +nei +nes +net +niekada +niekas +nors +nuo +nėra +o +pagal +pasak +pasakė +pat +per +po +prašau +prie +prieš +r +reikia +sakyti +sakė +savo +su +tai +taip +taip pat +tarp +tas +tavo +tačiau +tik +tikrai +to +todėl +tuo +turi +turėjo +už +val +vienas +visi +yra +čia +į +šalia +šalies +šios +žmonių diff --git a/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt b/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt index 1ee9a2887d..6ef3790c11 100644 --- a/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt @@ -1,108 +1,415 @@ -# -# This is a stop word list for the Dutch language. 
-# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-nl/blob/master/stopwords-nl.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) aan +aangaande +aangezien +achte +achter +achterna +af +afgelopen al +aldaar +aldus +alhoewel +alias +alle +allebei +alleen alles als +alsnog altijd +altoos +ander andere +anders +anderszins +beetje +behalve +behoudens +beide +beiden ben +beneden +bent +bepaald +betreffende bij +bijna +bijv +binnen +binnenin +blijkbaar +blijken +boven +bovenal +bovendien +bovengenoemd +bovenstaand +bovenvermeld +buiten +bv daar +daardoor +daarheen +daarin +daarna +daarnet +daarom +daarop +daaruit +daarvanlangs dan dat de +deden +deed der +derde +derhalve +dertig deze +dhr die +dikwijls dit doch +doe doen +doet door +doorgaand +drie +duizend dus +echter een eens +eer +eerdat +eerder +eerlang +eerst +eerste +eigen +eigenlijk +elk +elke en +enig +enige +enigszins +enkel er +erdoor +erg +ergens +etc +etcetera +even +eveneens +evenwel +gauw ge +gedurende geen +gehad +gekund +geleden +gelijk +gemoeten +gemogen +genoeg geweest +gewoon +gewoonweg haar +haarzelf had +hadden +hare heb hebben +hebt +hedden heeft +heel hem +hemzelf +hen het +hetzelfde hier +hierbeneden +hierboven +hierin +hierna +hierom hij +hijzelf hoe +hoewel +honderd hun +hunne +ieder +iedere +iedereen iemand iets ik +ikzelf in +inderdaad +inmiddels +intussen +inzake is ja je +jezelf +jij +jijzelf +jou +jouw +jouwe +juist +jullie kan +klaar kon +konden +krachtens +kun kunnen +kunt +laatst +later +liever +lijken +lijkt +maak +maakt +maakte +maakten maar +mag +maken me meer +meest +meestal men met +mevr +mezelf mij mijn +mijnent +mijner +mijzelf +minder +miss +misschien +missen +mits +mocht +mochten +moest +moesten moet +moeten +mogen +mr +mrs +mw na naar +nadat +nam +namelijk +nee +neem +negen +nemen +nergens +net +niemand niet niets 
+niks +noch +nochtans nog +nogal +nooit nu +nv of +ofschoon om omdat +omhoog +omlaag +omstreeks +omtrent +omver +ondanks onder -ons +ondertussen +ongeveer +onszelf +onze +onzeker +ooit ook op +opnieuw +opzij over +overal +overeind +overige +overigens +paar +pas +per +precies +recent reeds +rond +rondom +samen +sedert +sinds +sindsdien +slechts +sommige +spoedig +steeds +tamelijk te -tegen +tenzij +terwijl +thans +tien +tiende +tijdens +tja toch +toe toen +toenmaals +toenmalig tot +totdat +tussen +twee +tweede u uit +uitgezonderd uw +vaak +vaakwat van +vanaf +vandaan +vanuit +vanwege veel +veeleer +veertig +verder +verscheidene +verschillende +vervolgens +via +vier +vierde +vijf +vijfde +vijftig +vol +volgend +volgens voor +vooraf +vooral +vooralsnog +voorbij +voordat +voordezen +voordien +voorheen +voorop +voorts +vooruit +vrij +vroeg +waar +waarom +waarschijnlijk +wanneer want waren was wat +we +wederom +weer +weg +wegens +weinig +wel +weldra +welk +welke werd +werden +werder wezen +whatever wie +wiens +wier +wij +wijzelf wil +wilden +willen +word worden wordt zal ze +zei +zeker zelf +zelfde +zelfs +zes +zeven zich +zichzelf zij zijn +zijne +zijzelf zo +zoals +zodat +zodra zonder zou +zouden +zowat +zulk +zulke +zullen +zult \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/nl/nl_stop_words_old.txt b/apps/common/src/python/mediawords/languages/nl/nl_stop_words_old.txt new file mode 100644 index 0000000000..1ee9a2887d --- /dev/null +++ b/apps/common/src/python/mediawords/languages/nl/nl_stop_words_old.txt @@ -0,0 +1,108 @@ +# +# This is a stop word list for the Dutch language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +aan +al +alles +als +altijd +andere +ben +bij +daar +dan +dat +de +der +deze +die +dit +doch +doen +door +dus +een +eens +en +er +ge +geen +geweest +haar +had +heb +hebben +heeft +hem +het +hier +hij +hoe +hun +iemand +iets +ik +in +is +ja +je +kan +kon +kunnen +maar +me +meer +men +met +mij +mijn +moet +na +naar +niet +niets +nog +nu +of +om +omdat +onder +ons +ook +op +over +reeds +te +tegen +toch +toen +tot +u +uit +uw +van +veel +voor +want +waren +was +wat +werd +wezen +wie +wil +worden +wordt +zal +ze +zelf +zich +zij +zijn +zo +zonder +zou diff --git a/apps/common/src/python/mediawords/languages/no/no_stop_words.txt b/apps/common/src/python/mediawords/languages/no/no_stop_words.txt index 2fd8a00993..5949a9c321 100644 --- a/apps/common/src/python/mediawords/languages/no/no_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/no/no_stop_words.txt @@ -1,13 +1,17 @@ -# -# This is a stop word list for the Norwegian language. 
-# # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-no/blob/master/stopwords-no.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) +å alle +andre +arbeid at av +både +båe bare begge ble @@ -15,9 +19,10 @@ blei bli blir blitt -både -båe +bort +bruke da +då de deg dei @@ -39,7 +44,6 @@ ditt du dykk dykkar -då eg ein eit @@ -47,15 +51,26 @@ eitt eller elles en +ene +eneste +enhver enn er et ett etter +få +folk for +før fordi +forsûke fra -før +fûr +gå +gjorde +gjûre +god ha hadde han @@ -84,11 +99,11 @@ hvorfor i ikke ikkje -ikkje ingen ingi inkje inn +innen inni ja jeg @@ -104,8 +119,15 @@ kvarhelst kven kvi kvifor +lage +lang +lik +like +må +makt man mange +måte me med medan @@ -113,13 +135,21 @@ meg meget mellom men +mens +mer +mest mi min mine mitt mot +mye mykje +nå +når +navn ned +nei no noe noen @@ -128,8 +158,7 @@ noko nokon nokor nokre -nå -når +ny og også om @@ -137,35 +166,47 @@ opp oss over på +part +punkt +så samme +sånn +sant seg selv si -si sia sidan siden sin sine +sist sitt sjøl skal skulle slik +slutt so som -som somme somt -så -sånn +start +stille +tid til +tilbake +tilstand um +under upp ut uten var +vår +være vart +vært varte ved vere @@ -173,11 +214,9 @@ verte vi vil ville +vite vore +vöre vors vort -vår -være -være -vært -å +vört \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/no/no_stop_words_old.txt b/apps/common/src/python/mediawords/languages/no/no_stop_words_old.txt new file mode 100644 index 0000000000..2fd8a00993 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/no/no_stop_words_old.txt @@ -0,0 +1,183 @@ +# +# This is a stop word list for the Norwegian language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +alle +at +av +bare +begge +ble +blei +bli +blir +blitt +både +båe +da +de +deg +dei +deim +deira +deires +dem +den +denne +der +dere +deres +det +dette +di +din +disse +ditt +du +dykk +dykkar +då +eg +ein +eit +eitt +eller +elles +en +enn +er +et +ett +etter +for +fordi +fra +før +ha +hadde +han +hans +har +hennar +henne +hennes +her +hjå +ho +hoe +honom +hoss +hossen +hun +hva +hvem +hver +hvilke +hvilken +hvis +hvor +hvordan +hvorfor +i +ikke +ikkje +ikkje +ingen +ingi +inkje +inn +inni +ja +jeg +kan +kom +korleis +korso +kun +kunne +kva +kvar +kvarhelst +kven +kvi +kvifor +man +mange +me +med +medan +meg +meget +mellom +men +mi +min +mine +mitt +mot +mykje +ned +no +noe +noen +noka +noko +nokon +nokor +nokre +nå +når +og +også +om +opp +oss +over +på +samme +seg +selv +si +si +sia +sidan +siden +sin +sine +sitt +sjøl +skal +skulle +slik +so +som +som +somme +somt +så +sånn +til +um +upp +ut +uten +var +vart +varte +ved +vere +verte +vi +vil +ville +vore +vors +vort +vår +være +være +vært +å diff --git a/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt b/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt index d49861eea5..964c0d13d1 100644 --- a/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt @@ -1,11 +1,11 @@ -# -# This is a "long" stop word list for the Portuguese language. 
-# +# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: -# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-pt/blob/master/stopwords-pt.txt a +à a meta abaixo abastecimento @@ -22,18 +22,17 @@ abrir abriu absoluta absolutamente -absurdo -abuso acaba acabam acabar acabaram acabou -academia +ação acaso aceita aceitar aceitou +acerca acertar acertou acesso @@ -43,8 +42,6 @@ achar achei acho achou -acidente -acidentes acima acompanha acompanhada @@ -60,144 +57,52 @@ acontecerá aconteceu acontecido acontecimentos -acordo acredita acreditam acreditar acredito acrescenta acrescentou -acumulado -acusado -acusados -acusação -acusações -adequada -adequado -adesão +adeus adianta adiante adiantou -administrador -administrar -administrativa -administrativo -administração -admite -admitiu -adolescente -adolescentes -adotar -adoção -adquirir -adultos -adversário -adversários -advogada -advogado -advogados -aeroporto afastado afastar afinal -afirma -afirmam afirmando afirmar afirmou -agenda -agente -agentes agir agora agosto agradecer -agressão -agricultores -agricultura -agrícola aguarda aguardar -agência -agências +águas ah +aí ainda -ajuda -ajudam -ajudar -ajudou ala alcançar alega alegou -alegre -alegria -alemão +alem +além alerta +algmas algo +alguém algum alguma algumas alguns -alguém ali -aliado -aliados -aliança -alimentar -alimentação -alimento -alimentos aliás -alma -almoço -alta -altas -alteração -alterações -alternativa -alternativas -alto -altos -altura -aluguel -aluno -alunos alves -alvinegro -alvo -além -ama amanhã -amarelo ambas -ambientais -ambiental -ambiente ambos -ameaça -ameaças -americano -americanos -amiga -amigo -amigos -amizade -amor -ampla -ampliar -ampliação -amplo -analisa -analisar -analistas -anda -andamento -andar -animais -animal -animação -aniversário ano anos ante @@ -212,151 +117,63 @@ anual 
anuncia anunciado anunciar -anunciou -análise anúncio +anunciou ao +aonde aos aparece aparecem aparecer apareceu aparecida -aparelho -aparelhos -apartamento -apelo apenas apesar +aplicação aplicada aplicado aplicar -aplicação apoia apoiar apoio aponta apontam +apontar apontou -aposentado -aposentadoria -aposentados -aposta -apreensão -aprender -aprendizado -apresenta -apresentada -apresentadas -apresentado -apresentados -apresentam -apresentar -apresentaram -apresentação -apresentações -apresentou -aprovada -aprovado -aprovados -aprovar -aprovação -aproveitar -aproveitou +apos +após aproximadamente apuração -após aquela aquelas aquele aqueles aqui aquilo -aquisição ar -areia -arena -argumento -argumentos -arma -armado -armas -arrecadação -arroz -arruda -art -arte -artes -artigo -artigos -artilheiro -artista -artistas +área +áreas as -asfalto -aspecto -aspectos -assaltantes -assalto -assassinato -assembleia -assessor -assessores -assessoria +às assim -assinado -assinar -assinatura -assinou -assistente -assistir -assistência -associados -associação -associações assume assumir assumiu assunto assuntos at -atacante -atacar -ataque -ataques +até atende atendendo atender atendidas -atendido -atendidos -atendimento -atendimentos -atento -atenção -atinge -atingido -atingir -atingiu -atitude -atitudes -ativa -atividade -atividades -atleta -atletas ato -ator -atores atos atrair +atrás atraso através -atração -atrações -atriz -atrás -atua +atuação atuais atual atualizado @@ -364,102 +181,27 @@ atualmente atuam atuando atuar -atuação atuou -até -auditório -audiência -aula -aulas aumenta aumentando aumentar aumento aumentou -ausência -automóveis -automóvel -autonomia -autor -autores -autoria -autoridade -autoridades -autorização -autos -auxiliar -auxílio -avalia -avaliar -avaliação avaliou -avançar -avanço -avanços -avançou -avenida -avisa -avião -avó -azul -ação -ações -aérea -aí -baiano -baile -bairro -bairros -baixa -baixo -baixos -balanço -bancada -banco -bancos -banda 
-bandas -bandeira -bandido -bandidos -banheiro -banho -bar -barato -barco -bares -barra -barreiras -barros -barulho base baseado bases +básica +básicas +básico basta bastante -bastidores -batalha -batalhão -bate -bater -bateria -bateu -beber -bebida -bebidas -bebê -beira -bela -beleza -belo bem beneficiar benefício benefícios bens bernardo -biblioteca -bicicleta bilhão bilhões bloco @@ -467,175 +209,28 @@ blocos blog boa boas -boca -bola -boletim -bolsa -bolsas -bolso bom -bomba -bombeiros -bonita -bonito bons -branca -branco -brancos -brasileiras -brasileiro -brasileiros -braço -braços -breve -briga -brilhante -brincadeira -brincar -brinquedos -bruto -buraco -buracos -busca -buscam -buscando -buscar -básica -básicas -básico +cá cabe cabeceou -cabelo -cabelos -cabeça -cabo -cachorro cada -cadastro -cadeia -cadeira -cadeiras cadê -cai -cair -caiu -caixa -caixas -calendário -calma -calor -calçada -cama -caminhada -caminho -caminhos -caminhão -caminhões -camisa -campanha -campanhas -campeonato -campeão -campeões -campo -campos -campus -cana -canal -candidata -candidato -candidatos -candidatura -candidaturas -canto -cantor -cantora -caos -capa -capacidade -capacitação -capaz -capazes -capitais -capital -capitão -capixaba -capítulo -cara -característica -características -caras -carga -cargo -cargos -carinho -carioca -carne -caro -carreira -carro -carros -carta -cartas -carteira -cartão -cartório -cartões -caruaru -caráter -casa -casado -casal -casamento -casar -casas -caso -casos -cassado -cassação -castelo catarinense categoria categorias -causa -causar -causas -causou -caíram -cd -cedo -celular -cem -cemitério -cena +cenário cenas centenas cento centrais central -centro -centros -cenário -cerca -cerimônia -certa -certamente -certas -certeza -certo -certos -cerveja -chama chamada chamado chamados chamar chamou -chance -chances -chapa -chave -chefe +chão chega chegada chegam @@ -645,204 +240,59 @@ chegar chegaram chegou cheguei -cheia -cheio -cheiro -cheque -choque 
-chute -chutou -chuva -chuvas -chão -ciclo -cidadania -cidade -cidades -cidadão -cidadãos -cientistas cima cinco -cinema -circo -circuito -circulação -cirurgia cita citado citar citou -civil -civis -ciência -ciências -classe -classes -classificação -cliente -clientes -clima -clique -clube -clubes -clássico -clínica -cobertura -cobra -cobrança -cobrar -cobrou -cofres coisa coisas colaboradores -colega -colegas -coleta -coletiva -coletivo -coleção -coligação coloca +colocação colocada colocado colocados colocando colocar -colocação colocou coloque -colorado -coluna -colunista -colégio com -comandante -comando -combate -combater -combustível -comecei -comemora -comemorar -comemoração -comemorou -comenta -comentar -comentou -comentário -comentários -comer -comerciais -comercial -comercialização -comerciante -comerciantes começa começam começando começar começaram +comecei começo começou -comida +comenta +comentar +comentário +comentários +comentou comigo -comissão -comitê como -companheiro -companheiros -companhia -companhias -comparação -competente -competição -competições -competência complementar -completa completamente -completar -completo -completou -complexo -complicado -compor -comportamento -composição -composta -composto -compra -comprar -compras -compreensão -compromisso -compromissos -comprou -computador -computadores -comum -comunicado -comunicação -comunidade -comunidades -comuns -comércio -conceito -conceitos -concentração -concessão -conclui -concluir -concluiu -conclusão -concorda concordo -concorrentes -concorrer -concorrência -concreto -concurso -concursos -condenado -condenação -condição -condições -conduta -confederação conferir -conferência confiança confira confirma confirmado confirmar confirmou -conflito -conflitos conforme -conforto -confronto -confusão conhece conhecem conhecer conheceu conhecida conhecido -conhecidos -conhecimento -conhecimentos conheço conjunto -conquista -conquistar -conquistas -conquistou -consciente -conscientização 
-consciência consegue conseguem consegui @@ -851,45 +301,21 @@ conseguimos conseguir conseguiram conseguiu -conselheiro -conselho -conselhos -consenso -conservação considera +consideração considerada considerado considerados considerando considerar -consideração considerou consigo consta -constante -constantes -constitucional -construir -construção -construída -construído -consulta -consultas -consultoria -consumidor -consumidores -consumo -consórcio -conta -contam -contando contar contará -contas contato contatos conter -contexto conteúdo continua continuam @@ -900,492 +326,139 @@ continuidade continuou contou contra -contrapartida -contratado -contratados -contratar -contratação -contratações -contrato -contratos -contribuinte -contribuir -contribuição -controlar -controle contrário contudo -convencer -convenção -conversa -conversar -conversas -convidado -convidados -convite -conviver -convivência -convênio -cooperativa -coordenador -coordenadora -coordenação -cor -coragem -coração -cores -coronel -corpo -corpos -corre -correndo -corrente -correr -correta -correto -correção -corrida -cortar -corte -cortes costas costuma costumam cotidiano -cozinha -credibilidade -creio -cresce -crescendo -crescente -crescer -cresceu -crescimento -cria -criada criado criando -crianças criar -criatividade -criação -crime -crimes -criminal -criminalidade -criminosos criou -crise -criticar -criticou -critério -critérios -cruzamento -cruzes -cruzou -crédito -créditos -crítica -críticas -crítico -cuidado -cuidados -cuidar cuja +cujas cujo -culpa -cultura -culturais -cultural -cumprimento -cumprir -cumpriu -cunha -currículo -curso -cursos -curto -custa -custo -custos -cá -cães -cérebro -céu -código -cúpula -da -dada -dado +cujos dados +daí +dança dando -danos dantas -dança daquela +daquelas daquele daqueles daqui -dar -daria -dará -das -data -dava -daí de -de deus -debaixo -debate -debates -decide -decidir -decidiu -decisão -decisões -declaração -declarações -declarou -decoração 
-decreto -dedicação -defende -defender -defendeu -defensor -defesa -deficiência -define -definida -definido -definir -definitivamente -definiu -definição -deixa deixado -deixam -deixando -deixar -deixaram -deixe -deixou -dela -delas -dele -delegacia -delegado -deles -demais -demanda -demandas -demissão -democrático -demonstra -demonstrou -demora -demorou dentre dentro -denunciar -denúncia -denúncias -departamento -depende dependendo depender -depoimento -depoimentos depois -deputada -deputado -deputados -der -deram -derrota -derrotado -desafio -desafios -descoberta -descobre -descobrir -descobriu -desconto -desculpas desde -deseja -desejar -desejo -desembargador -desempenho -desemprego -desenvolver -desenvolvido -desenvolvimento -desespero -desfile -despesas +desligado dessa dessas desse desses desta -destaca -destacar -destacou -destaque -destaques destas deste destes -destinado -destinados -destino -desvio -desviou -detalhe -detalhes -determina -determinado -determinação -determinou deu deve deve-se devem devemos dever -deveria -deveriam deverá deverão +deveria +deveriam devia devido dez +dezanove +dezasseis +dezassete dezembro dezenas +dezoito dia dia-a-dia -diagnóstico diante +diária diariamente +diárias +diário dias dica dicas -diferente -diferentes diferença diferenças +diferente +diferentes +difíceis +difícil dificilmente dificuldade dificuldades -difíceis -difícil diga digital dignidade digo diminuir -diminuição -dinheiro -diploma +direção +direita direito direta diretamente -direto -diretor -diretora -diretores -diretoria -diretório -direção diria -dirigente -dirigentes -dirigir disciplina -disco -discurso -discussão -discussões -discutir +dispõe +dispoem disponíveis disponível -disposição disposto -disputa -disputar -dispõe disse disseram disso -distante -distribuição -distribuídos distrito dito -diversas diversos -divisão -divulgada -divulgado -divulgados -divulgar -divulgação -divulgou diz dizem dizendo dizer dizia -diálogo -diária -diárias -diário do 
-doação -doações -doce -documentação -documento -documentos -doente -doença -doenças dois domingo -domínio -dona -dono -donos -dor -dores -dormir dos -dose -doutor +doze duas -duelo -dupla -dura -durante duração -duro +durante durou dutra -dvd -dão -década -décadas -déficit -dívida -dívidas -dólar -dólares -dúvida -dúvidas e -e-mail -economista -econômica -econômicas -econômico -econômicos -edital -edição -edições -educacional +é efeito efeitos -efetivamente -efetivo -eficiente -eficiência eis ela -elaboração elas ele -eleger -elegeu -eleita -eleito -eleitor -eleitorado -eleitorais -eleitores -eleitos -eleição -eleições -elementos -elenco eles -eletrônica -eletrônico -elevado -elevação -elite -elogios -elétrica em embora -emenda -emendas -emergência -emissora -emissoras -emissão -emocional -emoção -emoções -empate -empatou -empenho -empreendimento -empreendimentos -empregados -emprego -empregos -empresa -empresarial -empresas -empresário -empresários -empréstimo -empréstimos -encaminhado -encaminhados -encarar -encerramento -encerrou -enchentes -encontra -encontrada -encontrado -encontrados -encontram -encontrar -encontraram -encontro -encontros -encontrou -endereço -energia -enfatizou enfim -enfrenta -enfrentar -engenharia -engenheiro -enorme enquanto -ensinar -ensino entanto -entende -entender -entendeu -entendimento +entao +então entendo entidade entidades @@ -1395,117 +468,41 @@ entram entrar entraram entre -entrega -entregar -entregou -entregue -entregues -entretanto -entrevista -entrevistados -entrevistas -entrou -então -enviado -enviar -enviou -envolve -envolvendo -envolvido -envolvidos -envolvimento -episódio -equilíbrio -equipamento -equipamentos -equipe -equipes -equivalente era eram -errado -erro -erros -escala -escanteio -esclarecer -escola -escolar -escolas -escolha -escolher -escolheu -escolhido -esconder -escrever -escreveu -escrita -escrito -escritor -escritório -esforço -esforços -espanhol -espaço -espaços -especiais -especial -especialista 
-especialistas -especializada +éramos +és especialmente específica específico espera esperado esperamos -esperando esperança +esperando esperar esperava espero -espetáculo -espiritual -esportiva -esportivo -esposa -espécie -espécies -espírito -esquecer -esquema esquerdo esquina essa essas esse -essencial esses esta -estabelece -estabelecer -estabelecimento -estabelecimentos -estabilidade -estacionamento -estado -estados -estaduais -estadual +está estamos +estão estar -estaria -estariam estará estarão +estaria +estariam estas -estatal -estatuto -estatística -estatísticas +estás estava estavam -estação +estávamos este esteja estejam @@ -1514,176 +511,62 @@ estes esteve estilo estimativa -estimular estive estivemos estiver estivera estiveram +estivéramos estiverem estivermos estivesse estivessem -estivéramos estivéssemos +estiveste +estivestes estou -estrada -estradas -estrangeiros -estranho -estratégia -estratégias -estreia -estrela -estrelas -estrutura -estudar -estudo -estudos -está -estádio -estágio -estávamos -estão -etapa -etapas etc etc. 
eu -evento -eventos -eventual -evidente -evitar -evolução -ex-deputado -ex-governador -ex-prefeito -ex-presidente -exame -exames exatamente -excelente -excelência -excesso -exceção exclusivamente -executiva -executivo -execução -exemplo -exemplos -exercer -exercício -exercícios -exige -exigir -exigência -exigências -existe -existem -existentes -existia -existir -existência -expandir -expansão -expectativa -expectativas -experiência -experiências explica -explicar explicação explicações +explicar explicou -exploração -exportações -exposição -expressão -expulso -extensão -exterior -externa extra extremamente -exército +façam face -facilidade +fácil facilitar facilmente -faculdade +faço faixa -faixas -fala -falam -falando -falar falei -falha -falhas falou -falta -faltam -faltando -faltou -fama -familiar -familiares -famoso -famílias -faria -farmácia -farroupilha fará -farão -fase -fato -fator -fatores -fatos -faturamento -favor -favorável +faria faz +fazeis fazem fazemos fazenda fazendo fazer +fazes fazia -faça -façam -faço -febre fechada fechado fechados fechamento fechar fechou -federais -federal -federação -feira feita feitas feito feitos -felicidade -feliz -felizes -feminina -feminino -fenômeno -feriado -feridos -ferramenta -ferramentas -ferro -festa -festas -festival fevereiro fez fica @@ -1691,852 +574,143 @@ ficam ficamos ficando ficar -ficaram -ficaria ficará +ficaram ficarão +ficaria ficava -ficha fico ficou -fiel -figura -fila -filha -filhas -filho -filhos -filme -filmes -filosofia -fim -finais final -finalidade finalizou finalmente -financeira -financeiras -financeiro -financeiros -financiamento -finanças fins fique fiquei -firme -fiscais -fiscal -fiscalizar -fiscalização fiz fizemos fizeram -fiéis -flagrante -flor -flores -floresta -fluxo foco -fogo foi -folha fomos -fonte -fontes for -fora foram +fôramos +forças forem -forma -formada -formado -formar -formas -formato -formação formos -forró -fortalecer -forte -força -forças fosse fossem -foto -fotos 
-fraco -francês -frase -fraude -freitas -frente -frio -frisou -fronteira -frota -frutas -fruto -frutos -fuga -fugir -fugiu -fui -funciona -funcionamento -funcionando -funcionar -funcionário -funcionários -fundamentais -fundamental -fundação -fundo -fundos -função -funções -furto -futebol -futsal -futuro -futuros -fábrica -fácil -fãs -fé -férias -física -físicas -físico -fórmula -fórum -fôramos fôssemos -gabinete -gado -galeria -ganha -ganham -ganhando -ganhar -ganho -ganhos -ganhou -garante -garantia -garantir -garantiu -garota -garoto -gastar -gasto -gastos -gaúcha -gaúchos -general -gente -geografia -gera -gerais -geral -geralmente -gerando -gerar -geração -gerente -gerou -gestor -gestores -gestão -ginásio -global -gol -goleiro -golpe -gols -gosta -gostam +foste +fostes +fui gostaria -gostei -gosto -gostou -governador -governadora -governadores -governantes -governar -governo -governos -gramado -grande -grandes -gratuita -gratuito -grau -grave -graves -graças -grossa -grosso -grupo -grupos -grãos -guarda -guia -gás -gênero -habitantes -habitação +ha +há haja hajam hajamos +hão harmonia havemos haver -haveria haverá +haveria havia haviam -hectares -hei -helena -hipótese -história -histórias -histórica -histórico hoje -homem -homenagem -homens -homicídio -homicídios -honra hora -horas -horizonte -horário -horários -hospital -hotel -hotéis houve houvemos houver houvera +houverá houveram +houvéramos +houverão houverei houverem houveremos houveria houveriam -houvermos -houverá -houverão houveríamos +houvermos houvesse houvessem -houvéramos houvéssemos -hugo -humana -humanidade -humano -humanos -humor -há -hábito -hão ia ibope ida -idade -ideal -identidade -identificado -identificar -identificação -idosos -idéia -idéias -iguais -igual -igualdade -ilegal -ilha -iluminação -imagem -imagens -imaginar -imediata -imediatamente -imediato -impacto -impede -impedir -implantar -implantação -impor -importa -importante -importantes -impossível -imposto -impostos -imprensa 
-impressão -imóveis -imóvel -inauguração -incentivar -incentivo -inclui -incluindo -inclusive -inclusão -incrível -incêndio -indenização -independente -independentemente -independência -indica -indicado -indicar -indicação -individuais -individual -indivíduo -indivíduos indo -industrial -indícios -indígena -indígenas -indústria -indústrias -infantil -infelizmente -inferior -influência -informa -informado -informar -informação -informações -informou -informática -infra-estrutura -infraestrutura -inglês -ingresso -ingressos -inicia -iniciada -inicial -inicialmente -iniciar -iniciativa -iniciativas -iniciou -inquérito -inscritos -inscrição -inscrições -instalada -instalar -instalação -instalações -institucional -instituição -instituições -instituto -instrumento -instrumentos -integra -integral -integrante -integrantes -integrar -integração -inteira -inteiro -inteligente -inteligência -intensa -intenso -intenção -intenções -inter -interessa -interessados -interessante -interesse -interesses -interior -interna -internacionais -internacional -internado -internet -interno -interpretação -intervalo -intervenção -intuito -invadiu -inverno -investidores -investigar -investigação -investigações -investimento -investimentos -investir invés -início -inúmeras -inúmeros ir -iria -irmã -irmão -irmãos -irregular -irregularidades irá irão +iria isso +ista +iste isto -italiano item itens +já jamais janeiro -janela -jantar -jardim -jc -jeito -joga -jogada -jogadas -jogador -jogadores -jogando -jogar -jogo -jogos -jogou -jornada -jornais -jornal -jornalismo -jornalistas -judicial -judiciário -juiz -julgamento -julgar julho junho -juntamente -junto -juntos -juros -jurídica -jurídico -justa -justamente -justifica -justificar -justificativa -justiça -justo -juventude -juíza -juízes -juízo -já km -laboratório -lado -lados -ladrões -lago -lamentável -lance -lança -lançado -lançamento -lançar -lançou -lar -larga -lateral -latina -lazer -leal -legais -legal -legenda -legislativa -legislativo 
-legislação -lei -leia -leilão -leis -leite -leitor -leitores -leitura -lembra -lembrando -lembrar -lembro -lembrou -ler -leste -lesão -letra -letras -leva -levada -levado -levados -levam -levando -levantamento -levantar levar levaram -leve levou lhe lhes -li -liberado -liberação -liberdade -licença -licitação -lidar -liderança -lideranças -liga -ligada -ligadas -ligado -ligados -ligar -ligação -ligações -liminar -limite -limites -limpa -limpeza -linda -linguagem -linha -linhas -lista -literatura -litoral -litros -livre -livres -livro -livros -lixo -lição -locais -local -localidade -localizada -localizado logo -loja -lojas -longa -longe -longo -lua -lucro -lucros -lugar -lugares -luta -lutar -luxo -luz -lá -lê -líder -líderes -língua -líquido -lógica -madeira -madrugada maio -maior -maiores -maioria -mais -mal -manda -mandado -mandar -mandato -mandatos -mandou -maneira -manhã -manifestação -mano -manter -manteve -mantido -mantém -manutenção -mar -marca -marcada -marcado -marcador -marcar -marcas -marcação -marcou -margem -margens -marido -marinho -marketing -março mas -masculino -massa -mata -matar -matemática -materiais -material -mato -matou -matriz -matéria -matérias -mau -mauro -maus me -medalha mediante -medida -medidas -medo -meia meio -meio-campo -meios -melhor -melhora -melhorar -melhores -melhoria -melhorias -melhorou -membro -membros -memória -menina -meninas -menino -meninos -menor -menores menos -mensagem -mensagens -mensais -mensal -mental -mente -mentira -mercado -mercadorias -mercados -merece -merecem -mesa +mês meses mesma mesmas mesmo mesmos -mestre -meta -metade -metas -metropolitana metros meu meus mil -milhares -milho milhão -milhões -militar -militares +milhares mim -mineiro -minha minhas -ministra -ministros -minuto -minutos -mirim -missão -mistura -mobilização -moda -modalidade -modelo -modelos -moderna -moderno modo -moeda momento momentos -montagem montante -montar -monte -mora -moradia -morador -moradora -moradores -morais -moral -moram 
-morar -morava -moro -morre -morrer -morreram -morreu -morro -morte -mortes -morto -mortos mostra mostram mostrando mostrar mostrou -motivo -motivos -moto -motor -motoristas -motos -movimentação -movimento -movimentos -moça -muda -mudança -mudanças -mudar -mudou muita muitas muito muitos -mulher -mulheres -multa -multas -mundial -mundo -municipais -municipal município municípios -muro -museu -musical -má -máquina -máquinas -máxima -máximo -mãe -mães -mão -mãos -média -médica -médio -mérito -mês -mídia -mínima -mínimo -mínimos -móveis -móvel -música -músicas -músicos na -nacionais -nacional -nada -namorada -namorado +nao +não naquela +naquelas naquele +naqueles nas nasceu nascido -nascimento -naturais -natural -naturalmente -natureza -nação -nações -necessidade -necessidades -necessita -necessária -necessárias -necessário -necessários -nega -negar -negativa -negativo -negociar -negociação -negociações -negou -negra -negro -negros -negócio -negócios nela nele nem @@ -2547,259 +721,83 @@ nessas nesse nesses nesta +nestas neste nestes -neto -news ninguém nisso no -nobre -noite -noites nome nomes -norma -normal -normalmente -normas norte nos +nós nossa nossas nosso nossos -nota -notas -notícia -notícias -nova novamente -novas nove novembro -novidade -novidades -novo -novos num numa -nunca -não -né -níveis -nível -nós -núcleo -número -números +numas +nuns o -objetivo -objetivos -objeto -objetos -obra -obras -obrigado -obrigados -obrigação -observa -observar -observou obter obteve ocasião ocorre ocorrem +ocorrência +ocorrências ocorrer ocorreram ocorreu ocorrido -ocorrência -ocorrências -ocupa -ocupar -ocupação -oeste -oferece -oferecem -oferecer -oferecido -oferecidos -oferta -oficiais -oficial -oficialmente -oficina -oficinas -ofício +oitavo oito -olha -olhando -olhar -olho -olhos -oliveira -olímpico onda onde -ong -online ontem -operação -operações -opinião -opiniões -oportunidade -oportunidades -optar -opção -opções +onze ora -ordem -organismo -organizada -organizado 
-organizar -organização -organizações -orientação -origem -original -orçamento os ou -ouro outra outras outro outros outubro -ouvi -ouvido -ouvidos -ouvir -ouviu -paciente -pacientes -paciência -pacote -padrão -padrões -paga -pagam -pagamento -pagamentos -pagando -pagar -pago -pagos -pagou -pai -paixão -palanque -palavra -palavras -palco -palestra -palestras -palácio -papai -papel -papéis -par para -parabéns -parada -parado -paralisação -paranaense -parar -parceiro -parceiros -parcela -parceria -parcerias parece parecem parecer parecia -paredes -parentes -parlamentar -parlamentares -parlamento -parou -parque parte partes -participa -participam -participantes -participar -participaram -participação -participou -particular -particulares -partida -partidas -partido -partidos -partidária -partir partiu passa passada -passado -passageiros -passagem -passagens passam passando passar -passaram passará +passaram passava passe passei -passeio -passo -passo fundo -passos -passou -pasta -patamar -patrimônio -pau -paula -paulistas -pauta -pavimentação -paz -país -países +pé +peça +peças pede -pedido -pedidos pedindo pedir -pediu -pedra -pedras -pega pegar pegou -peito -peixe -peixes pela pelas -pele -pelo pelos -pena -penal pensa pensam pensamento @@ -2809,360 +807,68 @@ pensar pensei penso pensou -pensão -pequena -pequenas -pequeno -pequenos -perante -percebe -perceber -percebeu -percentual -percurso -perda -perdas -perde -perdendo -perder -perderam -perdeu -perdido -perfeito -perfil -pergunta -perguntar -perguntas perguntou -perigo -perigoso permanece permanecer -permaneceu -permanente -permanência -permite -permitido -permitir -permitiu -perna -pernas -personagem -personagens -personalidade -perspectiva -pertence -perto -período -períodos -pesado -pesca -peso -pesquisa -pesquisadores -pesquisas -pessoa -pessoais -pessoal -pessoalmente -pessoas -peça -peças -piloto -pilotos -pintura -pior -piores -piso -pista -placa -placas -planejamento -planeta -plano -planos -planta 
-plantas -plantio -plantão -plateia -pleito -plena pleno -plenário -plástico -pneus -pobre -pobres -pobreza pode +pôde pode-se podem podemos podendo poder -poderes -poderia -poderiam poderá poderão +poderia +poderiam podia -poeta +põe +põem pois -policiais -policial -politicamente -polêmica -políticas -político -políticos -ponta -ponte -ponto -pontos -popular -populares -população por -porque -porta -portal -portanto -portas -porte -porto -português porém -posicionamento -positiva -positivo -positivos -posição -posições +porque +porquê possa possam -posse -possibilidade -possibilidades +possíveis +possível +possivelmente posso possuem possui -possíveis -possível -posteriormente -posto -postos -postura -potencial pouca poucas -pouco poucos -povo -povos pps pq pra -praia -praias -prata praticamente -praticar -prato -pratos -prazer -prazo -prazos -praça -praças -precisa -precisam -precisamos -precisar -precisava -preciso -precisou -preconceito -preencher -prefeita -prefeitos -prefeituras -prefere -preferiu -preferência -prejudicar -prejuízo -prejuízos -premiação -preocupa -preocupado -preocupar -preocupação -prepara -preparado -preparados -preparar -preparação -presa -presente -presentes -presença -preservar -preservação -presidencial -presidente -presidentes -presidência -preso -presos -pressão -prestar -prestação -presídio -preta -pretende -preto -prevenção -previdência -prevista -previstas -previsto -previstos -previsão -prevê -preço -preços -primavera primeira primeiras primeiro primeiros -principais -principal -principalmente -princípio -princípios -prioridade -prioridades -prisão -privada -privado -pro -problema -problemas -procedimento -procedimentos -processo -processos -procura -procurado -procurador -procuram -procurando -procurar -procure -procurou -produtividade -produto -produtor -produtores -produtos -produz -produzido -produzir -produção -professor -professora -profissionais -profissional -profissão -profunda -programa -programas -programação -progresso 
-proibido -projeto -projetos -prol -promessa -promessas -promete -prometeu -promotor -promove -promover -promovido -promoção pronta pronto -propaganda -proposta -propostas -propriedade -propriedades -proprietário -proprietários -propósito -propõe -proteger -protesto -proteção -prova -provar -provas -provavelmente -providências -provisória -provocar -provocou -provável -proximidades -prudente -prática -práticas -pré-candidato -prédio -prédios -prévia -prêmio -prêmios +propios própria próprias +proprio próprio próprios +provável +provavelmente próxima próximas +proximidades próximo próximos -publicada -publicado -publicação -publicidade +puderam pudesse -punição -pura -página -páginas -pátio -pão -pé -pés -pênalti -pólo -pública -públicas -público -públicos -quadra -quadrados -quadrilha -quadro -quadros quais +quáis qual -qualidade -qualificação qualquer quando quantas @@ -3172,279 +878,56 @@ quanto quantos quarta quarta-feira -quarto quase quatro que -quebra -quebrar -queda -queira +quê quem -quente -quer -querem -queremos -querendo -querer -queria -queriam querido quero -questionado -questão -questões -quilos +quieto quilômetros +quilos quinta quinta-feira quinto -quis -quiser -rainha -ramo -ranking -rapaz -rapidamente -razão -razões +quinze reais -reajuste real -realidade -realiza -realizada -realizadas -realizado -realizados -realizando -realizar -realização realizou realmente -reação -rebaixamento -recado -recebe -recebem -recebendo -receber -receberam -receberá -recebeu -recebi -recebido -receita -receitas recente recentemente recentes -reclama -reclamar -reclamação -reclamações -reclamou -reconhece -reconhecer -reconhecido -reconhecimento -recorde -recorrer -recuperar -recuperação -recurso -recursos -redação -rede -redes redonda redor -reduzir -redução -reeleito -reeleição -refere -referente -referência -reflete -refletir -reflexão -reforma -reformas -reforçar -reforço -regime -regionais -regional -registrada -registrado -registrados -registrar -registro 
-registros -registrou -região -regiões -regra -regras -regular -rei -reino -reivindicações relacionados -relacionamento -relacionamentos -relata -relator -relatou -relatório -relação -relações -religioso -remuneração -remédio -remédios -renda -rendimento -renovação -repasse -repente -repercussão -repetir -reportagem -representa -representam -representante -representantes -representar -representação -repórter -república -reserva -reservas -resgate -residência -residências -resistência -resolução -resolve -resolver -resolveu resolvido -respectivamente -respeitar -respeito -responde -responder -respondeu -responsabilidade -responsáveis -responsável resposta respostas ressalta ressaltar -ressaltou -resta -restante -restaurante -restaurantes resto resultado resultados -retirada -retirar retornar -retorno -reunir -reuniu -reunião -reuniões -revela -revelou -rever -reverter -revista -revistas -revisão -revolução -reúne -rica -rico -ricos -rio -rio de janeiro -rios -riqueza -risco -riscos -ritmo -rival -rock -rodada -rodadas -rodovia -rodovias -rodoviária -romance -rosto -roteiro -rotina -roubo -roupa -roupas -rua -ruas -rubro-negro -ruim -rumo -rurais -rural -rádio -rápida -rápido +sábado +sábados sabe -sabedoria sabem sabemos sabendo saber sabia -saco saem sai saia saiba saindo sair -saiu -sala -salarial -salas -saldo -salto -salvar -salário -salários -salão -saneamento -sangue -santista -satisfação -satisfeito -saudade -saudável -saída saíram +são se -secretaria -secretarias -secretário -secretários -sede -segmento -segmentos -segredo segue seguem seguida @@ -3458,230 +941,131 @@ segunda segunda-feira segundo segundos -segura -segurança -segurar seguro sei seis seja sejam sejamos -seleção sem semana semanas semelhante semelhantes -semestre -seminário sempre -senado -senador -senadora -senadores +senão sendo senhor senhora senhores -sensação senso sente -sentença sentido sentimento sentimentos sentindo sentir sentiu -senão sequer -sequência ser +será +serão serei serem 
seremos seres seria seriam -serve -servidor -servidores -servir -serviu -serviço -serviços -será -serão seríamos +série +sério +serve sessão sessões sete setembro +sétima +sétimo setor setores seu seus -sexo sexta sexta-feira -sexual -shopping -show -shows +sexto si sido -sigilo -sigla -significa -significado -silêncio sim simples simplesmente -sinais -sinal sinto -sintomas -sistema -sistemas site sites -situação -situações +sítio +só sob sobe sobra sobre sobretudo -sobrinho -sociais -social -socorro -sofre -sofrem -sofrendo -sofrer -sofreu -sofrimento -sol -soldados -solenidade -solicitação -solicitou -solidariedade +sois solo -solução -soluções -som -soma -sombra somente somos -sonho -sonhos -sono -sorriso -sorte -sorteio sou soube sousa sozinha -sozinho sua suas subir subiu -substituir -substituição -sucesso -sucessão sudeste suficiente suficientes sugere -sugestão -sugestões sujeito sul -super -superar -superintendente -superior -superiores -supermercado -superou -suplente -suporte -suposto -supremo -surge -surgiu -surpresa -suspeita -suspeito -suspeitos -suspensão -sábado -sábados -são -século -série -sério -sítio -só -sócios -tabela +tá tais tal -talento talvez -tamanho +tambem também tanta tantas tanto tantos -taques -tarde -tarefa -tarifa -taxa -taxas -taça +tão te -teatro -tecnologia -tecnologias -tela -telefone -telefones -televisão tem -tema -temas +tém +têm temos -temperatura -tempo temporada -tempos tende -tendo tendência +tendes +tendo tenha tenham tenhamos tenho +tens tenta tentam tentando tentar tentaram tentativa +tente +tentei tentou -teoria ter +terá +terão +terça +terça-feira terceira terceiro terei @@ -3689,92 +1073,52 @@ terem teremos teria teriam -termina -terminal -terminar -terminou -termo -termos -terra -terras -terreno -terrenos -território -terá -terão -terça -terça-feira teríamos -tese -tesouro -teste -testemunhas -testes -teto teu teus teve -texto -textos the ti tido -time times tinha tinham -tio +tínhamos tipo tipos -tira -tirar -tiro 
-tiros tirou -titular -titulares tive tivemos tiver tivera tiveram +tivéramos tiverem tivermos tivesse tivessem -tivéramos tivéssemos +tiveste +tivestes tocar -tocou toda todas todo todos -tom toma -tomada -tomadas tomando tomar tomou toneladas toque -torcedor -torcedores -torcida torna tornando tornar -torneio -torno tornou tornou-se -torres total totalmente trabalha -trabalhador -trabalhadores trabalham trabalhando trabalhar @@ -3782,281 +1126,101 @@ trabalhava trabalho trabalhos trabalhou -tradicionais -tradicional -tradição -traficantes -tragédia -trajetória -tranquilidade -transferência -transformar -transformação -transformou -transição -transmissão -transparência -transporte -transportes -trata +trás trata-se -tratado -tratamento -tratar -trave -travessão traz trazendo trazer -trecho -trechos -treinador -treinamento -treino -trem -tribuna -tribunal -tributária +três +treze trimestre trinta trio -triste -tristeza -troca -trocar -troféu trouxe -tráfego -trás -três tu tua tuas tudo -turismo -turistas -turma -tv -twitter -tá -tão -técnica -técnicas -técnico -técnicos -tém -término -tênis -tínhamos -título -títulos +última +últimas +último +últimos um uma umas -unidade +única unidades -unidos -unir -universidade -universidades -universitário -universo -união uns -urbana -urbano -urgência -urnas +usa usada usado usados usam usando usar -usina -usinas uso usou usuário usuários +útil +utilização utilizada utilizado utilizados utilizar -utilização -vacinação -vaga -vagas +vá vai +vais vale -valer -valor -valores -valorizar -valorização vamos -vantagem -vantagens -vara -variação +vão +várias +vários vc vcs +vê veio -vejo velha velho velhos -velocidade vem +vêm vemos -vence -vencedor -vencer -venceu -venda -vendas -vender -vendidos vendo venha -vento +vens ver vera -verba -verbas -verdade -verdadeira -verdadeiro -verdadeiros -verde -vereador -vereadora -vereadores -vergonha -verificar -vermelha -vermelho -versão -verão vez vezes -veículo -veículos vi via -viagem 
-viagens -viajar vias -vice -vice-governador -vice-prefeito -vice-presidente -vida -vidas vieram -vigor -vila vinda vindo vinha -vinho vinte -violência vir vira virada viram virar virou -virtude -visa -visando -visita -visitantes -visitar -visitas -vista -visto -visual -visão -vitória -vitórias -viu -viva -vive -vivem -vivemos -vivendo -viver -viveu -vivo -vizinho -vizinhos você vocês -volante volta voltada voltado -voltam -voltando voltar voltaram voltou -volume -voluntários vontade -voos vos -votado -votar -votação -voto -votos -votou +vós +vossa +vossas +vosso +vossos vou -voz -vá -várias -vários -várzea -vão -véspera -vê -vídeo -vídeos -vítima -vítimas -vôo -zagueiro -zero -zona -à -às -água -águas -árbitro -área -áreas -árvore -árvores -época -éramos -êxito -índia -índice -índices -óleo -órgão -órgãos -ótima -ótimo -ônibus -última -últimas -último -últimos -única -único -útil +zero \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/pt/pt_stop_words_old.txt b/apps/common/src/python/mediawords/languages/pt/pt_stop_words_old.txt new file mode 100644 index 0000000000..d49861eea5 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/pt/pt_stop_words_old.txt @@ -0,0 +1,4062 @@ +# +# This is a "long" stop word list for the Portuguese language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +a meta +abaixo +abastecimento +aberta +abertas +aberto +abertos +abertura +abraço +abre +abreu +abril +abrir +abriu +absoluta +absolutamente +absurdo +abuso +acaba +acabam +acabar +acabaram +acabou +academia +acaso +aceita +aceitar +aceitou +acertar +acertou +acesso +acha +acham +achar +achei +acho +achou +acidente +acidentes +acima +acompanha +acompanhada +acompanhado +acompanhamento +acompanhar +acompanhou +acontece +acontecem +acontecendo +acontecer +acontecerá +aconteceu +acontecido +acontecimentos +acordo +acredita +acreditam +acreditar +acredito +acrescenta +acrescentou +acumulado +acusado +acusados +acusação +acusações +adequada +adequado +adesão +adianta +adiante +adiantou +administrador +administrar +administrativa +administrativo +administração +admite +admitiu +adolescente +adolescentes +adotar +adoção +adquirir +adultos +adversário +adversários +advogada +advogado +advogados +aeroporto +afastado +afastar +afinal +afirma +afirmam +afirmando +afirmar +afirmou +agenda +agente +agentes +agir +agora +agosto +agradecer +agressão +agricultores +agricultura +agrícola +aguarda +aguardar +agência +agências +ah +ainda +ajuda +ajudam +ajudar +ajudou +ala +alcançar +alega +alegou +alegre +alegria +alemão +alerta +algo +algum +alguma +algumas +alguns +alguém +ali +aliado +aliados +aliança +alimentar +alimentação +alimento +alimentos +aliás +alma +almoço +alta +altas +alteração +alterações +alternativa +alternativas +alto +altos +altura +aluguel +aluno +alunos +alves +alvinegro +alvo +além +ama +amanhã +amarelo +ambas +ambientais +ambiental +ambiente +ambos +ameaça +ameaças +americano +americanos +amiga +amigo +amigos +amizade +amor +ampla +ampliar +ampliação +amplo +analisa +analisar +analistas +anda +andamento +andar +animais +animal +animação +aniversário +ano +anos +ante +anterior +anteriores +anteriormente +antes +antiga +antigo +antigos +anual +anuncia +anunciado +anunciar 
+anunciou +análise +anúncio +ao +aos +aparece +aparecem +aparecer +apareceu +aparecida +aparelho +aparelhos +apartamento +apelo +apenas +apesar +aplicada +aplicado +aplicar +aplicação +apoia +apoiar +apoio +aponta +apontam +apontou +aposentado +aposentadoria +aposentados +aposta +apreensão +aprender +aprendizado +apresenta +apresentada +apresentadas +apresentado +apresentados +apresentam +apresentar +apresentaram +apresentação +apresentações +apresentou +aprovada +aprovado +aprovados +aprovar +aprovação +aproveitar +aproveitou +aproximadamente +apuração +após +aquela +aquelas +aquele +aqueles +aqui +aquilo +aquisição +ar +areia +arena +argumento +argumentos +arma +armado +armas +arrecadação +arroz +arruda +art +arte +artes +artigo +artigos +artilheiro +artista +artistas +as +asfalto +aspecto +aspectos +assaltantes +assalto +assassinato +assembleia +assessor +assessores +assessoria +assim +assinado +assinar +assinatura +assinou +assistente +assistir +assistência +associados +associação +associações +assume +assumir +assumiu +assunto +assuntos +at +atacante +atacar +ataque +ataques +atende +atendendo +atender +atendidas +atendido +atendidos +atendimento +atendimentos +atento +atenção +atinge +atingido +atingir +atingiu +atitude +atitudes +ativa +atividade +atividades +atleta +atletas +ato +ator +atores +atos +atrair +atraso +através +atração +atrações +atriz +atrás +atua +atuais +atual +atualizado +atualmente +atuam +atuando +atuar +atuação +atuou +até +auditório +audiência +aula +aulas +aumenta +aumentando +aumentar +aumento +aumentou +ausência +automóveis +automóvel +autonomia +autor +autores +autoria +autoridade +autoridades +autorização +autos +auxiliar +auxílio +avalia +avaliar +avaliação +avaliou +avançar +avanço +avanços +avançou +avenida +avisa +avião +avó +azul +ação +ações +aérea +aí +baiano +baile +bairro +bairros +baixa +baixo +baixos +balanço +bancada +banco +bancos +banda +bandas +bandeira +bandido +bandidos +banheiro +banho +bar +barato +barco +bares 
+barra +barreiras +barros +barulho +base +baseado +bases +basta +bastante +bastidores +batalha +batalhão +bate +bater +bateria +bateu +beber +bebida +bebidas +bebê +beira +bela +beleza +belo +bem +beneficiar +benefício +benefícios +bens +bernardo +biblioteca +bicicleta +bilhão +bilhões +bloco +blocos +blog +boa +boas +boca +bola +boletim +bolsa +bolsas +bolso +bom +bomba +bombeiros +bonita +bonito +bons +branca +branco +brancos +brasileiras +brasileiro +brasileiros +braço +braços +breve +briga +brilhante +brincadeira +brincar +brinquedos +bruto +buraco +buracos +busca +buscam +buscando +buscar +básica +básicas +básico +cabe +cabeceou +cabelo +cabelos +cabeça +cabo +cachorro +cada +cadastro +cadeia +cadeira +cadeiras +cadê +cai +cair +caiu +caixa +caixas +calendário +calma +calor +calçada +cama +caminhada +caminho +caminhos +caminhão +caminhões +camisa +campanha +campanhas +campeonato +campeão +campeões +campo +campos +campus +cana +canal +candidata +candidato +candidatos +candidatura +candidaturas +canto +cantor +cantora +caos +capa +capacidade +capacitação +capaz +capazes +capitais +capital +capitão +capixaba +capítulo +cara +característica +características +caras +carga +cargo +cargos +carinho +carioca +carne +caro +carreira +carro +carros +carta +cartas +carteira +cartão +cartório +cartões +caruaru +caráter +casa +casado +casal +casamento +casar +casas +caso +casos +cassado +cassação +castelo +catarinense +categoria +categorias +causa +causar +causas +causou +caíram +cd +cedo +celular +cem +cemitério +cena +cenas +centenas +cento +centrais +central +centro +centros +cenário +cerca +cerimônia +certa +certamente +certas +certeza +certo +certos +cerveja +chama +chamada +chamado +chamados +chamar +chamou +chance +chances +chapa +chave +chefe +chega +chegada +chegam +chegamos +chegando +chegar +chegaram +chegou +cheguei +cheia +cheio +cheiro +cheque +choque +chute +chutou +chuva +chuvas +chão +ciclo +cidadania +cidade +cidades +cidadão +cidadãos +cientistas +cima 
+cinco +cinema +circo +circuito +circulação +cirurgia +cita +citado +citar +citou +civil +civis +ciência +ciências +classe +classes +classificação +cliente +clientes +clima +clique +clube +clubes +clássico +clínica +cobertura +cobra +cobrança +cobrar +cobrou +cofres +coisa +coisas +colaboradores +colega +colegas +coleta +coletiva +coletivo +coleção +coligação +coloca +colocada +colocado +colocados +colocando +colocar +colocação +colocou +coloque +colorado +coluna +colunista +colégio +com +comandante +comando +combate +combater +combustível +comecei +comemora +comemorar +comemoração +comemorou +comenta +comentar +comentou +comentário +comentários +comer +comerciais +comercial +comercialização +comerciante +comerciantes +começa +começam +começando +começar +começaram +começo +começou +comida +comigo +comissão +comitê +como +companheiro +companheiros +companhia +companhias +comparação +competente +competição +competições +competência +complementar +completa +completamente +completar +completo +completou +complexo +complicado +compor +comportamento +composição +composta +composto +compra +comprar +compras +compreensão +compromisso +compromissos +comprou +computador +computadores +comum +comunicado +comunicação +comunidade +comunidades +comuns +comércio +conceito +conceitos +concentração +concessão +conclui +concluir +concluiu +conclusão +concorda +concordo +concorrentes +concorrer +concorrência +concreto +concurso +concursos +condenado +condenação +condição +condições +conduta +confederação +conferir +conferência +confiança +confira +confirma +confirmado +confirmar +confirmou +conflito +conflitos +conforme +conforto +confronto +confusão +conhece +conhecem +conhecer +conheceu +conhecida +conhecido +conhecidos +conhecimento +conhecimentos +conheço +conjunto +conquista +conquistar +conquistas +conquistou +consciente +conscientização +consciência +consegue +conseguem +consegui +conseguia +conseguimos +conseguir +conseguiram +conseguiu +conselheiro +conselho +conselhos 
+consenso +conservação +considera +considerada +considerado +considerados +considerando +considerar +consideração +considerou +consigo +consta +constante +constantes +constitucional +construir +construção +construída +construído +consulta +consultas +consultoria +consumidor +consumidores +consumo +consórcio +conta +contam +contando +contar +contará +contas +contato +contatos +conter +contexto +conteúdo +continua +continuam +continuar +continuará +continue +continuidade +continuou +contou +contra +contrapartida +contratado +contratados +contratar +contratação +contratações +contrato +contratos +contribuinte +contribuir +contribuição +controlar +controle +contrário +contudo +convencer +convenção +conversa +conversar +conversas +convidado +convidados +convite +conviver +convivência +convênio +cooperativa +coordenador +coordenadora +coordenação +cor +coragem +coração +cores +coronel +corpo +corpos +corre +correndo +corrente +correr +correta +correto +correção +corrida +cortar +corte +cortes +costas +costuma +costumam +cotidiano +cozinha +credibilidade +creio +cresce +crescendo +crescente +crescer +cresceu +crescimento +cria +criada +criado +criando +crianças +criar +criatividade +criação +crime +crimes +criminal +criminalidade +criminosos +criou +crise +criticar +criticou +critério +critérios +cruzamento +cruzes +cruzou +crédito +créditos +crítica +críticas +crítico +cuidado +cuidados +cuidar +cuja +cujo +culpa +cultura +culturais +cultural +cumprimento +cumprir +cumpriu +cunha +currículo +curso +cursos +curto +custa +custo +custos +cá +cães +cérebro +céu +código +cúpula +da +dada +dado +dados +dando +danos +dantas +dança +daquela +daquele +daqueles +daqui +dar +daria +dará +das +data +dava +daí +de +de deus +debaixo +debate +debates +decide +decidir +decidiu +decisão +decisões +declaração +declarações +declarou +decoração +decreto +dedicação +defende +defender +defendeu +defensor +defesa +deficiência +define +definida +definido +definir +definitivamente +definiu 
+definição +deixa +deixado +deixam +deixando +deixar +deixaram +deixe +deixou +dela +delas +dele +delegacia +delegado +deles +demais +demanda +demandas +demissão +democrático +demonstra +demonstrou +demora +demorou +dentre +dentro +denunciar +denúncia +denúncias +departamento +depende +dependendo +depender +depoimento +depoimentos +depois +deputada +deputado +deputados +der +deram +derrota +derrotado +desafio +desafios +descoberta +descobre +descobrir +descobriu +desconto +desculpas +desde +deseja +desejar +desejo +desembargador +desempenho +desemprego +desenvolver +desenvolvido +desenvolvimento +desespero +desfile +despesas +dessa +dessas +desse +desses +desta +destaca +destacar +destacou +destaque +destaques +destas +deste +destes +destinado +destinados +destino +desvio +desviou +detalhe +detalhes +determina +determinado +determinação +determinou +deu +deve +deve-se +devem +devemos +dever +deveria +deveriam +deverá +deverão +devia +devido +dez +dezembro +dezenas +dia +dia-a-dia +diagnóstico +diante +diariamente +dias +dica +dicas +diferente +diferentes +diferença +diferenças +dificilmente +dificuldade +dificuldades +difíceis +difícil +diga +digital +dignidade +digo +diminuir +diminuição +dinheiro +diploma +direito +direta +diretamente +direto +diretor +diretora +diretores +diretoria +diretório +direção +diria +dirigente +dirigentes +dirigir +disciplina +disco +discurso +discussão +discussões +discutir +disponíveis +disponível +disposição +disposto +disputa +disputar +dispõe +disse +disseram +disso +distante +distribuição +distribuídos +distrito +dito +diversas +diversos +divisão +divulgada +divulgado +divulgados +divulgar +divulgação +divulgou +diz +dizem +dizendo +dizer +dizia +diálogo +diária +diárias +diário +do +doação +doações +doce +documentação +documento +documentos +doente +doença +doenças +dois +domingo +domínio +dona +dono +donos +dor +dores +dormir +dos +dose +doutor +duas +duelo +dupla +dura +durante +duração +duro +durou +dutra +dvd +dão +década 
+décadas +déficit +dívida +dívidas +dólar +dólares +dúvida +dúvidas +e +e-mail +economista +econômica +econômicas +econômico +econômicos +edital +edição +edições +educacional +efeito +efeitos +efetivamente +efetivo +eficiente +eficiência +eis +ela +elaboração +elas +ele +eleger +elegeu +eleita +eleito +eleitor +eleitorado +eleitorais +eleitores +eleitos +eleição +eleições +elementos +elenco +eles +eletrônica +eletrônico +elevado +elevação +elite +elogios +elétrica +em +embora +emenda +emendas +emergência +emissora +emissoras +emissão +emocional +emoção +emoções +empate +empatou +empenho +empreendimento +empreendimentos +empregados +emprego +empregos +empresa +empresarial +empresas +empresário +empresários +empréstimo +empréstimos +encaminhado +encaminhados +encarar +encerramento +encerrou +enchentes +encontra +encontrada +encontrado +encontrados +encontram +encontrar +encontraram +encontro +encontros +encontrou +endereço +energia +enfatizou +enfim +enfrenta +enfrentar +engenharia +engenheiro +enorme +enquanto +ensinar +ensino +entanto +entende +entender +entendeu +entendimento +entendo +entidade +entidades +entra +entrada +entram +entrar +entraram +entre +entrega +entregar +entregou +entregue +entregues +entretanto +entrevista +entrevistados +entrevistas +entrou +então +enviado +enviar +enviou +envolve +envolvendo +envolvido +envolvidos +envolvimento +episódio +equilíbrio +equipamento +equipamentos +equipe +equipes +equivalente +era +eram +errado +erro +erros +escala +escanteio +esclarecer +escola +escolar +escolas +escolha +escolher +escolheu +escolhido +esconder +escrever +escreveu +escrita +escrito +escritor +escritório +esforço +esforços +espanhol +espaço +espaços +especiais +especial +especialista +especialistas +especializada +especialmente +específica +específico +espera +esperado +esperamos +esperando +esperança +esperar +esperava +espero +espetáculo +espiritual +esportiva +esportivo +esposa +espécie +espécies +espírito +esquecer +esquema +esquerdo +esquina 
+essa +essas +esse +essencial +esses +esta +estabelece +estabelecer +estabelecimento +estabelecimentos +estabilidade +estacionamento +estado +estados +estaduais +estadual +estamos +estar +estaria +estariam +estará +estarão +estas +estatal +estatuto +estatística +estatísticas +estava +estavam +estação +este +esteja +estejam +estejamos +estes +esteve +estilo +estimativa +estimular +estive +estivemos +estiver +estivera +estiveram +estiverem +estivermos +estivesse +estivessem +estivéramos +estivéssemos +estou +estrada +estradas +estrangeiros +estranho +estratégia +estratégias +estreia +estrela +estrelas +estrutura +estudar +estudo +estudos +está +estádio +estágio +estávamos +estão +etapa +etapas +etc +etc. +eu +evento +eventos +eventual +evidente +evitar +evolução +ex-deputado +ex-governador +ex-prefeito +ex-presidente +exame +exames +exatamente +excelente +excelência +excesso +exceção +exclusivamente +executiva +executivo +execução +exemplo +exemplos +exercer +exercício +exercícios +exige +exigir +exigência +exigências +existe +existem +existentes +existia +existir +existência +expandir +expansão +expectativa +expectativas +experiência +experiências +explica +explicar +explicação +explicações +explicou +exploração +exportações +exposição +expressão +expulso +extensão +exterior +externa +extra +extremamente +exército +face +facilidade +facilitar +facilmente +faculdade +faixa +faixas +fala +falam +falando +falar +falei +falha +falhas +falou +falta +faltam +faltando +faltou +fama +familiar +familiares +famoso +famílias +faria +farmácia +farroupilha +fará +farão +fase +fato +fator +fatores +fatos +faturamento +favor +favorável +faz +fazem +fazemos +fazenda +fazendo +fazer +fazia +faça +façam +faço +febre +fechada +fechado +fechados +fechamento +fechar +fechou +federais +federal +federação +feira +feita +feitas +feito +feitos +felicidade +feliz +felizes +feminina +feminino +fenômeno +feriado +feridos +ferramenta +ferramentas +ferro +festa +festas +festival +fevereiro +fez 
+fica +ficam +ficamos +ficando +ficar +ficaram +ficaria +ficará +ficarão +ficava +ficha +fico +ficou +fiel +figura +fila +filha +filhas +filho +filhos +filme +filmes +filosofia +fim +finais +final +finalidade +finalizou +finalmente +financeira +financeiras +financeiro +financeiros +financiamento +finanças +fins +fique +fiquei +firme +fiscais +fiscal +fiscalizar +fiscalização +fiz +fizemos +fizeram +fiéis +flagrante +flor +flores +floresta +fluxo +foco +fogo +foi +folha +fomos +fonte +fontes +for +fora +foram +forem +forma +formada +formado +formar +formas +formato +formação +formos +forró +fortalecer +forte +força +forças +fosse +fossem +foto +fotos +fraco +francês +frase +fraude +freitas +frente +frio +frisou +fronteira +frota +frutas +fruto +frutos +fuga +fugir +fugiu +fui +funciona +funcionamento +funcionando +funcionar +funcionário +funcionários +fundamentais +fundamental +fundação +fundo +fundos +função +funções +furto +futebol +futsal +futuro +futuros +fábrica +fácil +fãs +fé +férias +física +físicas +físico +fórmula +fórum +fôramos +fôssemos +gabinete +gado +galeria +ganha +ganham +ganhando +ganhar +ganho +ganhos +ganhou +garante +garantia +garantir +garantiu +garota +garoto +gastar +gasto +gastos +gaúcha +gaúchos +general +gente +geografia +gera +gerais +geral +geralmente +gerando +gerar +geração +gerente +gerou +gestor +gestores +gestão +ginásio +global +gol +goleiro +golpe +gols +gosta +gostam +gostaria +gostei +gosto +gostou +governador +governadora +governadores +governantes +governar +governo +governos +gramado +grande +grandes +gratuita +gratuito +grau +grave +graves +graças +grossa +grosso +grupo +grupos +grãos +guarda +guia +gás +gênero +habitantes +habitação +haja +hajam +hajamos +harmonia +havemos +haver +haveria +haverá +havia +haviam +hectares +hei +helena +hipótese +história +histórias +histórica +histórico +hoje +homem +homenagem +homens +homicídio +homicídios +honra +hora +horas +horizonte +horário +horários +hospital +hotel +hotéis +houve 
+houvemos +houver +houvera +houveram +houverei +houverem +houveremos +houveria +houveriam +houvermos +houverá +houverão +houveríamos +houvesse +houvessem +houvéramos +houvéssemos +hugo +humana +humanidade +humano +humanos +humor +há +hábito +hão +ia +ibope +ida +idade +ideal +identidade +identificado +identificar +identificação +idosos +idéia +idéias +iguais +igual +igualdade +ilegal +ilha +iluminação +imagem +imagens +imaginar +imediata +imediatamente +imediato +impacto +impede +impedir +implantar +implantação +impor +importa +importante +importantes +impossível +imposto +impostos +imprensa +impressão +imóveis +imóvel +inauguração +incentivar +incentivo +inclui +incluindo +inclusive +inclusão +incrível +incêndio +indenização +independente +independentemente +independência +indica +indicado +indicar +indicação +individuais +individual +indivíduo +indivíduos +indo +industrial +indícios +indígena +indígenas +indústria +indústrias +infantil +infelizmente +inferior +influência +informa +informado +informar +informação +informações +informou +informática +infra-estrutura +infraestrutura +inglês +ingresso +ingressos +inicia +iniciada +inicial +inicialmente +iniciar +iniciativa +iniciativas +iniciou +inquérito +inscritos +inscrição +inscrições +instalada +instalar +instalação +instalações +institucional +instituição +instituições +instituto +instrumento +instrumentos +integra +integral +integrante +integrantes +integrar +integração +inteira +inteiro +inteligente +inteligência +intensa +intenso +intenção +intenções +inter +interessa +interessados +interessante +interesse +interesses +interior +interna +internacionais +internacional +internado +internet +interno +interpretação +intervalo +intervenção +intuito +invadiu +inverno +investidores +investigar +investigação +investigações +investimento +investimentos +investir +invés +início +inúmeras +inúmeros +ir +iria +irmã +irmão +irmãos +irregular +irregularidades +irá +irão +isso +isto +italiano +item +itens +jamais +janeiro 
+janela +jantar +jardim +jc +jeito +joga +jogada +jogadas +jogador +jogadores +jogando +jogar +jogo +jogos +jogou +jornada +jornais +jornal +jornalismo +jornalistas +judicial +judiciário +juiz +julgamento +julgar +julho +junho +juntamente +junto +juntos +juros +jurídica +jurídico +justa +justamente +justifica +justificar +justificativa +justiça +justo +juventude +juíza +juízes +juízo +já +km +laboratório +lado +lados +ladrões +lago +lamentável +lance +lança +lançado +lançamento +lançar +lançou +lar +larga +lateral +latina +lazer +leal +legais +legal +legenda +legislativa +legislativo +legislação +lei +leia +leilão +leis +leite +leitor +leitores +leitura +lembra +lembrando +lembrar +lembro +lembrou +ler +leste +lesão +letra +letras +leva +levada +levado +levados +levam +levando +levantamento +levantar +levar +levaram +leve +levou +lhe +lhes +li +liberado +liberação +liberdade +licença +licitação +lidar +liderança +lideranças +liga +ligada +ligadas +ligado +ligados +ligar +ligação +ligações +liminar +limite +limites +limpa +limpeza +linda +linguagem +linha +linhas +lista +literatura +litoral +litros +livre +livres +livro +livros +lixo +lição +locais +local +localidade +localizada +localizado +logo +loja +lojas +longa +longe +longo +lua +lucro +lucros +lugar +lugares +luta +lutar +luxo +luz +lá +lê +líder +líderes +língua +líquido +lógica +madeira +madrugada +maio +maior +maiores +maioria +mais +mal +manda +mandado +mandar +mandato +mandatos +mandou +maneira +manhã +manifestação +mano +manter +manteve +mantido +mantém +manutenção +mar +marca +marcada +marcado +marcador +marcar +marcas +marcação +marcou +margem +margens +marido +marinho +marketing +março +mas +masculino +massa +mata +matar +matemática +materiais +material +mato +matou +matriz +matéria +matérias +mau +mauro +maus +me +medalha +mediante +medida +medidas +medo +meia +meio +meio-campo +meios +melhor +melhora +melhorar +melhores +melhoria +melhorias +melhorou +membro +membros +memória +menina +meninas 
+menino +meninos +menor +menores +menos +mensagem +mensagens +mensais +mensal +mental +mente +mentira +mercado +mercadorias +mercados +merece +merecem +mesa +meses +mesma +mesmas +mesmo +mesmos +mestre +meta +metade +metas +metropolitana +metros +meu +meus +mil +milhares +milho +milhão +milhões +militar +militares +mim +mineiro +minha +minhas +ministra +ministros +minuto +minutos +mirim +missão +mistura +mobilização +moda +modalidade +modelo +modelos +moderna +moderno +modo +moeda +momento +momentos +montagem +montante +montar +monte +mora +moradia +morador +moradora +moradores +morais +moral +moram +morar +morava +moro +morre +morrer +morreram +morreu +morro +morte +mortes +morto +mortos +mostra +mostram +mostrando +mostrar +mostrou +motivo +motivos +moto +motor +motoristas +motos +movimentação +movimento +movimentos +moça +muda +mudança +mudanças +mudar +mudou +muita +muitas +muito +muitos +mulher +mulheres +multa +multas +mundial +mundo +municipais +municipal +município +municípios +muro +museu +musical +má +máquina +máquinas +máxima +máximo +mãe +mães +mão +mãos +média +médica +médio +mérito +mês +mídia +mínima +mínimo +mínimos +móveis +móvel +música +músicas +músicos +na +nacionais +nacional +nada +namorada +namorado +naquela +naquele +nas +nasceu +nascido +nascimento +naturais +natural +naturalmente +natureza +nação +nações +necessidade +necessidades +necessita +necessária +necessárias +necessário +necessários +nega +negar +negativa +negativo +negociar +negociação +negociações +negou +negra +negro +negros +negócio +negócios +nela +nele +nem +nenhum +nenhuma +nessa +nessas +nesse +nesses +nesta +neste +nestes +neto +news +ninguém +nisso +no +nobre +noite +noites +nome +nomes +norma +normal +normalmente +normas +norte +nos +nossa +nossas +nosso +nossos +nota +notas +notícia +notícias +nova +novamente +novas +nove +novembro +novidade +novidades +novo +novos +num +numa +nunca +não +né +níveis +nível +nós +núcleo +número +números +o +objetivo +objetivos +objeto 
+objetos +obra +obras +obrigado +obrigados +obrigação +observa +observar +observou +obter +obteve +ocasião +ocorre +ocorrem +ocorrer +ocorreram +ocorreu +ocorrido +ocorrência +ocorrências +ocupa +ocupar +ocupação +oeste +oferece +oferecem +oferecer +oferecido +oferecidos +oferta +oficiais +oficial +oficialmente +oficina +oficinas +ofício +oito +olha +olhando +olhar +olho +olhos +oliveira +olímpico +onda +onde +ong +online +ontem +operação +operações +opinião +opiniões +oportunidade +oportunidades +optar +opção +opções +ora +ordem +organismo +organizada +organizado +organizar +organização +organizações +orientação +origem +original +orçamento +os +ou +ouro +outra +outras +outro +outros +outubro +ouvi +ouvido +ouvidos +ouvir +ouviu +paciente +pacientes +paciência +pacote +padrão +padrões +paga +pagam +pagamento +pagamentos +pagando +pagar +pago +pagos +pagou +pai +paixão +palanque +palavra +palavras +palco +palestra +palestras +palácio +papai +papel +papéis +par +para +parabéns +parada +parado +paralisação +paranaense +parar +parceiro +parceiros +parcela +parceria +parcerias +parece +parecem +parecer +parecia +paredes +parentes +parlamentar +parlamentares +parlamento +parou +parque +parte +partes +participa +participam +participantes +participar +participaram +participação +participou +particular +particulares +partida +partidas +partido +partidos +partidária +partir +partiu +passa +passada +passado +passageiros +passagem +passagens +passam +passando +passar +passaram +passará +passava +passe +passei +passeio +passo +passo fundo +passos +passou +pasta +patamar +patrimônio +pau +paula +paulistas +pauta +pavimentação +paz +país +países +pede +pedido +pedidos +pedindo +pedir +pediu +pedra +pedras +pega +pegar +pegou +peito +peixe +peixes +pela +pelas +pele +pelo +pelos +pena +penal +pensa +pensam +pensamento +pensamentos +pensando +pensar +pensei +penso +pensou +pensão +pequena +pequenas +pequeno +pequenos +perante +percebe +perceber +percebeu +percentual +percurso 
+perda +perdas +perde +perdendo +perder +perderam +perdeu +perdido +perfeito +perfil +pergunta +perguntar +perguntas +perguntou +perigo +perigoso +permanece +permanecer +permaneceu +permanente +permanência +permite +permitido +permitir +permitiu +perna +pernas +personagem +personagens +personalidade +perspectiva +pertence +perto +período +períodos +pesado +pesca +peso +pesquisa +pesquisadores +pesquisas +pessoa +pessoais +pessoal +pessoalmente +pessoas +peça +peças +piloto +pilotos +pintura +pior +piores +piso +pista +placa +placas +planejamento +planeta +plano +planos +planta +plantas +plantio +plantão +plateia +pleito +plena +pleno +plenário +plástico +pneus +pobre +pobres +pobreza +pode +pode-se +podem +podemos +podendo +poder +poderes +poderia +poderiam +poderá +poderão +podia +poeta +pois +policiais +policial +politicamente +polêmica +políticas +político +políticos +ponta +ponte +ponto +pontos +popular +populares +população +por +porque +porta +portal +portanto +portas +porte +porto +português +porém +posicionamento +positiva +positivo +positivos +posição +posições +possa +possam +posse +possibilidade +possibilidades +posso +possuem +possui +possíveis +possível +posteriormente +posto +postos +postura +potencial +pouca +poucas +pouco +poucos +povo +povos +pps +pq +pra +praia +praias +prata +praticamente +praticar +prato +pratos +prazer +prazo +prazos +praça +praças +precisa +precisam +precisamos +precisar +precisava +preciso +precisou +preconceito +preencher +prefeita +prefeitos +prefeituras +prefere +preferiu +preferência +prejudicar +prejuízo +prejuízos +premiação +preocupa +preocupado +preocupar +preocupação +prepara +preparado +preparados +preparar +preparação +presa +presente +presentes +presença +preservar +preservação +presidencial +presidente +presidentes +presidência +preso +presos +pressão +prestar +prestação +presídio +preta +pretende +preto +prevenção +previdência +prevista +previstas +previsto +previstos +previsão +prevê +preço +preços +primavera 
+primeira +primeiras +primeiro +primeiros +principais +principal +principalmente +princípio +princípios +prioridade +prioridades +prisão +privada +privado +pro +problema +problemas +procedimento +procedimentos +processo +processos +procura +procurado +procurador +procuram +procurando +procurar +procure +procurou +produtividade +produto +produtor +produtores +produtos +produz +produzido +produzir +produção +professor +professora +profissionais +profissional +profissão +profunda +programa +programas +programação +progresso +proibido +projeto +projetos +prol +promessa +promessas +promete +prometeu +promotor +promove +promover +promovido +promoção +pronta +pronto +propaganda +proposta +propostas +propriedade +propriedades +proprietário +proprietários +propósito +propõe +proteger +protesto +proteção +prova +provar +provas +provavelmente +providências +provisória +provocar +provocou +provável +proximidades +prudente +prática +práticas +pré-candidato +prédio +prédios +prévia +prêmio +prêmios +própria +próprias +próprio +próprios +próxima +próximas +próximo +próximos +publicada +publicado +publicação +publicidade +pudesse +punição +pura +página +páginas +pátio +pão +pé +pés +pênalti +pólo +pública +públicas +público +públicos +quadra +quadrados +quadrilha +quadro +quadros +quais +qual +qualidade +qualificação +qualquer +quando +quantas +quantia +quantidade +quanto +quantos +quarta +quarta-feira +quarto +quase +quatro +que +quebra +quebrar +queda +queira +quem +quente +quer +querem +queremos +querendo +querer +queria +queriam +querido +quero +questionado +questão +questões +quilos +quilômetros +quinta +quinta-feira +quinto +quis +quiser +rainha +ramo +ranking +rapaz +rapidamente +razão +razões +reais +reajuste +real +realidade +realiza +realizada +realizadas +realizado +realizados +realizando +realizar +realização +realizou +realmente +reação +rebaixamento +recado +recebe +recebem +recebendo +receber +receberam +receberá +recebeu +recebi +recebido +receita +receitas 
+recente +recentemente +recentes +reclama +reclamar +reclamação +reclamações +reclamou +reconhece +reconhecer +reconhecido +reconhecimento +recorde +recorrer +recuperar +recuperação +recurso +recursos +redação +rede +redes +redonda +redor +reduzir +redução +reeleito +reeleição +refere +referente +referência +reflete +refletir +reflexão +reforma +reformas +reforçar +reforço +regime +regionais +regional +registrada +registrado +registrados +registrar +registro +registros +registrou +região +regiões +regra +regras +regular +rei +reino +reivindicações +relacionados +relacionamento +relacionamentos +relata +relator +relatou +relatório +relação +relações +religioso +remuneração +remédio +remédios +renda +rendimento +renovação +repasse +repente +repercussão +repetir +reportagem +representa +representam +representante +representantes +representar +representação +repórter +república +reserva +reservas +resgate +residência +residências +resistência +resolução +resolve +resolver +resolveu +resolvido +respectivamente +respeitar +respeito +responde +responder +respondeu +responsabilidade +responsáveis +responsável +resposta +respostas +ressalta +ressaltar +ressaltou +resta +restante +restaurante +restaurantes +resto +resultado +resultados +retirada +retirar +retornar +retorno +reunir +reuniu +reunião +reuniões +revela +revelou +rever +reverter +revista +revistas +revisão +revolução +reúne +rica +rico +ricos +rio +rio de janeiro +rios +riqueza +risco +riscos +ritmo +rival +rock +rodada +rodadas +rodovia +rodovias +rodoviária +romance +rosto +roteiro +rotina +roubo +roupa +roupas +rua +ruas +rubro-negro +ruim +rumo +rurais +rural +rádio +rápida +rápido +sabe +sabedoria +sabem +sabemos +sabendo +saber +sabia +saco +saem +sai +saia +saiba +saindo +sair +saiu +sala +salarial +salas +saldo +salto +salvar +salário +salários +salão +saneamento +sangue +santista +satisfação +satisfeito +saudade +saudável +saída +saíram +se +secretaria +secretarias +secretário +secretários +sede 
+segmento +segmentos +segredo +segue +seguem +seguida +seguido +seguindo +seguinte +seguintes +seguir +seguiu +segunda +segunda-feira +segundo +segundos +segura +segurança +segurar +seguro +sei +seis +seja +sejam +sejamos +seleção +sem +semana +semanas +semelhante +semelhantes +semestre +seminário +sempre +senado +senador +senadora +senadores +sendo +senhor +senhora +senhores +sensação +senso +sente +sentença +sentido +sentimento +sentimentos +sentindo +sentir +sentiu +senão +sequer +sequência +ser +serei +serem +seremos +seres +seria +seriam +serve +servidor +servidores +servir +serviu +serviço +serviços +será +serão +seríamos +sessão +sessões +sete +setembro +setor +setores +seu +seus +sexo +sexta +sexta-feira +sexual +shopping +show +shows +si +sido +sigilo +sigla +significa +significado +silêncio +sim +simples +simplesmente +sinais +sinal +sinto +sintomas +sistema +sistemas +site +sites +situação +situações +sob +sobe +sobra +sobre +sobretudo +sobrinho +sociais +social +socorro +sofre +sofrem +sofrendo +sofrer +sofreu +sofrimento +sol +soldados +solenidade +solicitação +solicitou +solidariedade +solo +solução +soluções +som +soma +sombra +somente +somos +sonho +sonhos +sono +sorriso +sorte +sorteio +sou +soube +sousa +sozinha +sozinho +sua +suas +subir +subiu +substituir +substituição +sucesso +sucessão +sudeste +suficiente +suficientes +sugere +sugestão +sugestões +sujeito +sul +super +superar +superintendente +superior +superiores +supermercado +superou +suplente +suporte +suposto +supremo +surge +surgiu +surpresa +suspeita +suspeito +suspeitos +suspensão +sábado +sábados +são +século +série +sério +sítio +só +sócios +tabela +tais +tal +talento +talvez +tamanho +também +tanta +tantas +tanto +tantos +taques +tarde +tarefa +tarifa +taxa +taxas +taça +te +teatro +tecnologia +tecnologias +tela +telefone +telefones +televisão +tem +tema +temas +temos +temperatura +tempo +temporada +tempos +tende +tendo +tendência +tenha +tenham +tenhamos +tenho +tenta +tentam 
+tentando +tentar +tentaram +tentativa +tentou +teoria +ter +terceira +terceiro +terei +terem +teremos +teria +teriam +termina +terminal +terminar +terminou +termo +termos +terra +terras +terreno +terrenos +território +terá +terão +terça +terça-feira +teríamos +tese +tesouro +teste +testemunhas +testes +teto +teu +teus +teve +texto +textos +the +ti +tido +time +times +tinha +tinham +tio +tipo +tipos +tira +tirar +tiro +tiros +tirou +titular +titulares +tive +tivemos +tiver +tivera +tiveram +tiverem +tivermos +tivesse +tivessem +tivéramos +tivéssemos +tocar +tocou +toda +todas +todo +todos +tom +toma +tomada +tomadas +tomando +tomar +tomou +toneladas +toque +torcedor +torcedores +torcida +torna +tornando +tornar +torneio +torno +tornou +tornou-se +torres +total +totalmente +trabalha +trabalhador +trabalhadores +trabalham +trabalhando +trabalhar +trabalhava +trabalho +trabalhos +trabalhou +tradicionais +tradicional +tradição +traficantes +tragédia +trajetória +tranquilidade +transferência +transformar +transformação +transformou +transição +transmissão +transparência +transporte +transportes +trata +trata-se +tratado +tratamento +tratar +trave +travessão +traz +trazendo +trazer +trecho +trechos +treinador +treinamento +treino +trem +tribuna +tribunal +tributária +trimestre +trinta +trio +triste +tristeza +troca +trocar +troféu +trouxe +tráfego +trás +três +tu +tua +tuas +tudo +turismo +turistas +turma +tv +twitter +tá +tão +técnica +técnicas +técnico +técnicos +tém +término +tênis +tínhamos +título +títulos +um +uma +umas +unidade +unidades +unidos +unir +universidade +universidades +universitário +universo +união +uns +urbana +urbano +urgência +urnas +usada +usado +usados +usam +usando +usar +usina +usinas +uso +usou +usuário +usuários +utilizada +utilizado +utilizados +utilizar +utilização +vacinação +vaga +vagas +vai +vale +valer +valor +valores +valorizar +valorização +vamos +vantagem +vantagens +vara +variação +vc +vcs +veio +vejo +velha +velho +velhos 
+velocidade +vem +vemos +vence +vencedor +vencer +venceu +venda +vendas +vender +vendidos +vendo +venha +vento +ver +vera +verba +verbas +verdade +verdadeira +verdadeiro +verdadeiros +verde +vereador +vereadora +vereadores +vergonha +verificar +vermelha +vermelho +versão +verão +vez +vezes +veículo +veículos +vi +via +viagem +viagens +viajar +vias +vice +vice-governador +vice-prefeito +vice-presidente +vida +vidas +vieram +vigor +vila +vinda +vindo +vinha +vinho +vinte +violência +vir +vira +virada +viram +virar +virou +virtude +visa +visando +visita +visitantes +visitar +visitas +vista +visto +visual +visão +vitória +vitórias +viu +viva +vive +vivem +vivemos +vivendo +viver +viveu +vivo +vizinho +vizinhos +você +vocês +volante +volta +voltada +voltado +voltam +voltando +voltar +voltaram +voltou +volume +voluntários +vontade +voos +vos +votado +votar +votação +voto +votos +votou +vou +voz +vá +várias +vários +várzea +vão +véspera +vê +vídeo +vídeos +vítima +vítimas +vôo +zagueiro +zero +zona +à +às +água +águas +árbitro +área +áreas +árvore +árvores +época +éramos +êxito +índia +índice +índices +óleo +órgão +órgãos +ótima +ótimo +ônibus +última +últimas +último +últimos +única +único +útil diff --git a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt old mode 100755 new mode 100644 index 2afa1eb3de..a426c284e6 --- a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt @@ -1,440 +1,591 @@ +# A Romanian stop word list. +# Sources: # -# This is a stop word list for the Romanian language. -# -# Source: http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) -# - - # A Romanian stop word list. Comments begin with vertical bar. Each stop - # word is at the start of a line. - - # Many of the forms below are quite rare but included for completeness. 
+# http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) +# https://github.com/stopwords-iso/stopwords-ro/blob/master/stopwords-ro.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) - # ARTICLE - # Indefinite article -o # a -unui -unei -unor -nişte # some - # Demonstrative/adjectival article -cel -cea -cei -cele -celui -celei -celor - # Possessive / genitival article -al # of a -ai -ale - # PREPOSITION AND ADVERB -pe # on -la # at -în # in -fără # without -sub # under -despre # about -către # to -cu # with -de # from -din # on -lângă # by -pentru # for -peste # over -spre # to -prin # through -dintre # between -printre # among -până # until -după # after -înspre # towards -ca # as - # ADJECTIVE -mai # more -decât # than -cum # how -foarte # very -mult # much -multă -mulţi -multe -puţin # little -puţină -puţini -puţine -destul # enough -destulă -destui -destule - # PRONOUN - # Personal pronoun -eu # I -tu # you -el # he -ea # she -noi # we -voi # you -ei # they -ele # they -mie # me -îmi -mi -mine -mă -m -ţie # you -îţi -ţi -tine -te -lui # him -îl -l -îi -i -nouă # us -ne -ni -vouă # you -vă -vi -v -lor # them -le -li - # Pronoun of politeness -dumneavoastră # you - # Reflexive pronoun -se # himself -îşi -sie -sieşi -sine - # Pronoun of reinforcement -însumi # myself -însămi -însuţi # youself -însăţi -însuşi # himself -însăşi # herself -înşine # ourselves -însene -înşivă # youselves -însevă -înşişi # themselves -înseşi -însele - # Possessive pronoun -meu # mine -mea -mei -mele -tău # yours -ta -tăi -tale -său # his -sa -săi -sale -nostru # ours -noastră -noştri -noastre -vostru # yours -voastră -voştri -voastre - # Demonstrative pronoun -acesta # this -ăsta -aceştia -ăştia -acestuia -ăstuia -acestora -ăstora +abia +acea aceasta -asta -acestea -astea -acesteia -ăsteia -acest -aceşti -acestui -acestor această -aceste -acestei -acela # that -ăla -acelui -ăluia -aceia -ăia -acelora -ălora aceea -aia -acelea 
-alea -aceleia -ăleia -acel +aceeasi +aceeaşi acei -acelor -acea -acele -acelei -acelaşi # the same +aceia aceiaşi -aceeaşi +acel +acela +acelasi +acelaşi +acele +acelea aceleaşi -aceluiaşi -aceloraşi +acelei +aceleia aceleiaşi -celălalt # the other -celuilalt -ceilalţi -celorlalţi -cealaltă -celeilalte -celelalte -celorlalte - # Interrogative pronoun -ce # what -cine # who -cui # whom -care # which, what -cărui -cărei -căror -unde # where -când # when - # Indefinite pronoun -cineva # someone -cuiva -altcineva # someone else -altcuiva -oricine # anyone -oricui -orice # anything -unul # one -una -unii -unele -unuia -uneia -unora -altul # other -alta -alţii -altele +acelora +aceloraşi +acelui +aceluiaşi +acest +acesta +aceste +acestea +acestei +acesteia +acestia +acestor +acestora +acestui +acestuia +aceşti +aceştia +acolo +acord +acum +adica +ai +aia +aibă +aici +aiurea +al +ala +alaturi +ale +alea alt -altă -alţi +alta +altceva +altcineva +altcuiva alte -altuia +altei alteia +altele +altfel +alti +altii +altor altora altui -altei -altor -vreunul # somebody, some (of them) -vreuna -vreunii -vreunele -vreun -vreo -vreunuia -vreuneia -vreunora -vreunui -vreunei -vreunor -oricare # anyone -oricăruia -oricăreia -oricărora -oricărui -oricărei -oricăror -fiecare # everyone -fiecăruia -fiecăreia -fiecărui -fiecărei -cât # how, how many -câtă -câţi -câte -câtora -câtor -atât # this much +altuia +altul +altă +alţi +alţii +am +ambele +ambelor +ambii +ambilor +amândoi +amândouă +amânduror +amândurora +anume +apoi +aproape +ar +are +as +asa +asemenea +asta +astazi +astea +astfel +astăzi +asupra +atare +atat +atata +atatea +atatia +ati +atit +atita +atitea +atitia +atunci +atât atâta -atâţi -atâţia atâtea -atâtora atâtor -oricât # however much -oricâtă -oricâţi -oricâte -oricâtora -oricâtor -câtva # some -câţiva +atâtora +atâţi +atâţia +au +avea +aveai +aveam +aveau +aveaţi +avem +aveţi +avut +azi +aş +aşa +aşadar +aţi +b +ba +bine +bucur +bună +c +ca +cam +cand +capat +care 
+careia +carora +caruia +cat +catre +caut +ce +cea +cealaltă +ceea +cei +ceilalti +ceilalţi +cel +cele +celei +celeilalte +celelalte +celor +celorlalte +celorlalţi +celui +celuilalt +celălalt +ceva +chiar +ci +cinci +cind +cine +cineva +cit +cita +cite +citeva +citi +citiva +conform +contra +cu +cui +cuiva +cum +cumva +curând +curînd +cutare +când +cât +câte câteva +câtor +câtora câtorva -tot # all -toată -toţi -toate -tuturor -totul -cutare # that -oarecare # some -ceva # something -altceva # something else - # Negative pronoun -nimeni # nobody -nimănui -nimic # nothing - # NUMERAL - # Cardinal numeral -unu # one -doi # two +câtva +câtă +câţi +câţiva +cînd +cît +cîte +cîtva +cîţi +că +căci +cărei +căror +cărui +către +d +da +daca +dacă +dar +dat +datorită +dată +dau +de +deasupra +deci +decit +decât +degraba +deja +deoarece +departe +desi +despre +destui +destul +destule +destulă +deşi +din +dinaintea +dintr +dintr- +dintre +doar +doi +doilea +doime doua -trei # three -patru # four -cinci # five -şase # six -şapte # seven -opt # eight -noua # nine -zece # ten - # Fractional numeral -doime # half -treime # third -sutime # hundredth - # Collective numeral -amândoi # both -amândouă -amândurora -amânduror -ambii -ambele -ambilor -ambelor - # Multiplicative numeral -îndoit # double -întreit # threefold -însutit # hundred-fold - # Ordinal numeral -întâiul # the first -întâia -primul # former -prima -primii -primele -primului -primei -primilor -primelor - # VERB - # To be -sunt # (I) am -s -eşti # (you) are -este # (he/she) is +două +drept +dumneavoastră +dupa +după +dă e -suntem # (we) are -sunteţi # (you) are -eram # (I) were -erai # (you) were -era # (he) was -eraţi # (you) were -erau # (they) were -fiu # be -fii +ea +ei +el +ele +era +erai +eram +erau +este +eu +exact +există +eşti +f +face +fara +fata +faţă +fel +fi fie +fiecare +fiecărei +fiecăreia +fiecărui +fiecăruia +fii +fiind fim +fiu fiţi -fi -fiind # being -fost # been - # Auxiliary verb -am # to have - all 
forms -aţi -au -are -avem -aveţi -aveam -aveai -avea -aveaţi -aveau -aş -ar -oi # to will +foarte +făcut +g +h +i +ia +iar +ieri +ii +il +imi +in +inainte +inapoi +inca +incit +insa +intr +intre +isi +iti +j +k +l +la +le +li +lor +lui +lângă +lîngă +m +ma +mai +mare +mea +mei +mele +mereu +meu +mi +mie +mine +mod +mult +multa +multe +multi +multă +mulţi +mulţumesc +mâine +mîine +mă +n +ne +nevoie +ni +nici +niciodata +nicăieri +nimeni +nimeri +nimic +nimănui +niste +nişte +noastre +noastră +noi +noroc +nostri +nostru +nou +noua +nouă +noştri +nu +numai +o +oarecare +oi om -oţi +opt or -vei +ori +oricare +orice +oricine +oricui +oricum +oricând +oricât +oricâte +oricâtor +oricâtora +oricâtă +oricâţi +oricînd +oricît +oricărei +oricăreia +oricăror +oricărora +oricărui +oricăruia +oriunde +oţi +p +pai +parte +patra +patru +patrulea +pe +pentru +peste +pic +pina +plus +poate +pot +prea +prima +primei +primele +primelor +primii +primilor +primul +primului +prin +printr- +printre +putea +putini +puţin +puţina +puţine +puţini +puţină +până +pînă +r +rog +s +sa +sa-mi +sa-ti +sai +sale +sau +se +si +sie +sieşi +sine +sint +sintem +spate +spre +spune +spus +sub +sunt +suntem +sunteţi +sus +sutime +sută +sînt +sîntem +sînteţi +să +săi +său +t +ta +tale +te +ti +timp +tine +toata +toate +toată +tocmai +tot +toti +totul +totusi +totuşi +toţi +trebuie +trei +treia +treilea +treime +tu +tuturor +tăi +tău +u +ul +un +una +unde +undeva +unei +uneia +unele +uneori +unii +unor +unora +unu +unui +unuia +unul +v va -vom +vei veţi +vi +voastre +voastră +voi +vom vor - # CONJUNCTION -şi # and -nici # neither -dar # but +vostru +vouă +voştri +vreme +vreo +vreun +vreuna +vreunei +vreuneia +vreunele +vreunii +vreunor +vreunora +vreunui +vreunuia +vreunul +vă +x +z +zece +zero +zi +zice +îi +îl +îmi +împotriva +în +înainte +înaintea +încotro +încât +încît +încă +îndoit +însele +însene +însevă +înseşi +înspre +însumi +însutit +însuşi +însuţi însă -iar # and, but, while, again -ci # but, so 
that -sau # or -ori -deci # so -aşadar -încât # so that -aşa # such -deşi # although -totuşi # though -dacă # if -atunci # then -că # that - # OTHER -nu # no - - # The following is a ranked list (commonest to rarest) of stopwords - # deriving from a large sample of text. - -poate # maybe -ieri # yesterday -mare # big -doar # just -trebuie # must -spus # said -acum # now -putea # can -chiar # even -face # do -astfel # such -pot # can -făcut # done -avut # had -parte # part -spune # says -bine # good -faţă # front -există # exists -încă # still -numai # only -dat # given -asupra # on -aproape # near +însămi +însăşi +însăţi +între +întreit +întrucât +întrucît +întâia +întâiul +înşine +înşivă +înşişi +îşi +îţi +ăia +ăla +ălea +ăleia +ălora +ăluia +ăsta +ăstea +ăstuia +ăştia +şapte +şase +şi +ştiu +ţi +ţie \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ro/ro_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ro/ro_stop_words_old.txt new file mode 100755 index 0000000000..2afa1eb3de --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ro/ro_stop_words_old.txt @@ -0,0 +1,440 @@ +# +# This is a stop word list for the Romanian language. +# +# Source: http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) +# + + # A Romanian stop word list. Comments begin with vertical bar. Each stop + # word is at the start of a line. + + # Many of the forms below are quite rare but included for completeness. 
+ + # ARTICLE + # Indefinite article +o # a +unui +unei +unor +nişte # some + # Demonstrative/adjectival article +cel +cea +cei +cele +celui +celei +celor + # Possessive / genitival article +al # of +a +ai +ale + # PREPOSITION AND ADVERB +pe # on +la # at +în # in +fără # without +sub # under +despre # about +către # to +cu # with +de # from +din # on +lângă # by +pentru # for +peste # over +spre # to +prin # through +dintre # between +printre # among +până # until +după # after +înspre # towards +ca # as + # ADJECTIVE +mai # more +decât # than +cum # how +foarte # very +mult # much +multă +mulţi +multe +puţin # little +puţină +puţini +puţine +destul # enough +destulă +destui +destule + # PRONOUN + # Personal pronoun +eu # I +tu # you +el # he +ea # she +noi # we +voi # you +ei # they +ele # they +mie # me +îmi +mi +mine +mă +m +ţie # you +îţi +ţi +tine +te +lui # him +îl +l +îi +i +nouă # us +ne +ni +vouă # you +vă +vi +v +lor # them +le +li + # Pronoun of politeness +dumneavoastră # you + # Reflexive pronoun +se # himself +îşi +sie +sieşi +sine + # Pronoun of reinforcement +însumi # myself +însămi +însuţi # youself +însăţi +însuşi # himself +însăşi # herself +înşine # ourselves +însene +înşivă # youselves +însevă +înşişi # themselves +înseşi +însele + # Possessive pronoun +meu # mine +mea +mei +mele +tău # yours +ta +tăi +tale +său # his +sa +săi +sale +nostru # ours +noastră +noştri +noastre +vostru # yours +voastră +voştri +voastre + # Demonstrative pronoun +acesta # this +ăsta +aceştia +ăştia +acestuia +ăstuia +acestora +ăstora +aceasta +asta +acestea +astea +acesteia +ăsteia +acest +aceşti +acestui +acestor +această +aceste +acestei +acela # that +ăla +acelui +ăluia +aceia +ăia +acelora +ălora +aceea +aia +acelea +alea +aceleia +ăleia +acel +acei +acelor +acea +acele +acelei +acelaşi # the same +aceiaşi +aceeaşi +aceleaşi +aceluiaşi +aceloraşi +aceleiaşi +celălalt # the other +celuilalt +ceilalţi +celorlalţi +cealaltă +celeilalte +celelalte +celorlalte + # 
Interrogative pronoun +ce # what +cine # who +cui # whom +care # which, what +cărui +cărei +căror +unde # where +când # when + # Indefinite pronoun +cineva # someone +cuiva +altcineva # someone else +altcuiva +oricine # anyone +oricui +orice # anything +unul # one +una +unii +unele +unuia +uneia +unora +altul # other +alta +alţii +altele +alt +altă +alţi +alte +altuia +alteia +altora +altui +altei +altor +vreunul # somebody, some (of them) +vreuna +vreunii +vreunele +vreun +vreo +vreunuia +vreuneia +vreunora +vreunui +vreunei +vreunor +oricare # anyone +oricăruia +oricăreia +oricărora +oricărui +oricărei +oricăror +fiecare # everyone +fiecăruia +fiecăreia +fiecărui +fiecărei +cât # how, how many +câtă +câţi +câte +câtora +câtor +atât # this much +atâta +atâţi +atâţia +atâtea +atâtora +atâtor +oricât # however much +oricâtă +oricâţi +oricâte +oricâtora +oricâtor +câtva # some +câţiva +câteva +câtorva +tot # all +toată +toţi +toate +tuturor +totul +cutare # that +oarecare # some +ceva # something +altceva # something else + # Negative pronoun +nimeni # nobody +nimănui +nimic # nothing + # NUMERAL + # Cardinal numeral +unu # one +doi # two +doua +trei # three +patru # four +cinci # five +şase # six +şapte # seven +opt # eight +noua # nine +zece # ten + # Fractional numeral +doime # half +treime # third +sutime # hundredth + # Collective numeral +amândoi # both +amândouă +amândurora +amânduror +ambii +ambele +ambilor +ambelor + # Multiplicative numeral +îndoit # double +întreit # threefold +însutit # hundred-fold + # Ordinal numeral +întâiul # the first +întâia +primul # former +prima +primii +primele +primului +primei +primilor +primelor + # VERB + # To be +sunt # (I) am +s +eşti # (you) are +este # (he/she) is +e +suntem # (we) are +sunteţi # (you) are +eram # (I) were +erai # (you) were +era # (he) was +eraţi # (you) were +erau # (they) were +fiu # be +fii +fie +fim +fiţi +fi +fiind # being +fost # been + # Auxiliary verb +am # to have - all forms +aţi +au +are 
+avem +aveţi +aveam +aveai +avea +aveaţi +aveau +aş +ar +oi # to will +om +oţi +or +vei +va +vom +veţi +vor + # CONJUNCTION +şi # and +nici # neither +dar # but +însă +iar # and, but, while, again +ci # but, so that +sau # or +ori +deci # so +aşadar +încât # so that +aşa # such +deşi # although +totuşi # though +dacă # if +atunci # then +că # that + # OTHER +nu # no + + # The following is a ranked list (commonest to rarest) of stopwords + # deriving from a large sample of text. + +poate # maybe +ieri # yesterday +mare # big +doar # just +trebuie # must +spus # said +acum # now +putea # can +chiar # even +face # do +astfel # such +pot # can +făcut # done +avut # had +parte # part +spune # says +bine # good +faţă # front +există # exists +încă # still +numai # only +dat # given +asupra # on +aproape # near diff --git a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt index e4a59dda4c..f4721e80e3 100644 --- a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt @@ -1,10 +1,13 @@ +# This is a stop word list for the Russian language. # -# This is a "short" stop word list for the Russian language. 
-# +# https://github.com/stopwords-iso/stopwords-ru/blob/master/stopwords-ru.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) +# Source: adriver amp bin +c cgi href html @@ -16,517 +19,456 @@ rnd sid style www -а А +а августа -акций -Александр -Александра -Алексей -Анатолий -Андрей +алло АО апрель апреля -Ассошиэйтед Б -без Без +без +близко блог -более Более +более больше -большинство -большой -Борис +будем будет +будете +будешь будто +буду будут +будь бы -бывшего +бывает бывший +бывь был была были было быстро быть -в В +в +важная +важное +важные +важный вам +вами вас ваш +ваша +ваше +ваши +вверх +вдали вдруг -ведь Ведь -века -вести +ведь +везде весь весьма взгляд взять +вид виде +видел +видеть видимо -Виктор -вице включая -Владимира власть вместе вместо -внимание +вниз +внизу вновь -во Во +во вовсе -воды -возможно -возможности -возможность -войск вокруг +вон вообще вопрос -вопросы -воскресение -вот +восемь +восьмой Вот +вот впервые вполне Впрочем +впрочем времена времени время вроде вряд -все Все +все +все еще всегда -всего всей всем +всеми всему всех +всею встречи всю +всюду вся всё вторая второй -вы Вы -выборах +вы выше -выяснилось -г Г +г где -глава -главе -главного главное главный главным -главы +глаз го говорил говорит -говорится -говорить говоря говорят год года году годы -город +голова города городе -градусов -Грозном группа группы -д Д +д Да да -давно +давать дает -даже Даже +даже дал +далекий далеко дальше -данным +даром дать два две -движения двух -действий -действительно -действия +девятый +девять декабря -дел -дела делаем -делам +делал делать -деле -дело -Дело -делу +делаю день -деньги +десятый десять -деятельности -деятельность -директора -для Для +для дней дни дня днях -до До -довольно -документы -долго +до должен должна должно должны -дом -дома -доме -достаточно -друг -друга +должный +дорога +другая другие другим других +друго другое другой другом -е +думать Е -его +е Его +его едва ее ей ему -если Если 
-естественно -есть +если Есть -еще +есть Еще +еще +ещё +ею +её +ж +ждать же -женщин -женщины -жизни -жизнь -жителей жить -за За -завода -закон -зам -заместитель +за +занят затем +зато +зачем заявил -заявление -здесь Здесь -земли +здесь знает +знать значит знаю знают -зрения -и И -игры +и идет -из +иди +идти Из -Известий -Известия -Известиям -известно +из или -Иллюстрация им имеет +имел имени -именно Именно +именно иметь имеют +ими имя иначе -интервью -интересы -информацию -история +иногда ИТАР итоге -их Их +их июля июня й -к К +к каждая каждого +каждое +каждые каждый кажется -как Как +как +какая какие каким каких какой -касается -качестве -квартиры -километров -когда +кем Когда +когда кого -количество команда команды -комиссии -комитета -комментариев -компания +комната кому -конечно +конец Конечно -конференции +конечно конца конце -коп -корреспонденту которая -которого которое которой котором -которому которую которые который которым -которыми которых -края Кроме кроме -крупных +кругом кстате Кстати кстати -кто Кто +кто куда Л -легко лет -летний ли либо -лидер -лиц -лица -лично +лицо лишь -лучше любая любой людей люди -людям М м мало -марта -массовой мая -между Между +между мене менее меньше меня -мере -меры места -месте -местных место месяц месяца месяцев метра метров -миллиарда миллион -миллиона +мимо минут +минута +мира мире -мировой -Михаил -мне Мне -многие +мне Многие +многие многих много +мной +мною мог могла могли +могу могут -может +мож Может -можно +может +может быть Можно +можно +можхо +мои мой момент -мы +мочь +моя +моё Мы +мы Н -на -На НА +На +на +наверху над -надо назад -наиболее найти наконец нам -например -народа +нами нас -находившегося -находится начала начале -начальник -начальника +начать наш наша +наше нашего нашей наши наших -не -Не НЕ -невозможно +Не +не него недавно недели неделю нее ней -некоторого -некоторые -некоторых -нельзя нем немало нему -необходимо -нескольких -несколько -несмотря -нет Нет -ни +нет +нею +неё Ни +ни нибудь +ниже никак 
никаких +никакой никогда -Николай никто +никуда ним ними них ничего -но +ничто Но -нового -новой -новостей -новые -новый -новых +но ноября Ну +ну нужно +нужный +нх ныне -нынешнего Нью -о О -об +о Об +об +оба области образом обычно -один Один +один одна +однажды Однако однако одним одно -одновременно одного одной одном одну -оказалась -оказались -оказалось -оказался около октября -он Он -она +он Она -они +она Они +они оно -операции -опыт опять -органы -основном -особенно -остается -от От +от ответ -отдела -отличие -отношении -отношения +отец +откуда +отсюда очень -очередной очередь П -партия первая первого первой @@ -537,149 +479,73 @@ www первых перед период -письмо -площади -по По +по поводу под -подобная -позиции -пока +подойди +позже +пойти Пока -политики -полностью -положение -полтора -получил -получили -получить -помощи -помощь -помощью -понять +пока +пол пор -порядке -посколько -поскольку -после +пора После -последнее -последние -последний -последних -пост -постоянно -потом +после Потом +потом потому похоже -почему Почему -почта +почему почти Поэтому -поэтому -права Правда -правда -правило -право -практически -предприятий -предприятия -председателя -представителей -представители прежде -прежнему -премьер -премьера -Пресс -пресс -при При -придется -примерно -примеру -принять -приходится +при Причем -пришлось про -проблем -проблема -проблемы -провести -продукции -проект -производства -производство -произошло -происходит -прокуратуры просто -против -процента -процентов -процесс -прошла -прошлого -прошлом прямо пути путь пятая пяти +пятый пять -работа -работавшую -работает -работать -работе -работу -равно раз раза -развития -разных -района -районе +разве ранее раньше -резко -результате Рейтер -речь -решения решил решили -рода -роль -руб -рук -руках -руки -руководителей -руководитель -руководство +решить ряд рядом -с С +с +с кем сам сама сами +самим +самими +самих +само самого самое самой самом +самому +саму самые самый самым самых -сборной -свет свое своего своей 
@@ -691,219 +557,153 @@ www своих свой свою -связи -сделаем сделал -сделать себе себя -сегодня Сегодня -сейчас Сейчас -семьи -сентября -Сергей -Сергея -силу -силы -система -системы -ситуации -ситуацию -ситуация +сейчас +семь +сидеть сих скажем сказал -сказать -сколько скорее -следует -слишком -слова -словам -случае -случай -смерти снова со собой -собственности -событий -события -совершенно +собою совсем -создать сообща -сообщил -сообщили -состоянии -сотрудники -сотрудников -специалистов -специалисты сразу -среди Среди -средств -средства +среди срок -ссылка стал стала стали стало станет -становится стате стать -степени сто стоит -столице столь -столько -сторону -стороны -суббота -суда сумму -сути -существует -счет -считает -считать -считают т -так +та Так +так такая также таки такие -таким Таким +таким таких такого такое такой -там Там +там ТАСС +твои +твой +твоя +твоё те -театра тебе -тем +тебя Тем -теперь +тем +теми Теперь -территории +теперь тех -течение -то То -тогда +то +тобой +тобою Тогда +тогда того тоже той -только Только +только том тому тонн тот -точки -точнее +тою +третий трех три -труда трудно +ту туда тут ты тысяч тысячи -у У -удалось +у уж -уже Уже -уровень +уже +уметь уровне -условия -условиях -утверждает -утверждают -участие -участников -факт -февраля -фирм -фирма -фирмы -фонда -Фото х -ходе хорошо +хотел бы +хотеть хоть -хотя Хотя -хочет -целом -центр -центра -центре -цены +хотя +хочешь час часа часов части -частности часто часть чаще чего -человек -человека чем -через +чему Через +через четыре -четырех числе число членов -что Что -чтобы +что +чтоб Чтобы +чтобы чуть +шестой шесть -эта Эта -эти +эта Эти +эти этим +этими этих -это Это +это этого этой этом этому -этот Этот +этот эту Ю -Юрий -я Я -являетесь -является +я явно якобы -января -ясно +января \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ru/ru_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ru/ru_stop_words_old.txt new file mode 100644 
index 0000000000..e4a59dda4c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ru/ru_stop_words_old.txt @@ -0,0 +1,909 @@ +# +# This is a "short" stop word list for the Russian language. +# + +adriver +amp +bin +cgi +href +html +http +link +livejournal +quot +rnd +sid +style +www +а +А +августа +акций +Александр +Александра +Алексей +Анатолий +Андрей +АО +апрель +апреля +Ассошиэйтед +Б +без +Без +блог +более +Более +больше +большинство +большой +Борис +будет +будто +будут +бы +бывшего +бывший +был +была +были +было +быстро +быть +в +В +вам +вас +ваш +вдруг +ведь +Ведь +века +вести +весь +весьма +взгляд +взять +виде +видимо +Виктор +вице +включая +Владимира +власть +вместе +вместо +внимание +вновь +во +Во +вовсе +воды +возможно +возможности +возможность +войск +вокруг +вообще +вопрос +вопросы +воскресение +вот +Вот +впервые +вполне +Впрочем +времена +времени +время +вроде +вряд +все +Все +всегда +всего +всей +всем +всему +всех +встречи +всю +вся +всё +вторая +второй +вы +Вы +выборах +выше +выяснилось +г +Г +где +глава +главе +главного +главное +главный +главным +главы +го +говорил +говорит +говорится +говорить +говоря +говорят +год +года +году +годы +город +города +городе +градусов +Грозном +группа +группы +д +Д +Да +да +давно +дает +даже +Даже +дал +далеко +дальше +данным +дать +два +две +движения +двух +действий +действительно +действия +декабря +дел +дела +делаем +делам +делать +деле +дело +Дело +делу +день +деньги +десять +деятельности +деятельность +директора +для +Для +дней +дни +дня +днях +до +До +довольно +документы +долго +должен +должна +должно +должны +дом +дома +доме +достаточно +друг +друга +другие +другим +других +другое +другой +другом +е +Е +его +Его +едва +ее +ей +ему +если +Если +естественно +есть +Есть +еще +Еще +же +женщин +женщины +жизни +жизнь +жителей +жить +за +За +завода +закон +зам +заместитель +затем +заявил +заявление +здесь +Здесь +земли +знает +значит +знаю +знают +зрения +и +И +игры +идет +из +Из +Известий +Известия 
+Известиям +известно +или +Иллюстрация +им +имеет +имени +именно +Именно +иметь +имеют +имя +иначе +интервью +интересы +информацию +история +ИТАР +итоге +их +Их +июля +июня +й +к +К +каждая +каждого +каждый +кажется +как +Как +какие +каким +каких +какой +касается +качестве +квартиры +километров +когда +Когда +кого +количество +команда +команды +комиссии +комитета +комментариев +компания +кому +конечно +Конечно +конференции +конца +конце +коп +корреспонденту +которая +которого +которое +которой +котором +которому +которую +которые +который +которым +которыми +которых +края +Кроме +кроме +крупных +кстате +Кстати +кстати +кто +Кто +куда +Л +легко +лет +летний +ли +либо +лидер +лиц +лица +лично +лишь +лучше +любая +любой +людей +люди +людям +М +м +мало +марта +массовой +мая +между +Между +мене +менее +меньше +меня +мере +меры +места +месте +местных +место +месяц +месяца +месяцев +метра +метров +миллиарда +миллион +миллиона +минут +мире +мировой +Михаил +мне +Мне +многие +Многие +многих +много +мог +могла +могли +могут +может +Может +можно +Можно +мой +момент +мы +Мы +Н +на +На +НА +над +надо +назад +наиболее +найти +наконец +нам +например +народа +нас +находившегося +находится +начала +начале +начальник +начальника +наш +наша +нашего +нашей +наши +наших +не +Не +НЕ +невозможно +него +недавно +недели +неделю +нее +ней +некоторого +некоторые +некоторых +нельзя +нем +немало +нему +необходимо +нескольких +несколько +несмотря +нет +Нет +ни +Ни +нибудь +никак +никаких +никогда +Николай +никто +ним +ними +них +ничего +но +Но +нового +новой +новостей +новые +новый +новых +ноября +Ну +нужно +ныне +нынешнего +Нью +о +О +об +Об +области +образом +обычно +один +Один +одна +Однако +однако +одним +одно +одновременно +одного +одной +одном +одну +оказалась +оказались +оказалось +оказался +около +октября +он +Он +она +Она +они +Они +оно +операции +опыт +опять +органы +основном +особенно +остается +от +От +ответ +отдела +отличие +отношении +отношения +очень +очередной +очередь +П 
+партия +первая +первого +первой +первую +первые +первый +первым +первых +перед +период +письмо +площади +по +По +поводу +под +подобная +позиции +пока +Пока +политики +полностью +положение +полтора +получил +получили +получить +помощи +помощь +помощью +понять +пор +порядке +посколько +поскольку +после +После +последнее +последние +последний +последних +пост +постоянно +потом +Потом +потому +похоже +почему +Почему +почта +почти +Поэтому +поэтому +права +Правда +правда +правило +право +практически +предприятий +предприятия +председателя +представителей +представители +прежде +прежнему +премьер +премьера +Пресс +пресс +при +При +придется +примерно +примеру +принять +приходится +Причем +пришлось +про +проблем +проблема +проблемы +провести +продукции +проект +производства +производство +произошло +происходит +прокуратуры +просто +против +процента +процентов +процесс +прошла +прошлого +прошлом +прямо +пути +путь +пятая +пяти +пять +работа +работавшую +работает +работать +работе +работу +равно +раз +раза +развития +разных +района +районе +ранее +раньше +резко +результате +Рейтер +речь +решения +решил +решили +рода +роль +руб +рук +руках +руки +руководителей +руководитель +руководство +ряд +рядом +с +С +сам +сама +сами +самого +самое +самой +самом +самые +самый +самым +самых +сборной +свет +свое +своего +своей +своем +своему +свои +своим +своими +своих +свой +свою +связи +сделаем +сделал +сделать +себе +себя +сегодня +Сегодня +сейчас +Сейчас +семьи +сентября +Сергей +Сергея +силу +силы +система +системы +ситуации +ситуацию +ситуация +сих +скажем +сказал +сказать +сколько +скорее +следует +слишком +слова +словам +случае +случай +смерти +снова +со +собой +собственности +событий +события +совершенно +совсем +создать +сообща +сообщил +сообщили +состоянии +сотрудники +сотрудников +специалистов +специалисты +сразу +среди +Среди +средств +средства +срок +ссылка +стал +стала +стали +стало +станет +становится +стате +стать +степени +сто +стоит +столице +столь +столько +сторону 
+стороны +суббота +суда +сумму +сути +существует +счет +считает +считать +считают +т +так +Так +такая +также +таки +такие +таким +Таким +таких +такого +такое +такой +там +Там +ТАСС +те +театра +тебе +тем +Тем +теперь +Теперь +территории +тех +течение +то +То +тогда +Тогда +того +тоже +той +только +Только +том +тому +тонн +тот +точки +точнее +трех +три +труда +трудно +туда +тут +ты +тысяч +тысячи +у +У +удалось +уж +уже +Уже +уровень +уровне +условия +условиях +утверждает +утверждают +участие +участников +факт +февраля +фирм +фирма +фирмы +фонда +Фото +х +ходе +хорошо +хоть +хотя +Хотя +хочет +целом +центр +центра +центре +цены +час +часа +часов +части +частности +часто +часть +чаще +чего +человек +человека +чем +через +Через +четыре +четырех +числе +число +членов +что +Что +чтобы +Чтобы +чуть +шесть +эта +Эта +эти +Эти +этим +этих +это +Это +этого +этой +этом +этому +этот +Этот +эту +Ю +Юрий +я +Я +являетесь +является +явно +якобы +января +ясно diff --git a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt index 0629e2deb2..5ad5904ccd 100644 --- a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt @@ -1,22 +1,42 @@ -# # This is a stop word list for the Swedish language. 
# # Sources: # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-sv/blob/master/stopwords-sv.txt # that one Swedish journalist -# +# (Lightly edited to remove words in the original lists that are actually meaningful) +aderton +adertonde +adjö +aldrig alla +allas allt +alltid +alltså +andra +andras +annan +annat +artonde +artonn att av -blev -bli -blir -blivit +bara +bland borde +bort +borta båda +bådas +dag +dagar +dagarna +dagen de +del +delen dem den denna @@ -29,125 +49,309 @@ detta dig din dina +dit ditt -dom +dock du där +därför då +e efter eftersom egen ej +elfte eller +elva +emot en +enligt +ens er era +ers ert ett +ettusen fanns -finns +fem +femte +femtio +femtionde +femton +femtonde +fick +finnas +fjorton +fjortonde +fjärde +fler +flera +flesta från +fyra +fyrtio +fyrtionde få +får +fått +följande för före genom +gick gjorde gjort +god +goda +godare +godast +gälla +gäller +gällt +gärna +gå +går +gått +gör göra ha hade -hade -han +haft han hans -hans har +heller +hellre hen henne hennes +hit hon honom +hundra +hundraen +hundraett hur här +högst i -i +ibland icke +idag igen +igår +imorgon +in +inför +inga ingen +ingenting +inget innan +inne inom inte +inuti +ja jag +jo ju +just +jämfört kan +kanske +knappast +kom +komma +kommer +kommit +kr kunde +kunna kunnat -lite +kvar +legat +ligga +ligger man med +mej mellan men +mer +mera +mest mig min -min mina mitt +mittemot mot mycket +många +måste +möjlig +möjligen +möjligt +möjligtvis +nederst +nej +ner nere ni +nio +nionde +nittio +nittionde +nitton +nittonde +nog +noll +nr nu +nummer när +nästa någon +någonting något några +nån +nånting +nåt +nödvändig +nödvändiga +nödvändigt +nödvändigtvis och +också +ofta +oftast +olika +olikt om -oss på +rakt +redan +rätt +sa +sade +sagt samma sedan sen +senare +senast +sent +sex +sextio +sextionde +sexton +sextonde sig sin sina +sist +sista +siste +sitt sitta +sju +sjunde +sjuttio +sjuttionde +sjutton +sjuttonde själv +sjätte 
+ska +skall skulle +slutligen +snart som +säga +säger så sådan sådana sådant sån -till +ta +tack +tar till tills +tio +tionde +tjugo +tjugoen +tjugoett +tjugonde +tjugotre +tjugotvå +tjungo +tolfte +tolv +tre +tredje +trettio +trettionde +tretton +trettonde +två +tvåhundra under upp +ur ut utan +utanför ute +va vad var vara varför +varifrån varit varje +varken vars +varsågod vart vem +vems +verkligen vi vid +vidare +viktigare +viktigast vilka vilkas vilken vilket -vår +vill +väl +vänster +vänstra våra vårat vårt än +ännu är +även åt +åtminstone +åtta +åttio +åttionde +åttonde över +övermorgon +överst +övre \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/sv/sv_stop_words_old.txt b/apps/common/src/python/mediawords/languages/sv/sv_stop_words_old.txt new file mode 100644 index 0000000000..0629e2deb2 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/sv/sv_stop_words_old.txt @@ -0,0 +1,153 @@ +# +# This is a stop word list for the Swedish language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# that one Swedish journalist +# + +alla +allt +att +av +blev +bli +blir +blivit +borde +båda +de +dem +den +denna +dens +deras +dess +dessa +det +detta +dig +din +dina +ditt +dom +du +där +då +efter +eftersom +egen +ej +eller +en +er +era +ert +ett +fanns +finns +från +få +för +före +genom +gjorde +gjort +göra +ha +hade +hade +han +han +hans +hans +har +hen +henne +hennes +hon +honom +hur +här +i +i +icke +igen +ingen +innan +inom +inte +jag +ju +kan +kunde +kunnat +lite +man +med +mellan +men +mig +min +min +mina +mitt +mot +mycket +nere +ni +nu +när +någon +något +några +och +om +oss +på +samma +sedan +sen +sig +sin +sina +sitta +själv +skulle +som +så +sådan +sådana +sådant +sån +till +till +tills +under +upp +ut +utan +ute +vad +var +vara +varför +varit +varje +vars +vart +vem +vi +vid +vilka +vilkas +vilken +vilket +vår +våra +vårat +vårt +än +är +åt +över diff --git a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt old mode 100755 new mode 100644 index 2418be327c..4736179ead --- a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt @@ -1,44 +1,71 @@ -# -# This is a stop word list for the Turkish language. 
-# # Sources: +# # http://nlp.ceng.fatih.edu.tr/blog/?p=101 # http://www.ranks.nl/stopwords/turkish.html -# +# https://github.com/stopwords-iso/stopwords-tr/blob/master/stopwords-tr.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) a -acaba +acep +adamakıllı +adeta +ait altmýþ +altmış altý altı ama +amma +anca ancak +arada +artýk artık asla aslında +aynen +ayrıca az +açıkçası b bana +bari bazen bazý bazı bazıları bazısı +başkası +baţka belki ben benden beni benim +beri +beriki beþ beş +beţ +bilcümle bile bin +binaen +binaenaleyh bir +biraz +birazdan +birbiri +birden +birdenbire biri +birice +birileri birisi birkaç birkaçı birkez +birlikte birçok birçokları birçoğu @@ -46,56 +73,145 @@ birþey birþeyi birşey birşeyi +birţey +bitevi +biteviye +bittabi biz +bizatihi +bizce +bizcileyin bizden bize bizi bizim +bizimki +bizzat bu buna bunda bundan +bunlar +bunları +bunların bunu bunun +buracıkta burada +buradan +burası böyle böylece +böylecene +böylelikle +böylemesine +böylesine +büsbütün bütün c +cümlesi d da daha dahi +dahil +dahilen +daima +dair +dayanarak de defa +dek demek +demin +demincek +deminden +denli +derakap +derhal +derken +deđil değil +değin diye +diđer diğer diğeri diğerleri doksan dokuz dolayı +dolayısıyla +doğru dört e +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +elbet elbette elli +emme en en gibi +enikonu +epey +epeyce +epeyi +esasen +esnasında +etmesi +etti +ettiği +ettiğini +evleviyetle +evvel +evvela +evvelce +evvelden +evvelemirde +evveli +eđer +eğer f fakat falan felan -filan +filanca g +gah +gayet +gayetle +gayri +gayrı +gelgelelim gene +gerek +gerçi +geçende +geçenlerde gibi +gibilerden +gibisinden +göre h +hakeza +halbuki +halen +halihazırda +haliyle +handiyse hangi hangisi hani +hariç +hasebiyle hatta +hele hem henüz hep @@ -104,32 +220,64 @@ hepsine hepsini her her biri +herhangi herkes herkese herkesi +herkesin hiç hiç kimse +hiçbir hiçbiri hiçbirine hiçbirini hâlâ i +iken iki +ila ile 
-INSERmi +ilgili +ilk +illa +illaki +imdi +indinde +insermi ise +ister +itibaren +itibariyle +itibarıyla +iyi +iyice +iyicene için içinde -işte +iţte j k kadar +kah +kala +kanýmca +karşın katrilyon -kaç +kaynak +kaçı kendi +kendilerine kendine kendini +kendisi +kendisine +kendisini +kere kez +keza +kezalik +keşke +keţke ki kim kimden @@ -137,10 +285,26 @@ kime kimi kimin kimisi +kimse +kimsecik +kimsecikler +külliyen kýrk +kýsaca +kırk +kısaca l +lakin +lütfen m +maada madem +mademki +mamafih +mebni +meğer +meğerki +meğerse mi milyar milyon @@ -151,39 +315,93 @@ mı n nasýl nasıl +nasılsa +nazaran ne ne kadar ne zaman neden +nedeniyle +nedenle +nedense nedir nerde +nerden +nerdeyse +nere nerede nereden +neredeyse +neresi nereye nesi +netekim +neye +neyi neyse +nice +nihayet +nihayetinde +nitekim niye niçin o +olan +olarak +oldu +olduklarını +oldukça +olduğu +olduğunu +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor on ona +onca +onculayın +onda ondan onlar onlara onlardan onlari onlarýn +onları onların onu onu otuz onun +oracık +oracıkta orada +oradan +oranca +oranla +oraya +otuz oysa oysaki p +pek +pekala +peki +peyderpey r rağmen s +sadece +sahi +sahiden sana sanki sekiz @@ -199,44 +417,104 @@ sizi sizin son sonra +sonradan +sonraları +sonunda t tabi +tabii +tam tamam +tamamen +tamamıyla +tarafından +tek trilyon tüm tümü u v var +vardı +vasıtasıyla ve +velev +velhasıl +velhasılıkelam veya veyahut y ya ya da +yahut +yakinen +yakında +yakından +yakınlarda +yalnız +yalnızca yani +yapacak +yapmak +yaptı +yaptıkları +yaptığı +yaptığını +yapılan +yapılması +yapıyor yedi +yeniden +yenilerde yerine yetmiþ +yetmiş +yetmiţ yine yirmi +yok yoksa +yoluyla yüz +yüzünden z +zarfında zaten +zati zira ç +çabuk +çabukça +çeşitli çok +çokları +çoklarınca +çokluk +çoklukla +çokça çoğu +çoğun çoğuna +çoğunca +çoğunlukla çoğunu çünkü ö +öbürkü öbürü ön önce +önceden +önceleri +öncelikle +öteki +ötekisi ötürü öyle +öylece +öylelikle +öylemesine 
+öz ü üzere üç @@ -261,9 +539,18 @@ zira şimdi şu şuna +şuncacık şunda şundan şunlar +şunları şunu şunun +şura +şuracıkta +şurası şöyle +ţayet +ţimdi +ţu +ţöyle \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/tr/tr_stop_words_old.txt b/apps/common/src/python/mediawords/languages/tr/tr_stop_words_old.txt new file mode 100755 index 0000000000..2418be327c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/tr/tr_stop_words_old.txt @@ -0,0 +1,269 @@ +# +# This is a stop word list for the Turkish language. +# +# Sources: +# http://nlp.ceng.fatih.edu.tr/blog/?p=101 +# http://www.ranks.nl/stopwords/turkish.html +# + +a +acaba +altmýþ +altý +altı +ama +ancak +artık +asla +aslında +az +b +bana +bazen +bazý +bazı +bazıları +bazısı +belki +ben +benden +beni +benim +beþ +beş +bile +bin +bir +biri +birisi +birkaç +birkaçı +birkez +birçok +birçokları +birçoğu +birþey +birþeyi +birşey +birşeyi +biz +bizden +bize +bizi +bizim +bu +buna +bunda +bundan +bunu +bunun +burada +böyle +böylece +bütün +c +d +da +daha +dahi +de +defa +demek +değil +diye +diğer +diğeri +diğerleri +doksan +dokuz +dolayı +dört +e +elbette +elli +en +en gibi +f +fakat +falan +felan +filan +g +gene +gibi +h +hangi +hangisi +hani +hatta +hem +henüz +hep +hepsi +hepsine +hepsini +her +her biri +herkes +herkese +herkesi +hiç +hiç kimse +hiçbiri +hiçbirine +hiçbirini +hâlâ +i +iki +ile +INSERmi +ise +için +içinde +işte +j +k +kadar +katrilyon +kaç +kendi +kendine +kendini +kez +ki +kim +kimden +kime +kimi +kimin +kimisi +kýrk +l +m +madem +mi +milyar +milyon +mu +mü +mý +mı +n +nasýl +nasıl +ne +ne kadar +ne zaman +neden +nedir +nerde +nerede +nereden +nereye +nesi +neyse +niye +niçin +o +on +ona +ondan +onlar +onlara +onlardan +onlari +onlarýn +onların +onu +onu otuz +onun +orada +oysa +oysaki +p +r +rağmen +s +sana +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +size +sizi +sizin +son +sonra +t +tabi +tamam +trilyon +tüm +tümü +u +v +var +ve +veya 
+veyahut +y +ya +ya da +yani +yedi +yerine +yetmiþ +yine +yirmi +yoksa +yüz +z +zaten +zira +ç +çok +çoğu +çoğuna +çoğunu +çünkü +ö +öbürü +ön +önce +ötürü +öyle +ü +üzere +üç +þey +þeyden +þeyi +þeyler +þu +þuna +þunda +þundan +þunu +ğ +ı +ş +şayet +şey +şeyden +şeye +şeyi +şeyler +şimdi +şu +şuna +şunda +şundan +şunlar +şunu +şunun +şöyle diff --git a/apps/common/src/python/mediawords/languages/zh/__init__.py b/apps/common/src/python/mediawords/languages/zh/__init__.py index 909365b9dd..eb011d37be 100644 --- a/apps/common/src/python/mediawords/languages/zh/__init__.py +++ b/apps/common/src/python/mediawords/languages/zh/__init__.py @@ -32,6 +32,9 @@ class ChineseLanguage(StopWordsFromFileMixIn): # Stop words map '__stop_words_map', + # FIXME remove once stopword comparison is over + '__stop_words_old_map', + # Jieba instance '__jieba', diff --git a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt index 3eb0376f33..f80f970b44 100644 --- a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt @@ -1,10 +1,12 @@ -# Appended Traditional Chinese characters (Note: This does not include all stopwords in Cantonese or Taiwanese Mandarin) # Sources: # http://blog.csdn.net/shijiebei2009/article/details/39696571 # http://github.com/stopwords-iso/stopwords-zh +# +# (Lightly edited to remove words in the original lists that are actually meaningful) +# Appended Traditional Chinese characters (Note: This does not include all stopwords in Cantonese or Taiwanese Mandarin + ! " -# $ % & @@ -129,38 +131,6 @@ sup 一 一. 
一一 -一下 -一个 -一些 -一何 -一來 -一個 -一切 -一则 -一则通过 -一則 -一則通過 -一天 -一定 -一方面 -一旦 -一时 -一時 -一来 -一样 -一樣 -一次 -一片 -一番 -一直 -一致 -一般 -一起 -一轉眼 -一转眼 -一边 -一邊 -一面 七 万一 三 @@ -204,8 +174,6 @@ sup 不光 不免 不再 -不力 -不勝 不单 不变 不只 @@ -224,8 +192,6 @@ sup 不如 不妨 不定 -不对 -不對 不少 不尽 不尽然 @@ -239,12 +205,8 @@ sup 不必 不怎么 不怎麼 -不怕 不惟 -不成 不拘 -不择手段 -不擇手段 不敢 不料 不断 @@ -259,8 +221,6 @@ sup 不止一次 不比 不消 -不满 -不滿 不然 不然的話 不然的话 @@ -277,7 +237,6 @@ sup 不管怎樣 不經意 不经意 -不胜 不能 不能不 不至于 @@ -306,8 +265,6 @@ sup 且說 且说 两者 -严格 -严重 並 並不 並不是 @@ -325,7 +282,6 @@ sup 中小 中間 中间 -丰富 串行 临 临到 @@ -341,7 +297,6 @@ sup 主张 主張 主要 -举凡 举行 乃 乃至 @@ -355,20 +310,12 @@ sup 之後 之所以 之类 -之類 -乌乎 -乎 -乒 乘 -乘势 乘勝 乘勢 乘机 乘機 乘胜 -乘虚 -乘虛 -乘隙 九 也 也好 @@ -397,7 +344,6 @@ sup 互相 五 些 -交口 亦 产生 亲口 @@ -437,9 +383,6 @@ sup 从中 从事 从今以后 -从优 -从古到今 -从古至今 从头 从宽 从小 @@ -491,17 +434,12 @@ sup 任憑 企图 企圖 -伙同 会 -伟大 传 -传说 -传闻 似乎 似的 但 但凡 -但愿 但是 但願 何 @@ -550,16 +488,9 @@ sup 依據 依照 依靠 -便 便于 便於 係 -促进 -促進 -保持 -保管 -保险 -保險 俺 俺们 俺們 @@ -584,7 +515,6 @@ sup 假使 假如 假若 -偉大 偏偏 做到 偶尔 @@ -592,8 +522,6 @@ sup 偶而 傥然 傳 -傳聞 -傳說 僅 僅僅 像 @@ -633,7 +561,6 @@ sup 八成 公然 六 -兮 共 共同 共总 @@ -676,8 +603,6 @@ sup 再說 再説 再说 -冒 -冲 决不 决定 决非 @@ -716,6 +641,7 @@ sup 切莫 则 则甚 +则通过 刚 刚好 刚巧 @@ -750,6 +676,7 @@ sup 到頭來 則 則甚 +則通過 前后 前後 前此 @@ -765,14 +692,8 @@ sup 加之 加以 加入 -加強 -加强 动不动 -动辄 -勃然 動不動 -動輒 -匆匆 十分 千 千万 @@ -831,8 +752,6 @@ sup 取道 受到 变成 -古來 -古来 另 另一个 另一個 @@ -843,7 +762,6 @@ sup 另行 只 只当 -只怕 只是 只有 只消 @@ -854,12 +772,8 @@ sup 叫做 召开 召開 -叮咚 -叮噹 -叮当 可 可以 -可好 可是 可能 可見 @@ -898,14 +812,11 @@ sup 吧 吧哒 吧噠 -吱 吶 呀 呃 呆呆地 呐 -呕 -呗 呜 呜呼 呢 @@ -913,29 +824,19 @@ sup 周圍 呵 呵呵 -呸 -呼哧 -呼啦 咁 咋 和 -咚 -咦 咧 咱 咱们 咱們 -咳 哇 哈 哈哈 哉 哎 -哎呀 -哎哟 哎喲 -哗 -哗啦 哟 哦 哩 @@ -957,7 +858,6 @@ sup 哪里 哼 哼唷 -唄 唉 唔 唯有 @@ -966,14 +866,10 @@ sup 啊哈 啊哟 啊喲 -問題 -啐 啥 啦 啪达 啪達 -啷噹 -啷当 喀 喂 喏 @@ -987,28 +883,16 @@ sup 嗚 嗚呼 嗡 -嗡嗡 嗬 嗯 -嗳 -嘅 嘍 嘎 -嘎嘎 嘎登 -嘔 -嘘 嘛 嘩 嘩啦 -嘻 嘿 嘿嘿 -噓 -噯 -嚇 -嚴格 -嚴重 四 因 因为 @@ -1019,7 +903,6 @@ sup 因着 因而 因著 -固 固然 在 在下 @@ -1040,7 +923,6 @@ sup 处在 处处 处理 -复杂 多 多么 多亏 @@ -1060,20 +942,13 @@ sup 夠瞧的 夥同 大 -大不了 -大举 -大事 大体 大体上 -大凡 -大力 大多 大多数 大多數 大大 大家 -大张旗鼓 -大張旗鼓 大批 大抵 大概 @@ 
-1081,24 +956,16 @@ sup 大約 大约 大致 -大舉 大都 大量 -大面儿上 -大面兒上 大體 大體上 -失去 -奇 -奈 -奋勇 -奮勇 +天 她 她们 她們 她是 她的 -好 好像 好在 好的 @@ -1137,14 +1004,9 @@ sup 它們的 它是 它的 -安全 -完全 完成 定 -实现 实际 -宣布 -容易 密切 實現 實際 @@ -1152,22 +1014,17 @@ sup 寧可 寧肯 寧願 -对 对于 对应 对待 对方 对比 将 -将才 将要 将近 將 -將才 將要 將近 -專門 -對 對待 對應 對方 @@ -1221,7 +1078,6 @@ sup 岂但 岂止 岂非 -川流不息 左右 巨大 巩固 @@ -1234,7 +1090,6 @@ sup 已經 已经 巴 -巴巴 带 帮助 帶 @@ -1266,26 +1121,17 @@ sup 幾時 幾番 幾經 -广大 广泛 应当 应用 应该 -庶乎 -庶几 -庶幾 廣大 廣泛 开外 开始 开展 引起 -弗 -強烈 -強調 弹指之间 -强烈 -强调 彈指之間 归 归根到底 @@ -1300,8 +1146,6 @@ sup 当口儿 当地 当场 -当头 -当庭 当时 当然 当真 @@ -1329,27 +1173,16 @@ sup 得了 得出 得到 -得天独厚 -得天獨厚 -得起 從 從不 從中 從事 從今以後 從來 -從優 -從古到今 -從古至今 -從嚴 -從寬 -從小 -從新 從早到晚 從未 從此 從此以後 -從無到有 從而 從輕 從速 @@ -1363,8 +1196,6 @@ sup 必定 必将 必將 -必然 -必要 必須 必须 快 @@ -1375,16 +1206,12 @@ sup 怎么 怎么办 怎么样 -怎奈 怎样 怎樣 怎麼 怎麼樣 怎麼辦 怎麽 -怕 -急匆匆 -怪 怪不得 总之 总是 @@ -1394,7 +1221,6 @@ sup 总结 总而言之 恍然 -恐怕 恰似 恰好 恰如 @@ -1407,23 +1233,15 @@ sup 您們 您是 惟其 -惯常 意思 -愤然 愿意 慢說 慢説 慢说 慣常 -憑 -憑藉 -憤然 -應用 應當 應該 成为 -成年 -成年累月 成心 成為 我 @@ -1440,7 +1258,6 @@ sup 或者 或許 或许 -战斗 截然 截至 戰鬥 @@ -1451,20 +1268,10 @@ sup 所有 所謂 所谓 -才 -才能 -扑通 -打 -打从 -打开天窗说亮话 -打從 -打開天窗說亮話 -扩大 把 抑或 报导 报道 -抽冷子 拦腰 拿 指 @@ -1472,7 +1279,6 @@ sup 按 按时 按時 -按期 按照 按理 按說 @@ -1508,7 +1314,6 @@ sup 換句話說 換句話説 換言之 -撲通 據 據實 據悉 @@ -1516,7 +1321,6 @@ sup 據此 據稱 據說 -擴大 攔腰 放量 故 @@ -1538,7 +1342,6 @@ sup 方 方便 方才 -方能 方面 於 於是 @@ -1560,11 +1363,9 @@ sup 日漸 日益 日臻 -日見 -日见 +旦 时 时候 -昂然 明显 明确 明確 @@ -1578,8 +1379,6 @@ sup 显著 時 時候 -普通 -普遍 暗中 暗地裡 暗地里 @@ -1594,21 +1393,17 @@ sup 曾經 曾经 替 -替代 最 最后 最大 最好 最後 最近 -最高 會 月 有 有些 有关 -有利 -有力 有及 有所 有效 @@ -1645,10 +1440,6 @@ sup 极为 极了 极其 -极力 -极大 -极度 -极端 构成 果然 果真 @@ -1657,32 +1448,29 @@ sup 某些 某個 某某 +样 根据 根據 根本 格外 -梆 -極 極了 極其 極力 -極大 極度 極為 極端 概 構成 +樣 權時 +次 次第 欢迎 欤 歟 -歡迎 -正值 正在 正如 正巧 -正常 正是 此 此中 @@ -1701,7 +1489,6 @@ sup 歸 歸根到底 歸根結底 -歸齊 殆 毋宁 毋寧 @@ -1734,7 +1521,6 @@ sup 毫無 毫無例外 毫無保留地 -汝 決不 決定 決非 @@ -1743,22 +1529,14 @@ sup 沒有 沙沙 没 -没奈何 没有 沿 沿着 沿著 況且 注意 -活 -深入 -清楚 湊巧 準備 -满 -满足 -滿 -滿足 漫說 漫説 漫说 @@ -1771,8 +1549,6 @@ sup 為止 為此 為著 -烏乎 -焉 無 無寧 無法 @@ -1797,18 +1573,14 @@ sup 爾後 爾爾 
爾等 +片 牢牢 特別是 特别是 -特殊 特点 -特約 -特约 特點 -犹且 犹自 独 -独媒特约 独自 猛然 猛然間 @@ -1816,15 +1588,12 @@ sup 猶且 猶自 獨 -獨媒特約 獨自 獲得 率尔 率然 率爾 -现代 现在 -現代 現在 理应 理当 @@ -1862,6 +1631,7 @@ sup 略加 略微 略為 +番 當 當下 當中 @@ -1869,7 +1639,6 @@ sup 當前 當即 當口兒 -當地 當場 當庭 當時 @@ -1877,8 +1646,6 @@ sup 當真 當着 當著 -當頭 -白 白白 的 的确 @@ -1886,7 +1653,6 @@ sup 的話 的话 皆可 -盡 盡可能 盡如人意 盡心盡力 @@ -1895,6 +1661,7 @@ sup 盡然 盡量 目前 +直 直到 直接 相似 @@ -1937,8 +1704,6 @@ sup 确定 碰巧 確定 -社会主义 -社會主義 离 种 积极 @@ -1949,12 +1714,8 @@ sup 稱 積極 究竟 -穷年累月 突出 突然 -窃 -窮年累月 -竊 立 立刻 立即 @@ -2030,7 +1791,6 @@ sup 纵令 纵使 纵然 -练习 组成 经 经常 @@ -2044,7 +1804,6 @@ sup 绝非 绝顶 继之 -继后 继续 继而 维持 @@ -2052,8 +1811,6 @@ sup 缕缕 罢了 罷了 -老 -老大 老是 老老实实 老老實實 @@ -2074,11 +1831,6 @@ sup 而論 而论 联系 -联袂 -聯繫 -聯袂 -背地裡 -背地里 背靠背 能 能否 @@ -2098,7 +1850,6 @@ sup 自己 自後 自從 -自打 自身 臭 至 @@ -2112,17 +1863,14 @@ sup 與其說 與否 與此同時 -舉凡 舉行 +般 般的 良好 若 -若夫 若是 -若果 若非 范围 -莫 莫不 莫不然 莫如 @@ -2147,12 +1895,10 @@ sup 行动 行動 行為 -衝 表明 表示 被 裡面 -複雜 要 要不 要不是 @@ -2178,7 +1924,6 @@ sup 許多 話說 該 -該當 認為 認爲 認真 @@ -2195,18 +1940,13 @@ sup 誰知 請勿 論 -論說 諸 諸位 諸如 謹 -譬喻 譬如 變成 讓 -认为 -认真 -认识 让 许多 论 @@ -2237,12 +1977,9 @@ sup 豈但 豈止 豈非 -豐富 -賊死 賴以 贼死 赖以 -赶 赶快 赶早不赶晚 起 @@ -2255,10 +1992,6 @@ sup 起见 起頭 起首 -趁 -趁便 -趁势 -趁勢 趁早 趁机 趁機 @@ -2267,8 +2000,6 @@ sup 趁着 趁著 越是 -趕 -趕快 趕早不趕晚 距 跟 @@ -2278,23 +2009,15 @@ sup 較之 較比 較為 -轉動 -轉變 -轉貼 -轟然 -转动 -转变 -转贴 -轰然 +轉眼 +转眼 较 较为 较之 较比 边 达到 -达旦 迄 -迅速 过 过于 过去 @@ -2334,7 +2057,6 @@ sup 进入 进去 进来 -进步 进而 进行 连 @@ -2400,8 +2122,6 @@ sup 過去 過於 達到 -達旦 -適應 適用 適當 遭到 @@ -2468,7 +2188,6 @@ sup 關於 问题 间或 -防止 阿 附近 陈年 @@ -2487,7 +2206,6 @@ sup 除此而外 除開 除非 -陳年 随 随后 随时 @@ -2504,7 +2222,6 @@ sup 难说 难道 难道说 -集中 雖 雖則 雖然 @@ -2517,24 +2234,21 @@ sup 難說 難道 難道說 -雲爾 零 需要 非但 非常 非徒 非得 -非特 非独 非獨 靠 -鞏固 +面 頂多 頃 頃刻 頃刻之間 頃刻間 -順 順着 順著 頓時 @@ -2555,12 +2269,10 @@ sup 風雨無阻 风雨无阻 飽 -餘外 餵 饱 首先 馬上 -騰 马上 高低 高兴 @@ -2568,8 +2280,6 @@ sup 麼 默然 默默地 -齊 -齐 ︿ ! 
# diff --git a/apps/common/src/python/mediawords/languages/zh/zh_stop_words_old.txt b/apps/common/src/python/mediawords/languages/zh/zh_stop_words_old.txt new file mode 100644 index 0000000000..3eb0376f33 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/zh/zh_stop_words_old.txt @@ -0,0 +1,2727 @@ +# Appended Traditional Chinese characters (Note: This does not include all stopwords in Cantonese or Taiwanese Mandarin) +# Sources: +# http://blog.csdn.net/shijiebei2009/article/details/39696571 +# http://github.com/stopwords-iso/stopwords-zh +! +" +# +$ +% +& +( +) +* ++ +, +- +-- +. +.. +... +...... +................... +./ +.一 +.数 +.數 +.日 +/ +// +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +:// +:: +; +< += +> +>> +? +@ +[ +\ +] +^ +_ +` +A +exp +Lex +sub +sup +| +} +~ +~~~~ +· +× +××× +γ +Δ +μ +φ +φ. +Ψ +В +— +—— +——— +‘ +’ +’‘ +“ +” +”, +… +…… +…………………………………………………③ +′∈ +′| +℃ +Ⅲ +↑ +→ +∈[ +∪φ∈ +≈ +① +② +②c +③ +③] +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +── +■ +▲ +  +、 +。 +〈 +〉 +《 +》 +》), +「 +」 +『 +』 +【 +】 +〔 +〕 +〕〔 +㈧ +一 +一. 
+一一 +一下 +一个 +一些 +一何 +一來 +一個 +一切 +一则 +一则通过 +一則 +一則通過 +一天 +一定 +一方面 +一旦 +一时 +一時 +一来 +一样 +一樣 +一次 +一片 +一番 +一直 +一致 +一般 +一起 +一轉眼 +一转眼 +一边 +一邊 +一面 +七 +万一 +三 +三天两头 +三天兩頭 +三番两次 +三番五次 +三番兩次 +上 +上下 +上來 +上升 +上去 +上来 +上述 +上面 +下 +下來 +下列 +下去 +下来 +下面 +不 +不一 +不下 +不久 +不了 +不亦乐乎 +不亦樂乎 +不仅 +不仅...而且 +不仅仅 +不仅仅是 +不会 +不但 +不但...而且 +不僅 +不僅...而且 +不僅僅 +不僅僅是 +不光 +不免 +不再 +不力 +不勝 +不单 +不变 +不只 +不可 +不可开交 +不可抗拒 +不可開交 +不同 +不問 +不單 +不外 +不外乎 +不够 +不夠 +不大 +不如 +不妨 +不定 +不对 +不對 +不少 +不尽 +不尽然 +不巧 +不已 +不常 +不得 +不得不 +不得了 +不得已 +不必 +不怎么 +不怎麼 +不怕 +不惟 +不成 +不拘 +不择手段 +不擇手段 +不敢 +不料 +不断 +不斷 +不日 +不时 +不是 +不時 +不曾 +不會 +不止 +不止一次 +不比 +不消 +不满 +不滿 +不然 +不然的話 +不然的话 +不特 +不独 +不獨 +不由得 +不盡 +不盡然 +不知不覺 +不知不觉 +不管 +不管怎样 +不管怎樣 +不經意 +不经意 +不胜 +不能 +不能不 +不至于 +不至於 +不若 +不要 +不論 +不變 +不论 +不起 +不足 +不过 +不迭 +不過 +不问 +不限 +与 +与其 +与其说 +与否 +与此同时 +专门 +且 +且不說 +且不说 +且說 +且说 +两者 +严格 +严重 +並 +並不 +並不是 +並且 +並排 +並沒 +並沒有 +並無 +並肩 +並非 +个 +个人 +个别 +中 +中小 +中間 +中间 +丰富 +串行 +临 +临到 +为 +为主 +为了 +为什么 +为什麽 +为何 +为止 +为此 +为着 +主张 +主張 +主要 +举凡 +举行 +乃 +乃至 +乃至于 +乃至於 +么 +之 +之一 +之前 +之后 +之後 +之所以 +之类 +之類 +乌乎 +乎 +乒 +乘 +乘势 +乘勝 +乘勢 +乘机 +乘機 +乘胜 +乘虚 +乘虛 +乘隙 +九 +也 +也好 +也就是說 +也就是说 +也是 +也罢 +也罷 +了 +了解 +争取 +二 +二來 +二来 +二話不說 +二話沒說 +二话不说 +二话没说 +于 +于是 +于是乎 +云云 +云尔 +云爾 +互 +互相 +五 +些 +交口 +亦 +产生 +亲口 +亲手 +亲眼 +亲自 +亲身 +人 +人人 +人们 +人們 +人家 +人民 +什么 +什么样 +什麼 +什麼樣 +什麽 +仅 +仅仅 +今 +今后 +今天 +今年 +今後 +今日 +今次 +介于 +介於 +仍 +仍旧 +仍然 +仍舊 +从 +从不 +从严 +从中 +从事 +从今以后 +从优 +从古到今 +从古至今 +从头 +从宽 +从小 +从新 +从无到有 +从早到晚 +从未 +从来 +从此 +从此以后 +从而 +从轻 +从速 +从重 +他 +他人 +他们 +他們 +他是 +他的 +代替 +令 +以 +以上 +以下 +以为 +以來 +以便 +以免 +以前 +以及 +以后 +以外 +以後 +以故 +以期 +以来 +以為 +以至 +以至于 +以至於 +以致 +们 +任 +任何 +任凭 +任务 +任務 +任憑 +企图 +企圖 +伙同 +会 +伟大 +传 +传说 +传闻 +似乎 +似的 +但 +但凡 +但愿 +但是 +但願 +何 +何乐而不为 +何以 +何况 +何嘗 +何处 +何妨 +何尝 +何必 +何时 +何時 +何樂而不為 +何止 +何況 +何苦 +何處 +何須 +何须 +余外 +作为 +作為 +作爲 +你 +你们 +你們 +你是 +你的 +佢 +使 +使得 +使用 +來 +來不及 +來得及 +來看 +來着 +來自 +來著 +來說 +來講 +例如 +依 +依据 +依據 +依照 +依靠 +便 +便于 +便於 +係 +促进 +促進 +保持 +保管 +保险 +保險 +俺 +俺们 +俺們 +個 +個人 +個別 +倍加 +倍感 +們 +倒不如 +倒不如說 +倒不如说 +倒是 +倘 +倘使 +倘或 +倘然 +倘若 +借 +借以 +借此 +假使 +假如 +假若 +偉大 +偏偏 +做到 +偶尔 +偶爾 +偶而 +傥然 +傳 +傳聞 +傳說 +僅 +僅僅 +像 +儘 +儘早 +儘管 +儘管如此 +儻然 +儿 +允許 +允许 +元/吨 +元/噸 
+充其极 +充其極 +充其量 +充分 +先不先 +先后 +先後 +先生 +光 +光是 +兒 +內 +全体 +全力 +全年 +全然 +全身心 +全部 +全都 +全面 +全體 +兩者 +八 +八成 +公然 +六 +兮 +共 +共同 +共总 +共總 +关于 +其 +其一 +其中 +其二 +其他 +其余 +其后 +其它 +其实 +其實 +其後 +其次 +其餘 +具体 +具体地说 +具体来说 +具体说来 +具有 +具體 +具體來說 +具體來説 +具體地說 +具體說來 +兼之 +内 +再 +再其次 +再则 +再則 +再有 +再次 +再者 +再者說 +再者说 +再說 +再説 +再说 +冒 +冲 +决不 +决定 +决非 +况且 +准备 +凑巧 +凝神 +几 +几乎 +几度 +几时 +几番 +几经 +凡 +凡是 +凭 +凭借 +出 +出于 +出來 +出去 +出於 +出来 +出现 +出現 +分別 +分别 +分头 +分期 +分期分批 +分頭 +切 +切不可 +切切 +切勿 +切莫 +则 +则甚 +刚 +刚好 +刚巧 +刚才 +初 +別 +別人 +別是 +別的 +別管 +別處 +別說 +別説 +别 +别人 +别处 +别是 +别的 +别管 +别说 +到 +到了儿 +到了兒 +到处 +到头 +到头来 +到底 +到目前为止 +到目前為止 +到處 +到頭 +到頭來 +則 +則甚 +前后 +前後 +前此 +前者 +前进 +前進 +前面 +剛 +剛好 +剛巧 +剛才 +加上 +加之 +加以 +加入 +加強 +加强 +动不动 +动辄 +勃然 +動不動 +動輒 +匆匆 +十分 +千 +千万 +千万千万 +千萬 +千萬千萬 +半 +单 +单单 +单纯 +即 +即令 +即使 +即便 +即刻 +即如 +即将 +即將 +即或 +即是說 +即是说 +即若 +却 +却不 +卻 +卻不 +历 +原來 +原来 +去 +又 +又及 +及 +及其 +及时 +及時 +及至 +双方 +反之 +反之亦然 +反之则 +反之則 +反倒 +反倒是 +反应 +反應 +反手 +反映 +反而 +反过来 +反过来说 +反過來 +反過來說 +反過來説 +取得 +取道 +受到 +变成 +古來 +古来 +另 +另一个 +另一個 +另一方面 +另外 +另悉 +另方面 +另行 +只 +只当 +只怕 +只是 +只有 +只消 +只當 +只要 +只限 +叫 +叫做 +召开 +召開 +叮咚 +叮噹 +叮当 +可 +可以 +可好 +可是 +可能 +可見 +可见 +各 +各个 +各人 +各位 +各個 +各地 +各式 +各种 +各種 +各級 +各级 +各自 +合理 +同 +同一 +同时 +同時 +同样 +同樣 +后 +后来 +后者 +后面 +向 +向使 +向着 +向著 +吓 +吗 +否则 +否則 +吧 +吧哒 +吧噠 +吱 +吶 +呀 +呃 +呆呆地 +呐 +呕 +呗 +呜 +呜呼 +呢 +周围 +周圍 +呵 +呵呵 +呸 +呼哧 +呼啦 +咁 +咋 +和 +咚 +咦 +咧 +咱 +咱们 +咱們 +咳 +哇 +哈 +哈哈 +哉 +哎 +哎呀 +哎哟 +哎喲 +哗 +哗啦 +哟 +哦 +哩 +哪 +哪个 +哪些 +哪個 +哪儿 +哪兒 +哪天 +哪年 +哪怕 +哪样 +哪樣 +哪裏 +哪裡 +哪边 +哪邊 +哪里 +哼 +哼唷 +唄 +唉 +唔 +唯有 +啊 +啊呀 +啊哈 +啊哟 +啊喲 +問題 +啐 +啥 +啦 +啪达 +啪達 +啷噹 +啷当 +喀 +喂 +喏 +喔唷 +單 +單單 +單純 +喲 +喽 +嗎 +嗚 +嗚呼 +嗡 +嗡嗡 +嗬 +嗯 +嗳 +嘅 +嘍 +嘎 +嘎嘎 +嘎登 +嘔 +嘘 +嘛 +嘩 +嘩啦 +嘻 +嘿 +嘿嘿 +噓 +噯 +嚇 +嚴格 +嚴重 +四 +因 +因为 +因了 +因此 +因為 +因爲 +因着 +因而 +因著 +固 +固然 +在 +在下 +在于 +在於 +地 +均 +坚决 +坚持 +基于 +基於 +基本 +基本上 +堅持 +堅決 +報導 +報道 +处在 +处处 +处理 +复杂 +多 +多么 +多亏 +多多 +多多少少 +多多益善 +多少 +多年來 +多年前 +多年来 +多数 +多數 +多次 +多虧 +多麼 +够瞧的 +夠瞧的 +夥同 +大 +大不了 +大举 +大事 +大体 +大体上 +大凡 +大力 +大多 +大多数 +大多數 +大大 +大家 +大张旗鼓 +大張旗鼓 +大批 +大抵 +大概 +大略 +大約 +大约 +大致 +大舉 +大都 +大量 +大面儿上 +大面兒上 +大體 +大體上 +失去 +奇 +奈 +奋勇 +奮勇 +她 +她们 +她們 +她是 +她的 +好 +好像 +好在 +好的 +好象 +如 +如上 +如上所述 +如下 +如今 +如何 +如其 +如前所述 +如同 +如常 +如是 
+如期 +如果 +如次 +如此 +如此等等 +如若 +始而 +姑且 +存在 +存心 +孰料 +孰知 +宁 +宁可 +宁愿 +宁肯 +它 +它们 +它们的 +它們 +它們的 +它是 +它的 +安全 +完全 +完成 +定 +实现 +实际 +宣布 +容易 +密切 +實現 +實際 +寧 +寧可 +寧肯 +寧願 +对 +对于 +对应 +对待 +对方 +对比 +将 +将才 +将要 +将近 +將 +將才 +將要 +將近 +專門 +對 +對待 +對應 +對方 +對於 +對比 +小 +少数 +少數 +尔 +尔后 +尔尔 +尔等 +尚且 +尤其 +就 +就地 +就是 +就是了 +就是說 +就是説 +就是说 +就此 +就算 +就要 +尽 +尽可能 +尽如人意 +尽心尽力 +尽心竭力 +尽快 +尽早 +尽然 +尽管 +尽管如此 +尽量 +局外 +居然 +屆時 +届时 +属于 +屡 +屡屡 +屡次 +屡次三番 +屢 +屢屢 +屢次 +屢次三番 +屬於 +岂 +岂但 +岂止 +岂非 +川流不息 +左右 +巨大 +巩固 +差一点 +差一點 +差不多 +己 +已 +已矣 +已經 +已经 +巴 +巴巴 +带 +帮助 +帶 +常 +常常 +常言說 +常言說得好 +常言说 +常言说得好 +常言道 +幫助 +平素 +年 +年复一年 +年復一年 +并 +并不 +并不是 +并且 +并排 +并无 +并没 +并没有 +并肩 +并非 +幾 +幾乎 +幾度 +幾時 +幾番 +幾經 +广大 +广泛 +应当 +应用 +应该 +庶乎 +庶几 +庶幾 +廣大 +廣泛 +开外 +开始 +开展 +引起 +弗 +強烈 +強調 +弹指之间 +强烈 +强调 +彈指之間 +归 +归根到底 +归根结底 +归齐 +当 +当下 +当中 +当儿 +当前 +当即 +当口儿 +当地 +当场 +当头 +当庭 +当时 +当然 +当真 +当着 +形成 +彻夜 +彻底 +彼 +彼时 +彼時 +彼此 +往 +往往 +待 +待到 +很 +很多 +很少 +後 +後來 +後来 +後者 +後面 +得 +得了 +得出 +得到 +得天独厚 +得天獨厚 +得起 +從 +從不 +從中 +從事 +從今以後 +從來 +從優 +從古到今 +從古至今 +從嚴 +從寬 +從小 +從新 +從早到晚 +從未 +從此 +從此以後 +從無到有 +從而 +從輕 +從速 +從重 +從頭 +徹夜 +徹底 +心裡 +心里 +必 +必定 +必将 +必將 +必然 +必要 +必須 +必须 +快 +快要 +忽地 +忽然 +怎 +怎么 +怎么办 +怎么样 +怎奈 +怎样 +怎樣 +怎麼 +怎麼樣 +怎麼辦 +怎麽 +怕 +急匆匆 +怪 +怪不得 +总之 +总是 +总的来看 +总的来说 +总的说来 +总结 +总而言之 +恍然 +恐怕 +恰似 +恰好 +恰如 +恰巧 +恰恰 +恰恰相反 +恰逢 +您 +您们 +您們 +您是 +惟其 +惯常 +意思 +愤然 +愿意 +慢說 +慢説 +慢说 +慣常 +憑 +憑藉 +憤然 +應用 +應當 +應該 +成为 +成年 +成年累月 +成心 +成為 +我 +我们 +我們 +我是 +我的 +或 +或则 +或則 +或多或少 +或是 +或曰 +或者 +或許 +或许 +战斗 +截然 +截至 +戰鬥 +所 +所以 +所在 +所幸 +所有 +所謂 +所谓 +才 +才能 +扑通 +打 +打从 +打开天窗说亮话 +打從 +打開天窗說亮話 +扩大 +把 +抑或 +报导 +报道 +抽冷子 +拦腰 +拿 +指 +指出 +按 +按时 +按時 +按期 +按照 +按理 +按說 +按说 +挨个 +挨個 +挨家挨戶 +挨家挨户 +挨次 +挨着 +挨著 +挨門挨戶 +挨門逐戶 +挨门挨户 +挨门逐户 +换句话说 +换言之 +据 +据实 +据悉 +据我所知 +据此 +据称 +据说 +掌握 +採取 +接下來 +接下来 +接着 +接著 +接连不断 +接連不斷 +換句話說 +換句話説 +換言之 +撲通 +據 +據實 +據悉 +據我所知 +據此 +據稱 +據說 +擴大 +攔腰 +放量 +故 +故意 +故此 +故而 +敞开儿 +敞開兒 +敢 +敢于 +敢情 +敢於 +数/ +整个 +整個 +數/ +断然 +斷然 +方 +方便 +方才 +方能 +方面 +於 +於是 +於是乎 +旁人 +无 +无宁 +无法 +无论 +既 +既...又 +既往 +既是 +既然 +日 +日复一日 +日復一日 +日渐 +日漸 +日益 +日臻 +日見 +日见 +时 +时候 +昂然 +明显 +明确 +明確 +明顯 +是 +是不是 +是以 +是否 +是的 +显然 +显著 +時 +時候 +普通 +普遍 +暗中 +暗地裡 +暗地里 +暗自 +更 +更为 +更加 +更為 +更进一步 +更進一步 +曾 +曾經 +曾经 +替 
+替代 +最 +最后 +最大 +最好 +最後 +最近 +最高 +會 +月 +有 +有些 +有关 +有利 +有力 +有及 +有所 +有效 +有时 +有時 +有点 +有的 +有的是 +有着 +有著 +有關 +有點 +望 +朝 +朝着 +朝著 +末##末 +本 +本人 +本地 +本着 +本著 +本身 +权时 +来 +来不及 +来得及 +来看 +来着 +来自 +来讲 +来说 +极 +极为 +极了 +极其 +极力 +极大 +极度 +极端 +构成 +果然 +果真 +某 +某个 +某些 +某個 +某某 +根据 +根據 +根本 +格外 +梆 +極 +極了 +極其 +極力 +極大 +極度 +極為 +極端 +概 +構成 +權時 +次第 +欢迎 +欤 +歟 +歡迎 +正值 +正在 +正如 +正巧 +正常 +正是 +此 +此中 +此后 +此地 +此处 +此外 +此後 +此时 +此時 +此次 +此處 +此間 +此间 +歷 +歸 +歸根到底 +歸根結底 +歸齊 +殆 +毋宁 +毋寧 +每 +每个 +每個 +每天 +每年 +每当 +每时每刻 +每時每刻 +每每 +每當 +每逢 +比 +比及 +比如 +比如說 +比如说 +比方 +比照 +比起 +比較 +比较 +毕竟 +毫不 +毫无 +毫无例外 +毫无保留地 +毫無 +毫無例外 +毫無保留地 +汝 +決不 +決定 +決非 +沒 +沒奈何 +沒有 +沙沙 +没 +没奈何 +没有 +沿 +沿着 +沿著 +況且 +注意 +活 +深入 +清楚 +湊巧 +準備 +满 +满足 +滿 +滿足 +漫說 +漫説 +漫说 +為 +為主 +為了 +為什麼 +為什麽 +為何 +為止 +為此 +為著 +烏乎 +焉 +無 +無寧 +無法 +無論 +然 +然则 +然則 +然后 +然後 +然而 +照 +照着 +照著 +爭取 +爲了 +爲什麼 +爲何 +爲甚麼 +爲着 +爲著 +爾 +爾後 +爾爾 +爾等 +牢牢 +特別是 +特别是 +特殊 +特点 +特約 +特约 +特點 +犹且 +犹自 +独 +独媒特约 +独自 +猛然 +猛然間 +猛然间 +猶且 +猶自 +獨 +獨媒特約 +獨自 +獲得 +率尔 +率然 +率爾 +现代 +现在 +現代 +現在 +理应 +理当 +理應 +理當 +理該 +理该 +瑟瑟 +甚且 +甚么 +甚或 +甚而 +甚至 +甚至于 +甚至於 +甚麼 +甚麼樣 +甚麽 +產生 +用 +用來 +用来 +甫 +甭 +由 +由于 +由於 +由是 +由此 +由此可見 +由此可见 +畢竟 +略 +略为 +略加 +略微 +略為 +當 +當下 +當中 +當兒 +當前 +當即 +當口兒 +當地 +當場 +當庭 +當時 +當然 +當真 +當着 +當著 +當頭 +白 +白白 +的 +的确 +的確 +的話 +的话 +皆可 +盡 +盡可能 +盡如人意 +盡心盡力 +盡心竭力 +盡快 +盡然 +盡量 +目前 +直到 +直接 +相似 +相信 +相反 +相同 +相对 +相对而言 +相對 +相對而言 +相应 +相当 +相應 +相當 +相等 +省得 +看 +看上去 +看來 +看出 +看到 +看来 +看样子 +看樣子 +看看 +看見 +看见 +看起來 +看起来 +真是 +真正 +眨眼 +着 +着呢 +矣 +矣乎 +矣哉 +知道 +砰 +确定 +碰巧 +確定 +社会主义 +社會主義 +离 +种 +积极 +称 +移动 +移動 +種 +稱 +積極 +究竟 +穷年累月 +突出 +突然 +窃 +窮年累月 +竊 +立 +立刻 +立即 +立地 +立时 +立時 +立馬 +立马 +竟 +竟然 +竟而 +第 +第二 +等 +等到 +等等 +策略地 +简直 +简而言之 +简言之 +管 +範圍 +簡直 +簡而言之 +簡言之 +类如 +粗 +精光 +純 +純粹 +紧接着 +累年 +累次 +組成 +結合 +結果 +絕 +絕不 +絕對 +絕非 +絕頂 +給 +經 +經常 +經過 +綜上所述 +維持 +緊接著 +練習 +縱 +縱令 +縱使 +縱然 +縷縷 +總之 +總括來説 +總括而言 +總是 +總的來看 +總的來說 +總的來説 +總的說來 +總的説來 +總結 +總而言之 +繼之 +繼後 +繼續 +繼而 +纯 +纯粹 +纵 +纵令 +纵使 +纵然 +练习 +组成 +经 +经常 +经过 +结合 +结果 +给 +绝 +绝不 +绝对 +绝非 +绝顶 +继之 +继后 +继续 +继而 +维持 +综上所述 +缕缕 +罢了 +罷了 +老 +老大 +老是 +老老实实 +老老實實 +考慮 +考虑 +者 +而 +而且 +而况 +而又 +而后 +而外 +而已 +而後 +而是 +而況 +而言 +而論 +而论 +联系 +联袂 +聯繫 +聯袂 +背地裡 +背地里 +背靠背 +能 +能否 +能够 +能夠 +腾 +臨 
+臨到 +自 +自个儿 +自从 +自個兒 +自各儿 +自各兒 +自后 +自家 +自己 +自後 +自從 +自打 +自身 +臭 +至 +至于 +至今 +至於 +至若 +致 +與 +與其 +與其說 +與否 +與此同時 +舉凡 +舉行 +般的 +良好 +若 +若夫 +若是 +若果 +若非 +范围 +莫 +莫不 +莫不然 +莫如 +莫若 +莫非 +获得 +萬一 +著 +著呢 +藉以 +藉此 +處在 +處理 +處處 +虽 +虽则 +虽然 +虽说 +蛮 +蠻 +行为 +行动 +行動 +行為 +衝 +表明 +表示 +被 +裡面 +複雜 +要 +要不 +要不是 +要不然 +要么 +要是 +要求 +要麼 +見 +規定 +親口 +親手 +親眼 +親自 +親身 +覺得 +见 +规定 +觉得 +設使 +設或 +設若 +許多 +話說 +該 +該當 +認為 +認爲 +認真 +認識 +誠如 +誠然 +說 +說來 +說明 +說說 +誰 +誰人 +誰料 +誰知 +請勿 +論 +論說 +諸 +諸位 +諸如 +謹 +譬喻 +譬如 +變成 +讓 +认为 +认真 +认识 +让 +许多 +论 +论说 +设使 +设或 +设若 +诚如 +诚然 +话说 +该 +该当 +说 +说明 +说来 +说说 +请勿 +诸 +诸位 +诸如 +谁 +谁人 +谁料 +谁知 +谨 +豁然 +豈 +豈但 +豈止 +豈非 +豐富 +賊死 +賴以 +贼死 +赖以 +赶 +赶快 +赶早不赶晚 +起 +起來 +起先 +起初 +起头 +起来 +起見 +起见 +起頭 +起首 +趁 +趁便 +趁势 +趁勢 +趁早 +趁机 +趁機 +趁热 +趁熱 +趁着 +趁著 +越是 +趕 +趕快 +趕早不趕晚 +距 +跟 +路經 +路经 +較 +較之 +較比 +較為 +轉動 +轉變 +轉貼 +轟然 +转动 +转变 +转贴 +轰然 +较 +较为 +较之 +较比 +边 +达到 +达旦 +迄 +迅速 +过 +过于 +过去 +过来 +运用 +近 +近來 +近几年来 +近年來 +近年来 +近幾年來 +近来 +还 +还是 +还有 +还要 +这 +这一来 +这个 +这么 +这么些 +这么样 +这么点儿 +这些 +这会儿 +这儿 +这就是说 +这时 +这样 +这次 +这点 +这种 +这般 +这边 +这里 +这麽 +进入 +进去 +进来 +进步 +进而 +进行 +连 +连同 +连声 +连日 +连日来 +连袂 +连连 +迟早 +迫于 +迫於 +适应 +适当 +适用 +逐步 +逐渐 +逐漸 +這 +這一來 +這些 +這個 +這兒 +這就是說 +這就是説 +這時 +這會兒 +這樣 +這次 +這種 +這般 +這裏 +這裡 +這邊 +這麼 +這麼些 +這麼樣 +這麼點兒 +這麽 +這點 +通常 +通过 +通過 +造成 +逢 +連 +連同 +連日 +連日來 +連聲 +連袂 +連連 +進來 +進入 +進去 +進步 +進而 +進行 +遇到 +運用 +過 +過來 +過去 +過於 +達到 +達旦 +適應 +適用 +適當 +遭到 +遲早 +遵循 +遵照 +避免 +還 +還是 +還有 +還要 +邊 +那 +那个 +那么 +那么些 +那么样 +那些 +那会儿 +那個 +那儿 +那兒 +那时 +那時 +那會兒 +那末 +那样 +那樣 +那般 +那裏 +那裡 +那边 +那邊 +那里 +那麼 +那麼些 +那麼樣 +那麽 +部分 +都 +鄙人 +采取 +里面 +重大 +重新 +重要 +針對 +鉴于 +鑑於 +鑒於 +针对 +長期以來 +長此下去 +長線 +長話短說 +长期以来 +长此下去 +长线 +长话短说 +開外 +開始 +開展 +間或 +關於 +问题 +间或 +防止 +阿 +附近 +陈年 +限制 +陡然 +除 +除了 +除却 +除卻 +除去 +除外 +除开 +除此 +除此之外 +除此以外 +除此而外 +除開 +除非 +陳年 +随 +随后 +随时 +随着 +随著 +隔夜 +隔日 +隨 +隨後 +隨時 +隨著 +难得 +难怪 +难说 +难道 +难道说 +集中 +雖 +雖則 +雖然 +雖說 +雖説 +雙方 +離 +難得 +難怪 +難說 +難道 +難道說 +雲爾 +零 +需要 +非但 +非常 +非徒 +非得 +非特 +非独 +非獨 +靠 +鞏固 +頂多 +頃 +頃刻 +頃刻之間 +頃刻間 +順 +順着 +順著 +頓時 +頗 +願意 +類如 +顯然 +顯著 +顶多 +顷 +顷刻 +顷刻之间 +顷刻间 +顺 +顺着 +顿时 +颇 +風雨無阻 +风雨无阻 +飽 +餘外 +餵 +饱 +首先 +馬上 +騰 +马上 +高低 +高兴 +高興 +麼 +默然 +默默地 +齊 +齐 +︿ +! 
+# +$ +% +& +' +( +) +)÷(1- +)、 +* ++ ++ξ +++ +, +,也 +- +-β +-- +-[*]- +. +/ +0 +0:2 +1 +1. +12% +2 +2.3% +3 +4 +5 +5:0 +6 +7 +8 +9 +: +; +< +<± +<Δ +<λ +<φ +<< += +=″ +=☆ +=( +=- +=[ +={ +> +>λ +? +@ +[ +[①①] +[①②] +[①③] +[①④] +[①⑤] +[①⑥] +[①⑦] +[①⑧] +[①⑨] +[①] +[①A] +[①B] +[①C] +[①D] +[①E] +[①f] +[①g] +[①h] +[①i] +[①o] +[② +[②①] +[②②] +[②③] +[②④ +[②⑤] +[②⑥] +[②⑦] +[②⑧] +[②⑩] +[②] +[②a] +[②B] +[②c] +[②d] +[②e] +[②f] +[②G] +[②h] +[②i] +[②j] +[③①] +[③⑩] +[③] +[③a] +[③b] +[③c] +[③d] +[③e] +[③F] +[③g] +[③h] +[④] +[④a] +[④b] +[④c] +[④d] +[④e] +[⑤] +[⑤]] +[⑤a] +[⑤b] +[⑤d] +[⑤e] +[⑤f] +[⑥] +[⑦] +[⑧] +[⑨] +[⑩] +[*] +[- +[] +] +]∧′=[ +][ +_ +A +a] +b] +c] +e] +f] +LI +ng昉 +R. L. +R.L. +ZXFITL +{ +{- +| +} +}> +~ +~± +~+ +¥ \ No newline at end of file diff --git a/apps/common/src/python/mediawords/util/config/common.py b/apps/common/src/python/mediawords/util/config/common.py index 114514a52c..c0f117393c 100644 --- a/apps/common/src/python/mediawords/util/config/common.py +++ b/apps/common/src/python/mediawords/util/config/common.py @@ -1,9 +1,10 @@ import collections import re -from typing import List, Pattern, Optional +from typing import List, Pattern, Optional, Union from mediawords.util.config import env_value, McConfigException from mediawords.util.parse_json import decode_json, McDecodeJSONException +from mediawords.util.perl import decode_object_from_bytes_if_needed from mediawords.util.log import create_logger log = create_logger(__name__) @@ -12,54 +13,110 @@ class ConnectRetriesConfig(object): """Connect retries configuration.""" - @staticmethod - def sleep_between_attempts() -> float: + __slots__ = [ + '__sleep_between_attempts', + '__max_attempts', + '__fatal_error_on_failure', + ] + + def __init__(self, + sleep_between_attempts: float = 1.0, + max_attempts: int = 60, + fatal_error_on_failure: bool = True): + + if isinstance(sleep_between_attempts, bytes): + sleep_between_attempts = decode_object_from_bytes_if_needed(sleep_between_attempts) + if 
isinstance(max_attempts, bytes): + max_attempts = decode_object_from_bytes_if_needed(max_attempts) + if isinstance(fatal_error_on_failure, bytes): + fatal_error_on_failure = decode_object_from_bytes_if_needed(fatal_error_on_failure) + + self.__sleep_between_attempts = float(sleep_between_attempts) + self.__max_attempts = int(max_attempts) + self.__fatal_error_on_failure = bool(fatal_error_on_failure) + + def sleep_between_attempts(self) -> float: """Seconds (or parts of second) to sleep between retries.""" - return 1.0 + return self.__sleep_between_attempts - @staticmethod - def max_attempts() -> int: + def max_attempts(self) -> int: """Max. number of attempts to connect. Must be positive (we want to try connecting at least one time). """ - return 60 + return self.__max_attempts + + def fatal_error_on_failure(self) -> bool: + """ + Return True if connect_to_db() should call fatal_error() and thus stop the whole process when giving up. + + True is a useful value in production when you might want the process that's unable to connect to the database to + just die. However, you might choose to return False here too if the caller is prepared to handle connection + failures more gracefully (e.g. Temporal's retries). 
+ """ + return self.__fatal_error_on_failure class DatabaseConfig(object): """PostgreSQL database configuration.""" - @staticmethod - def hostname() -> str: + __slots__ = [ + '__hostname', + '__port', + '__database_name', + '__username', + '__password', + '__retries', + ] + + def __init__(self, + hostname: str = 'postgresql-pgbouncer', + port: int = 6432, + database_name: str = 'mediacloud', + username: str = 'mediacloud', + password: str = 'mediacloud', + retries: Optional[ConnectRetriesConfig] = None): + if not retries: + retries = ConnectRetriesConfig() + + if isinstance(port, bytes): + port = decode_object_from_bytes_if_needed(port) + + hostname = decode_object_from_bytes_if_needed(hostname) + database_name = decode_object_from_bytes_if_needed(database_name) + username = decode_object_from_bytes_if_needed(username) + password = decode_object_from_bytes_if_needed(password) + + self.__hostname = hostname + self.__port = int(port) + self.__database_name = database_name + self.__username = username + self.__password = password + self.__retries = retries + + def hostname(self) -> str: """Hostname.""" - # Container's name from docker-compose.yml - return "postgresql-pgbouncer" + return self.__hostname - @staticmethod - def port() -> int: + def port(self) -> int: """Port.""" - # Container's exposed port from docker-compose.yml - return 6432 + return self.__port - @staticmethod - def database_name() -> str: + def database_name(self) -> str: """Database name.""" - return "mediacloud" + return self.__database_name - @staticmethod - def username() -> str: + def username(self) -> str: """Username.""" - return "mediacloud" + return self.__username - @staticmethod - def password() -> str: + def password(self) -> str: """Password.""" - return "mediacloud" + return self.__password - @staticmethod - def retries() -> ConnectRetriesConfig: + def retries(self) -> ConnectRetriesConfig: """connect_to_db() retries configuration.""" - return ConnectRetriesConfig() + return 
self.__retries class AmazonS3DownloadsConfig(object): @@ -86,41 +143,117 @@ def directory_name() -> str: return env_value('MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME', allow_empty_string=True) +class RabbitMQRetriesConfig(object): + """ + RabbitMQ retries configuration. + + https://docs.celeryproject.org/en/v4.4.7/userguide/calling.html#calling-retry + """ + + __slots__ = [ + '__max_retries', + '__interval_start', + '__interval_step', + '__interval_max', + ] + + def __init__(self, + max_retries: Optional[int] = 3, + interval_start: Union[int, float] = 0, + interval_step: Union[int, float] = 0.2, + interval_max: Union[int, float] = 0.2): + if isinstance(max_retries, bytes): + max_retries = decode_object_from_bytes_if_needed(max_retries) + if isinstance(interval_start, bytes): + interval_start = decode_object_from_bytes_if_needed(interval_start) + if isinstance(interval_step, bytes): + interval_step = decode_object_from_bytes_if_needed(interval_step) + if isinstance(interval_max, bytes): + interval_max = decode_object_from_bytes_if_needed(interval_max) + + self.__max_retries = None if max_retries is None else int(max_retries) # We want to preserve None here + self.__interval_start = float(interval_start) + self.__interval_step = float(interval_step) + self.__interval_max = float(interval_max) + + def max_retries(self) -> Optional[int]: + return self.__max_retries + + def interval_start(self) -> float: + return self.__interval_start + + def interval_step(self) -> float: + return self.__interval_step + + def interval_max(self) -> float: + return self.__interval_max + + class RabbitMQConfig(object): """RabbitMQ (Celery broker) client configuration.""" - @staticmethod - def hostname() -> str: + __slots__ = [ + '__hostname', + '__port', + '__username', + '__password', + '__vhost', + '__timeout', + '__retries', + ] + + def __init__(self, + hostname: str = 'rabbitmq-server', + port: int = 5672, + username: str = 'mediacloud', + password: str = 'mediacloud', + vhost: str = 
'/mediacloud', + timeout: int = 60, + retries: Optional[RabbitMQRetriesConfig] = None): + hostname = decode_object_from_bytes_if_needed(hostname) + if isinstance(port, bytes): + port = decode_object_from_bytes_if_needed(port) + username = decode_object_from_bytes_if_needed(username) + password = decode_object_from_bytes_if_needed(password) + vhost = decode_object_from_bytes_if_needed(vhost) + if isinstance(timeout, bytes): + timeout = decode_object_from_bytes_if_needed(timeout) + + self.__hostname = hostname + self.__port = int(port) + self.__username = username + self.__password = password + self.__vhost = vhost + self.__timeout = int(timeout) + self.__retries = retries + + def hostname(self) -> str: """Hostname.""" - # Container's name from docker-compose.yml - return "rabbitmq-server" + return self.__hostname - @staticmethod - def port() -> int: + def port(self) -> int: """Port.""" - # Container's exposed port from docker-compose.yml - return 5672 + return self.__port - @staticmethod - def username() -> str: + def username(self) -> str: """Username.""" - return "mediacloud" + return self.__username - @staticmethod - def password() -> str: + def password(self) -> str: """Password.""" - return "mediacloud" + return self.__password - @staticmethod - def vhost() -> str: + def vhost(self) -> str: """Virtual host.""" - return "/mediacloud" + return self.__vhost - @staticmethod - def timeout() -> int: + def timeout(self) -> int: """Timeout.""" - # FIXME possibly hardcode it somewhere - return 60 + return self.__timeout + + def retries(self) -> Optional[RabbitMQRetriesConfig]: + """Retry policy; if None, retries are disabled.""" + return self.__retries class SMTPConfig(object): @@ -155,6 +288,14 @@ def password() -> str: """Password.""" return '' + @staticmethod + def unsubscribe_address() -> str: + """Email to which unsubscribe/account deletion requests should be sent""" + address = env_value('MC_EMAIL_UNSUBSCRIBE', required=False, allow_empty_string=True) + if address 
is None or '@' not in address: + address = 'support@example.com' + return address + class DownloadStorageConfig(object): """Download storage configuration.""" diff --git a/apps/common/src/python/mediawords/util/mail.py b/apps/common/src/python/mediawords/util/mail.py index 3e75702f6d..9bea5ec1ed 100644 --- a/apps/common/src/python/mediawords/util/mail.py +++ b/apps/common/src/python/mediawords/util/mail.py @@ -117,6 +117,12 @@ def send_email(message: Message) -> bool: message_part = MIMEText(message.text_body, 'plain', 'utf-8') mime_message.attach(message_part) + unsubscribe_address = CommonConfig.smtp().unsubscribe_address() + + mime_message.add_header( + 'List-Unsubscribe', + f'mailto:{unsubscribe_address}?subject=Delete%20account%20and%20unsubscribe') + # HTML gets attached last, thus making it a preferred part as per RFC if message.html_body: message_part = MIMEText(message.html_body, 'html', 'utf-8') diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/__init__.py b/apps/common/src/python/mediawords/workflow/__init__.py similarity index 100% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/__init__.py rename to apps/common/src/python/mediawords/workflow/__init__.py diff --git a/apps/common/src/python/mediawords/workflow/client.py b/apps/common/src/python/mediawords/workflow/client.py new file mode 100644 index 0000000000..560ead779e --- /dev/null +++ b/apps/common/src/python/mediawords/workflow/client.py @@ -0,0 +1,22 @@ +from temporal.workflow import WorkflowClient + +from mediawords.util.network import wait_for_tcp_port_to_open + + +def workflow_client(namespace: str = 'default') -> WorkflowClient: + """ + Connect to Temporal server and return its client. + + :param namespace: Namespace to connect to. + :return: WorkflowClient instance. 
+ """ + + host = 'temporal-server' + port = 7233 + + # It's super lame to wait for this port to open, but the Python SDK seems to fail otherwise + wait_for_tcp_port_to_open(hostname=host, port=port) + + client = WorkflowClient.new_client(host=host, port=port, namespace=namespace) + + return client diff --git a/apps/common/src/python/mediawords/workflow/exceptions.py b/apps/common/src/python/mediawords/workflow/exceptions.py new file mode 100644 index 0000000000..46fb109058 --- /dev/null +++ b/apps/common/src/python/mediawords/workflow/exceptions.py @@ -0,0 +1,80 @@ +""" +Custom exceptions used for reporting back various errors back to the workflow. +""" + +import abc + + +class _AbstractMcWorkflowError(Exception, metaclass=abc.ABCMeta): + """Abstract exception.""" + pass + + +class McProgrammingError(_AbstractMcWorkflowError): + """ + Exception thrown on programming errors. + + It's pointless to retry actions that have caused this error as we need to fix some code first, and it might be a + good idea to stop whatever we're doing altogether. + + Examples include: + + * Various third party APIs returning something that our code can't understand. + * Files existing where they're not supposed to exist. + * Typos in SQL commands. + * Assertions. + """ + pass + + +class McConfigurationError(_AbstractMcWorkflowError): + """ + Exception thrown when something is misconfigured. + + Different from McProgrammingError in that we can figure out that there's a configuration problem somewhere almost + immediately upon start, while a programming error can take some time to show up (e.g. some sort of an external API + doesn't work with particular inputs, or the temporary directory can't be written to anymore because we wrote too + many files in it). + + No reason to retry whatever has caused this error as someone needs to fix the configuration first, and one should + consider stopping whatever that we're doing as there's no point in continuing without valid configuration anyway. 
+ + Examples include: + + * Configuration environment variables not set / set to invalid values. + * Bad authentication credentials. + * Invalid arguments passed. + """ + pass + + +class McTransientError(_AbstractMcWorkflowError): + """ + Exception thrown on transient (occurring at irregular intervals) errors. + + It is reasonable to expect that when this error occurs, we can wait for a bit, retry and the action might succeed. + + Examples include: + + * Not being able to connect to the database. + * HTTP server responding with "503 Service Unavailable". + * Network being down. + """ + pass + + +class McPermanentError(_AbstractMcWorkflowError): + """ + Exception thrown when some expectations of the application were unmet so it can't proceed with a specific input but + it's likely that it will be able to process other inputs. + + There's nothing wrong with the code that does the processing, and we can continue on processing other inputs, but + there's no way to continue processing this particular input or retrying on this error. + + Examples include: + + * One of the stories that's to be processed does not exist at all. + * HTTP server responding with "404 Not Found". + * Downloaded media file turns out to not be a media file at all. + """ + pass diff --git a/apps/common/src/python/mediawords/workflow/worker.py b/apps/common/src/python/mediawords/workflow/worker.py new file mode 100644 index 0000000000..86178f6f19 --- /dev/null +++ b/apps/common/src/python/mediawords/workflow/worker.py @@ -0,0 +1,16 @@ +import asyncio + +from temporal.worker import Worker + + +async def stop_worker_faster(worker: Worker) -> None: + """ + Stops worker but does it slightly faster. + + Default implementation of worker.stop() sleeps for 5 seconds between retries. We sleep a bit less. 
+ + :param worker: Worker instance to stop + """ + worker.stop_requested = True + while worker.threads_stopped != worker.threads_started: + await asyncio.sleep(0.5) diff --git a/apps/common/src/requirements.txt b/apps/common/src/requirements.txt index 96c3385b19..3347d75565 100644 --- a/apps/common/src/requirements.txt +++ b/apps/common/src/requirements.txt @@ -44,14 +44,14 @@ furl==2.1.0 jieba==0.42.1 # Parsing email templates -Jinja2==2.11.2 +Jinja2==2.11.3 # One of Celery's dependencies (here just for PyCharm to stop complaining) # Upgrade together with Celery and not separately. kombu==4.6.11 # XML manipulations, HTML parsing -lxml==4.6.2 +lxml==4.6.3 # Japanese language tokenizer, stemmer, etc. mecab-python3==1.0.3 @@ -71,6 +71,9 @@ PyStemmer==2.0.1 # Unit tests pytest==6.2.2 +# asyncio tests +pytest-asyncio==0.15.1 + # Timezone handling pytz==2020.5 @@ -89,8 +92,11 @@ sentence_splitter==1.4 # Celery PostgreSQL result backend support sqlalchemy==1.3.22 +# Temporal's Python SDK +git+https://github.com/firdaus/temporal-python-sdk.git@8604d025ae1272b592d3d4dd430acd15eeb6562a#egg=temporal-python-sdk + # Normalizing URLs url_normalize==1.4.3 # Low level HTTP requests (with SSL certificate verification) -urllib3[secure]==1.26.3 +urllib3[secure]==1.26.5 diff --git a/apps/common/tests/python/mediawords/job/setup_broker_test.py b/apps/common/tests/python/mediawords/job/setup_broker_test.py index ca85f90b56..c7503eb019 100644 --- a/apps/common/tests/python/mediawords/job/setup_broker_test.py +++ b/apps/common/tests/python/mediawords/job/setup_broker_test.py @@ -39,12 +39,12 @@ class AbstractBrokerTestCase(TestCase, metaclass=abc.ABCMeta): @classmethod @abc.abstractmethod def worker_paths(cls) -> List[Worker]: - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") @classmethod @abc.abstractmethod def broker_class(cls) -> Type[JobBroker]: - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") @classmethod 
def setUpClass(cls) -> None: diff --git a/apps/common/tests/python/mediawords/languages/test_lt.py b/apps/common/tests/python/mediawords/languages/test_lt.py index 87aa142519..5ecb4a916a 100644 --- a/apps/common/tests/python/mediawords/languages/test_lt.py +++ b/apps/common/tests/python/mediawords/languages/test_lt.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() - assert "buvo" in stop_words + assert "dargi" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): diff --git a/apps/common/tests/python/mediawords/languages/test_pt.py b/apps/common/tests/python/mediawords/languages/test_pt.py index b385cad73b..cec4e0c41a 100644 --- a/apps/common/tests/python/mediawords/languages/test_pt.py +++ b/apps/common/tests/python/mediawords/languages/test_pt.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() - assert "fãs" in stop_words + assert "abre" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): diff --git a/apps/common/tests/python/mediawords/languages/test_sv.py b/apps/common/tests/python/mediawords/languages/test_sv.py index 48c217e7e3..cd9f8a7fa4 100644 --- a/apps/common/tests/python/mediawords/languages/test_sv.py +++ b/apps/common/tests/python/mediawords/languages/test_sv.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() - assert "vår" in stop_words + assert "åttio" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): diff --git a/apps/common/tests/python/mediawords/languages/test_zh.py b/apps/common/tests/python/mediawords/languages/test_zh.py index 9e21730b36..ae0b55ee59 100644 --- a/apps/common/tests/python/mediawords/languages/test_zh.py +++ b/apps/common/tests/python/mediawords/languages/test_zh.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def 
test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() - assert "不勝" in stop_words + assert "不起" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): diff --git a/apps/crawler-ap/.idea/mediawords.sql b/apps/crawler-ap/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/crawler-ap/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/crawler-ap/.idea/sqlDataSources.xml b/apps/crawler-ap/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..51c3dd16dc --- /dev/null +++ b/apps/crawler-ap/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/crawler-ap/.idea/sqldialects.xml b/apps/crawler-ap/.idea/sqldialects.xml index 790b3f37f8..92fefa2e78 100644 --- a/apps/crawler-ap/.idea/sqldialects.xml +++ b/apps/crawler-ap/.idea/sqldialects.xml @@ -1,6 +1,7 @@ + diff --git a/apps/crawler-ap/docker-compose.tests.yml b/apps/crawler-ap/docker-compose.tests.yml index 0c7ecbf4f3..2ea0f17570 100644 --- a/apps/crawler-ap/docker-compose.tests.yml +++ b/apps/crawler-ap/docker-compose.tests.yml @@ -93,8 +93,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/crawler-fetcher/.idea/crawler-fetcher.iml b/apps/crawler-fetcher/.idea/crawler-fetcher.iml index f29e2751d6..ff9e527d05 100644 --- a/apps/crawler-fetcher/.idea/crawler-fetcher.iml +++ b/apps/crawler-fetcher/.idea/crawler-fetcher.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/crawler-fetcher/.idea/mediawords.sql b/apps/crawler-fetcher/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ 
b/apps/crawler-fetcher/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/crawler-fetcher/.idea/misc.xml b/apps/crawler-fetcher/.idea/misc.xml index 4da3ef8ce4..0b6f459d16 100644 --- a/apps/crawler-fetcher/.idea/misc.xml +++ b/apps/crawler-fetcher/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/crawler-fetcher/.idea/sqlDataSources.xml b/apps/crawler-fetcher/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..f3e178e0d8 --- /dev/null +++ b/apps/crawler-fetcher/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/crawler-fetcher/docker-compose.tests.yml b/apps/crawler-fetcher/docker-compose.tests.yml index bce5615b12..fb3cb718a7 100644 --- a/apps/crawler-fetcher/docker-compose.tests.yml +++ b/apps/crawler-fetcher/docker-compose.tests.yml @@ -56,8 +56,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/crawler-fetcher/src/python/crawler_fetcher/handler.py b/apps/crawler-fetcher/src/python/crawler_fetcher/handler.py index 095e1fab3a..833d668d63 100644 --- a/apps/crawler-fetcher/src/python/crawler_fetcher/handler.py +++ b/apps/crawler-fetcher/src/python/crawler_fetcher/handler.py @@ -26,9 +26,9 @@ def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Respon Return Response if the download had to be fetched and was, in fact, fetched; or return None if the download shouldn't / couldn't be fetched for whatever reason but no error is to be reported. 
""" - raise NotImplemented("Abstract method.") + raise NotImplementedError("Abstract method.") def store_response(self, db: DatabaseHandler, download: dict, response: Response) -> None: """Store the download (response object) somehow, e.g. store it, parse if it is a feed, add new stories derived from it, etc.""" - raise NotImplemented("Abstract method.") + raise NotImplementedError("Abstract method.") diff --git a/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/default/store_mixin.py b/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/default/store_mixin.py index 98042b2dc2..c779c1a49e 100644 --- a/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/default/store_mixin.py +++ b/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/default/store_mixin.py @@ -38,7 +38,7 @@ def store_download(self, db: DatabaseHandler, download: dict, content: str) -> L feed; * 'feed/web_page' downloads return a list with a single 'web_page' story to be extracted. """ - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") def _store_failed_download_error_message(self, db: DatabaseHandler, download: dict, response: Response) -> None: """ diff --git a/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/feed.py b/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/feed.py index d402c76a72..25f46d08a0 100644 --- a/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/feed.py +++ b/apps/crawler-fetcher/src/python/crawler_fetcher/handlers/feed.py @@ -33,7 +33,7 @@ def add_stories_from_feed(self, db: DatabaseHandler, download: dict, content: st If helper returns an empty arrayref, '(redundant feed)' will be written instead of feed contents. 
""" - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") @abc.abstractmethod def return_stories_to_be_extracted_from_feed(self, db: DatabaseHandler, download: dict, content: str) -> List[int]: @@ -42,7 +42,7 @@ def return_stories_to_be_extracted_from_feed(self, db: DatabaseHandler, download For example, 'web_page' feed creates a single story for itself so it has to be extracted right away. """ - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") def store_download(self, db: DatabaseHandler, download: dict, content: str) -> List[int]: download = decode_object_from_bytes_if_needed(download) diff --git a/apps/crawler-fetcher/tests/python/setup_handler_test.py b/apps/crawler-fetcher/tests/python/setup_handler_test.py index 27f0cd0b41..8c6f77b953 100644 --- a/apps/crawler-fetcher/tests/python/setup_handler_test.py +++ b/apps/crawler-fetcher/tests/python/setup_handler_test.py @@ -23,7 +23,7 @@ class TestDownloadHandler(TestCase, metaclass=abc.ABCMeta): @abc.abstractmethod def hashserver_pages(self) -> Dict[str, Any]: """Return HashServer pages to serve.""" - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") def _fetch_and_handle_response(self, path: str, downloads_id: Optional[int] = None) -> Dict[str, Any]: """Call the fetcher and handler on the given URL. 
Return the download passed to the fetcher and handler.""" diff --git a/apps/crawler-fetcher/tests/python/setup_univision_test.py b/apps/crawler-fetcher/tests/python/setup_univision_test.py index debb998431..5cf1d3a4eb 100644 --- a/apps/crawler-fetcher/tests/python/setup_univision_test.py +++ b/apps/crawler-fetcher/tests/python/setup_univision_test.py @@ -31,13 +31,13 @@ class AbstractUnivisionTest(object, metaclass=abc.ABCMeta): @abc.abstractmethod def univision_credentials(cls) -> Optional[UnivisionTestCredentials]: """Return test credentials to test Univision integration with, or None if you'd like the tests to be skipped.""" - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") @classmethod @abc.abstractmethod def expect_to_find_some_stories(cls) -> bool: """If True, we should expect to find some stories in the downloaded feed.""" - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") @classmethod def _mock_crawler_config(cls) -> CrawlerConfig: diff --git a/apps/crawler-provider/.idea/crawler-provider.iml b/apps/crawler-provider/.idea/crawler-provider.iml index 00c763162f..e64db4b116 100644 --- a/apps/crawler-provider/.idea/crawler-provider.iml +++ b/apps/crawler-provider/.idea/crawler-provider.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/crawler-provider/.idea/inspectionProfiles/Project_Default.xml b/apps/crawler-provider/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000000..7c041470bb --- /dev/null +++ b/apps/crawler-provider/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/apps/crawler-provider/.idea/mediawords.sql b/apps/crawler-provider/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/crawler-provider/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git 
a/apps/crawler-provider/.idea/misc.xml b/apps/crawler-provider/.idea/misc.xml index 4ba9154104..ef27a6c4fa 100644 --- a/apps/crawler-provider/.idea/misc.xml +++ b/apps/crawler-provider/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/crawler-provider/.idea/sqlDataSources.xml b/apps/crawler-provider/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..05201c78f2 --- /dev/null +++ b/apps/crawler-provider/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/crawler-provider/.idea/sqldialects.xml b/apps/crawler-provider/.idea/sqldialects.xml index 790b3f37f8..92fefa2e78 100644 --- a/apps/crawler-provider/.idea/sqldialects.xml +++ b/apps/crawler-provider/.idea/sqldialects.xml @@ -1,6 +1,7 @@ + diff --git a/apps/crawler-provider/docker-compose.tests.yml b/apps/crawler-provider/docker-compose.tests.yml index e1c6fa25b4..0cfd0cbdfd 100644 --- a/apps/crawler-provider/docker-compose.tests.yml +++ b/apps/crawler-provider/docker-compose.tests.yml @@ -49,5 +49,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/crawler-provider/src/python/crawler_provider/__init__.py b/apps/crawler-provider/src/python/crawler_provider/__init__.py index 175b1d4f26..f15569b800 100644 --- a/apps/crawler-provider/src/python/crawler_provider/__init__.py +++ b/apps/crawler-provider/src/python/crawler_provider/__init__.py @@ -15,12 +15,13 @@ The provider works as a daemon, periodically checking the size queued_downloads and only adding new jobs to the queue if there are more than MAX_QUEUE_SIZE jobs in the table. 
This allows us to implement -throttline by keeping the crawler jobs queue relatively small, thus limiting the number of requests for each +throttling by keeping the crawler jobs queue relatively small, thus limiting the number of requests for each host over a period of several minutes, while allowing the crawler_fetcher jobs to acts as simple stupid worker jobs that just do a quick query of queued_downloads to grab the oldest queued download. """ import time +from typing import List, Any, Iterator from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger @@ -97,7 +98,7 @@ def _add_stale_feeds(db: DatabaseHandler) -> None: -- Feed was downloaded more than stale_feed_interval seconds ago OR (last_attempted_download_time < (NOW() - (%(a)s || ' seconds')::interval)) - -- (Probably) if a new story comes in every "n" seconds, refetch feed every "n" + 5 minutes + -- (Probably) if a new story comes in every "n" seconds, re-fetch feed every "n" + 5 minutes OR ( (NOW() > last_attempted_download_time + (last_attempted_download_time - last_new_story_time) + interval '5 minutes') @@ -114,11 +115,13 @@ def _add_stale_feeds(db: DatabaseHandler) -> None: db.query( """ + -- noinspection SqlResolve @ table/"feeds_to_queue" UPDATE feeds SET last_attempted_download_time = NOW() WHERE feeds_id IN (SELECT feeds_id FROM feeds_to_queue) """) + # noinspection SqlResolve,SqlCheckUsingColumns downloads = db.query( """ WITH inserted_downloads as ( @@ -141,12 +144,15 @@ def _add_stale_feeds(db: DatabaseHandler) -> None: join feeds f using (feeds_id) """).hashes() - db.query("drop table feeds_to_queue") + db.query(""" + -- noinspection SqlResolveForFile + drop table feeds_to_queue + """) - log.info("added stale feeds: %d" % len(downloads)) + log.info(f"Added stale feeds: {len(downloads)}") -def provide_download_ids(db: DatabaseHandler) -> None: +def provide_download_ids(db: DatabaseHandler) -> List[int]: """Return a list of pending downloads ids to queue for 
fetching. Hand out one downloads_id for each distinct host with a pending download. @@ -158,17 +164,49 @@ def provide_download_ids(db: DatabaseHandler) -> None: _add_stale_feeds(db) - log.info("querying pending downloads ...") - - # get one downloads_id per host, ordered by priority asc, downloads_id desc, do this through a plpgsql - # function because that's the only way to avoid an index scan of the entire (host, priority, downloads_id) index - downloads_ids = db.query("select get_downloads_for_queue() downloads_id").flat() + log.info("Querying pending downloads...") + + # get one downloads_id per host, ordered by priority asc, downloads_id desc + # noinspection SqlResolve + downloads_ids = db.query(""" + + -- Pending downloads by host, ranked by priority and the biggest "downloads_id" + WITH pending_downloads_per_host AS ( + + SELECT + host, + downloads_id, + ROW_NUMBER() OVER( + PARTITION BY host + ORDER BY + priority, + downloads_id DESC NULLS LAST + ) AS rank + FROM downloads_pending AS dp + WHERE ( + SELECT 1 + FROM queued_downloads AS qd + WHERE qd.downloads_id = dp.downloads_id + ) IS NULL + ) + + SELECT downloads_id + FROM pending_downloads_per_host + WHERE rank = 1 + + """).flat() - log.info("provide downloads host downloads: %d" % len(downloads_ids)) + log.info(f"Providing {len(downloads_ids)} per-host download IDs") return downloads_ids +def __chunks(list_to_be_chunked: List[Any], chunk_size: int) -> Iterator[List[Any]]: + """Yield successive chunks from parameter list.""" + for i in range(0, len(list_to_be_chunked), chunk_size): + yield list_to_be_chunked[i:i + chunk_size] + + def run_provider(db: DatabaseHandler, daemon: bool = True) -> None: """Run the provider daemon to periodically add crawler_fetcher jobs by querying for pending downloads. 
@@ -189,27 +227,42 @@ def run_provider(db: DatabaseHandler, daemon: bool = True) -> None: queue_size = db.query( "select count(*) from ( select 1 from queued_downloads limit %(a)s ) q", {'a': MAX_QUEUE_SIZE * 10}).flat()[0] - log.warning("queue_size: %d" % queue_size) + log.info(f"Queue size: {queue_size}") if queue_size < MAX_QUEUE_SIZE: downloads_ids = provide_download_ids(db) if downloads_ids: - log.warning("adding to downloads to queue: %d" % len(downloads_ids)) + log.info(f"Adding {len(downloads_ids)} download IDs to queue...") + + # Insert in chunks so that: + # 1) Fetchers get to fetching sooner; + # 2) We don't have to come up with a query that's 2 MB long. + for chunk_downloads_ids in __chunks(list_to_be_chunked=downloads_ids, chunk_size=1000): + log.info(f"Inserting chunk of downloads ({len(chunk_downloads_ids)} download IDs)...") + # noinspection SqlResolve,SqlSignature + db.query( + """ + INSERT INTO queued_downloads (downloads_id) + VALUES (unnest (ARRAY %(chunk_downloads_ids)s::bigint[])) + ON CONFLICT (downloads_id) DO NOTHING + """ % { + 'chunk_downloads_ids': chunk_downloads_ids, + } + ) - values = ','.join(["(%d)" % i for i in downloads_ids]) - db.query( - "insert into queued_downloads(downloads_id) values %s on conflict (downloads_id) do nothing" % - values) else: - log.info("No downloads to add") + log.info("No download IDs to add") if daemon: if time.time() - last_queue_time < QUEUE_INTERVAL: + log.info(f"Sleeping for {QUEUE_INTERVAL} seconds") time.sleep(QUEUE_INTERVAL) elif daemon: - time.sleep(QUEUE_INTERVAL * 10) + time_to_sleep = QUEUE_INTERVAL * 10 + log.info(f"Sleeping for {time_to_sleep} seconds as we're running as a daemon") + time.sleep(time_to_sleep) last_queue_time = time.time() diff --git a/apps/crawler-provider/tests/python/test_add_stale_feeds.py b/apps/crawler-provider/tests/python/test_add_stale_feeds.py index 03b5e7e925..e7944889c8 100644 --- a/apps/crawler-provider/tests/python/test_add_stale_feeds.py +++ 
b/apps/crawler-provider/tests/python/test_add_stale_feeds.py @@ -3,6 +3,8 @@ from mediawords.test.db.create import create_test_medium from mediawords.db import connect_to_db from mediawords.util.sql import sql_now, get_sql_date_from_epoch + +# noinspection PyProtectedMember from crawler_provider import _add_stale_feeds @@ -32,7 +34,7 @@ def test_add_stale_feeds(): 'active': True, 'last_attempted_download_time': sql_now() } - feed = db.create('feeds', feed) + db.create('feeds', feed) feed = { 'media_id': medium['media_id'], @@ -43,7 +45,7 @@ def test_add_stale_feeds(): 'last_attempted_download_time': sql_now(), 'last_new_story_time': sql_now() } - feed = db.create('feeds', feed) + db.create('feeds', feed) feed = { 'media_id': medium['media_id'], @@ -51,8 +53,8 @@ def test_add_stale_feeds(): 'url': 'http://5 minute new story', 'type': 'syndicated', 'active': True, - 'last_attempted_download_time': get_sql_date_from_epoch(time.time() - 300), - 'last_new_story_time': get_sql_date_from_epoch(time.time() - 300), + 'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - 300), + 'last_new_story_time': get_sql_date_from_epoch(int(time.time()) - 300), } feed = db.create('feeds', feed) pending_feeds.append(feed) @@ -63,7 +65,7 @@ def test_add_stale_feeds(): 'url': 'http://old last download', 'type': 'syndicated', 'active': True, - 'last_attempted_download_time': get_sql_date_from_epoch(time.time() - (86400 * 10)) + 'last_attempted_download_time': get_sql_date_from_epoch(int(time.time()) - (86400 * 10)) } feed = db.create('feeds', feed) pending_feeds.append(feed) diff --git a/apps/crawler-provider/tests/python/test_provide_download_ids.py b/apps/crawler-provider/tests/python/test_provide_download_ids.py index 6c51e7a2f6..b55af28e6e 100644 --- a/apps/crawler-provider/tests/python/test_provide_download_ids.py +++ b/apps/crawler-provider/tests/python/test_provide_download_ids.py @@ -1,5 +1,6 @@ from mediawords.db import connect_to_db from 
mediawords.test.db.create import create_test_medium, create_test_feed + from crawler_provider import provide_download_ids diff --git a/apps/crawler-provider/tests/python/test_run_provider.py b/apps/crawler-provider/tests/python/test_run_provider.py index 896e468ca7..a456c48862 100644 --- a/apps/crawler-provider/tests/python/test_run_provider.py +++ b/apps/crawler-provider/tests/python/test_run_provider.py @@ -1,9 +1,10 @@ import time -from crawler_provider import run_provider from mediawords.test.db.create import create_test_medium, create_test_feed from mediawords.db import connect_to_db +from crawler_provider import run_provider + def test_run_provider(): db = connect_to_db() diff --git a/apps/create-missing-partitions/.idea/create-missing-partitions.iml b/apps/create-missing-partitions/.idea/create-missing-partitions.iml index 42988ad5e7..3df2c0c662 100644 --- a/apps/create-missing-partitions/.idea/create-missing-partitions.iml +++ b/apps/create-missing-partitions/.idea/create-missing-partitions.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/create-missing-partitions/.idea/mediawords.sql b/apps/create-missing-partitions/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/create-missing-partitions/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/create-missing-partitions/.idea/misc.xml b/apps/create-missing-partitions/.idea/misc.xml index 6a91d1a00f..0fb1cc2fba 100644 --- a/apps/create-missing-partitions/.idea/misc.xml +++ b/apps/create-missing-partitions/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/create-missing-partitions/.idea/sqlDataSources.xml b/apps/create-missing-partitions/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..9e8bed1c5a --- /dev/null +++ b/apps/create-missing-partitions/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git 
a/apps/create-missing-partitions/docker-compose.tests.yml b/apps/create-missing-partitions/docker-compose.tests.yml index cbafcefd70..a454e2a84b 100644 --- a/apps/create-missing-partitions/docker-compose.tests.yml +++ b/apps/create-missing-partitions/docker-compose.tests.yml @@ -43,5 +43,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/cron-generate-daily-rss-dumps/.idea/cron-generate-daily-rss-dumps.iml b/apps/cron-generate-daily-rss-dumps/.idea/cron-generate-daily-rss-dumps.iml index 524923feed..4c3138569c 100644 --- a/apps/cron-generate-daily-rss-dumps/.idea/cron-generate-daily-rss-dumps.iml +++ b/apps/cron-generate-daily-rss-dumps/.idea/cron-generate-daily-rss-dumps.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-generate-daily-rss-dumps/.idea/mediawords.sql b/apps/cron-generate-daily-rss-dumps/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-generate-daily-rss-dumps/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-generate-daily-rss-dumps/.idea/misc.xml b/apps/cron-generate-daily-rss-dumps/.idea/misc.xml index 5012d8cb66..70d7383712 100644 --- a/apps/cron-generate-daily-rss-dumps/.idea/misc.xml +++ b/apps/cron-generate-daily-rss-dumps/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-generate-daily-rss-dumps/.idea/sqlDataSources.xml b/apps/cron-generate-daily-rss-dumps/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..3eacf166ca --- /dev/null +++ b/apps/cron-generate-daily-rss-dumps/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml 
b/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml index 80a2254651..56fdc8b91e 100644 --- a/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml +++ b/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/cron-generate-media-health/.idea/cron-generate-media-health.iml b/apps/cron-generate-media-health/.idea/cron-generate-media-health.iml index 60ad626c86..995969f847 100644 --- a/apps/cron-generate-media-health/.idea/cron-generate-media-health.iml +++ b/apps/cron-generate-media-health/.idea/cron-generate-media-health.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-generate-media-health/.idea/mediawords.sql b/apps/cron-generate-media-health/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-generate-media-health/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-generate-media-health/.idea/misc.xml b/apps/cron-generate-media-health/.idea/misc.xml index 4457122b86..6cccc07d8f 100644 --- a/apps/cron-generate-media-health/.idea/misc.xml +++ b/apps/cron-generate-media-health/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-generate-media-health/.idea/sqlDataSources.xml b/apps/cron-generate-media-health/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..57bc924113 --- /dev/null +++ b/apps/cron-generate-media-health/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-generate-media-health/docker-compose.tests.yml b/apps/cron-generate-media-health/docker-compose.tests.yml index d0e3004326..74b389d3e9 100644 --- 
a/apps/cron-generate-media-health/docker-compose.tests.yml +++ b/apps/cron-generate-media-health/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/cron-generate-user-summary/.idea/cron-generate-user-summary.iml b/apps/cron-generate-user-summary/.idea/cron-generate-user-summary.iml index 8221ae113c..43e8502909 100644 --- a/apps/cron-generate-user-summary/.idea/cron-generate-user-summary.iml +++ b/apps/cron-generate-user-summary/.idea/cron-generate-user-summary.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-generate-user-summary/.idea/mediawords.sql b/apps/cron-generate-user-summary/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-generate-user-summary/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-generate-user-summary/.idea/misc.xml b/apps/cron-generate-user-summary/.idea/misc.xml index 593c83ec4a..c12db7e294 100644 --- a/apps/cron-generate-user-summary/.idea/misc.xml +++ b/apps/cron-generate-user-summary/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-generate-user-summary/.idea/sqlDataSources.xml b/apps/cron-generate-user-summary/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..e8ade52e9c --- /dev/null +++ b/apps/cron-generate-user-summary/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-generate-user-summary/docker-compose.tests.yml b/apps/cron-generate-user-summary/docker-compose.tests.yml index c02b92f977..1ac0b72ada 100644 --- a/apps/cron-generate-user-summary/docker-compose.tests.yml +++ b/apps/cron-generate-user-summary/docker-compose.tests.yml @@ -46,5 +46,5 @@ 
services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/cron-print-long-running-job-states/.idea/cron-print-long-running-job-states.iml b/apps/cron-print-long-running-job-states/.idea/cron-print-long-running-job-states.iml index cc772aeb8e..29e6a7ec66 100644 --- a/apps/cron-print-long-running-job-states/.idea/cron-print-long-running-job-states.iml +++ b/apps/cron-print-long-running-job-states/.idea/cron-print-long-running-job-states.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-print-long-running-job-states/.idea/mediawords.sql b/apps/cron-print-long-running-job-states/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-print-long-running-job-states/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-print-long-running-job-states/.idea/misc.xml b/apps/cron-print-long-running-job-states/.idea/misc.xml index c036f5aa0f..f9bdf2011c 100644 --- a/apps/cron-print-long-running-job-states/.idea/misc.xml +++ b/apps/cron-print-long-running-job-states/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-print-long-running-job-states/.idea/sqlDataSources.xml b/apps/cron-print-long-running-job-states/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..34938f86d8 --- /dev/null +++ b/apps/cron-print-long-running-job-states/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-print-long-running-job-states/docker-compose.tests.yml b/apps/cron-print-long-running-job-states/docker-compose.tests.yml index f84022e5b3..f2eb667e72 100644 --- a/apps/cron-print-long-running-job-states/docker-compose.tests.yml +++ 
b/apps/cron-print-long-running-job-states/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/cron-refresh-stats/.idea/cron-refresh-stats.iml b/apps/cron-refresh-stats/.idea/cron-refresh-stats.iml index 1653c18a80..1d45b72bf1 100644 --- a/apps/cron-refresh-stats/.idea/cron-refresh-stats.iml +++ b/apps/cron-refresh-stats/.idea/cron-refresh-stats.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-refresh-stats/.idea/mediawords.sql b/apps/cron-refresh-stats/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-refresh-stats/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-refresh-stats/.idea/misc.xml b/apps/cron-refresh-stats/.idea/misc.xml index 169914fe12..993cab23e3 100644 --- a/apps/cron-refresh-stats/.idea/misc.xml +++ b/apps/cron-refresh-stats/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-refresh-stats/.idea/sqlDataSources.xml b/apps/cron-refresh-stats/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..bf483f8df2 --- /dev/null +++ b/apps/cron-refresh-stats/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-refresh-stats/docker-compose.tests.yml b/apps/cron-refresh-stats/docker-compose.tests.yml index 2ccefa0ed7..9af8e30be6 100644 --- a/apps/cron-refresh-stats/docker-compose.tests.yml +++ b/apps/cron-refresh-stats/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: 
/etc/postgresql/13/main/ diff --git a/apps/cron-rescrape-due-media/.idea/cron-rescrape-due-media.iml b/apps/cron-rescrape-due-media/.idea/cron-rescrape-due-media.iml index 1ef711ee68..0a5eca061e 100644 --- a/apps/cron-rescrape-due-media/.idea/cron-rescrape-due-media.iml +++ b/apps/cron-rescrape-due-media/.idea/cron-rescrape-due-media.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-rescrape-due-media/.idea/mediawords.sql b/apps/cron-rescrape-due-media/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-rescrape-due-media/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-rescrape-due-media/.idea/misc.xml b/apps/cron-rescrape-due-media/.idea/misc.xml index 2f005776c1..652440f052 100644 --- a/apps/cron-rescrape-due-media/.idea/misc.xml +++ b/apps/cron-rescrape-due-media/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-rescrape-due-media/.idea/sqlDataSources.xml b/apps/cron-rescrape-due-media/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..401967c19c --- /dev/null +++ b/apps/cron-rescrape-due-media/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-rescrape-due-media/docker-compose.tests.yml b/apps/cron-rescrape-due-media/docker-compose.tests.yml index 9284439ac1..1b321172a7 100644 --- a/apps/cron-rescrape-due-media/docker-compose.tests.yml +++ b/apps/cron-rescrape-due-media/docker-compose.tests.yml @@ -47,8 +47,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/cron-rescraping-changes/.idea/cron-rescraping-changes.iml 
b/apps/cron-rescraping-changes/.idea/cron-rescraping-changes.iml index 9c4fdfff81..1a8d59bdb4 100644 --- a/apps/cron-rescraping-changes/.idea/cron-rescraping-changes.iml +++ b/apps/cron-rescraping-changes/.idea/cron-rescraping-changes.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-rescraping-changes/.idea/mediawords.sql b/apps/cron-rescraping-changes/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-rescraping-changes/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-rescraping-changes/.idea/misc.xml b/apps/cron-rescraping-changes/.idea/misc.xml index 1b8e228190..08275c44df 100644 --- a/apps/cron-rescraping-changes/.idea/misc.xml +++ b/apps/cron-rescraping-changes/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-rescraping-changes/.idea/sqlDataSources.xml b/apps/cron-rescraping-changes/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..4c57723ab0 --- /dev/null +++ b/apps/cron-rescraping-changes/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-rescraping-changes/docker-compose.tests.yml b/apps/cron-rescraping-changes/docker-compose.tests.yml index 6437093c02..a563decac7 100644 --- a/apps/cron-rescraping-changes/docker-compose.tests.yml +++ b/apps/cron-rescraping-changes/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/cron-set-media-primary-language/.idea/cron-set-media-primary-language.iml b/apps/cron-set-media-primary-language/.idea/cron-set-media-primary-language.iml index f310743f07..0e114839d4 100644 --- 
a/apps/cron-set-media-primary-language/.idea/cron-set-media-primary-language.iml +++ b/apps/cron-set-media-primary-language/.idea/cron-set-media-primary-language.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-set-media-primary-language/.idea/mediawords.sql b/apps/cron-set-media-primary-language/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-set-media-primary-language/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-set-media-primary-language/.idea/misc.xml b/apps/cron-set-media-primary-language/.idea/misc.xml index 8876443b65..23250328bb 100644 --- a/apps/cron-set-media-primary-language/.idea/misc.xml +++ b/apps/cron-set-media-primary-language/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-set-media-primary-language/.idea/sqlDataSources.xml b/apps/cron-set-media-primary-language/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..ca1eda3677 --- /dev/null +++ b/apps/cron-set-media-primary-language/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-set-media-primary-language/docker-compose.tests.yml b/apps/cron-set-media-primary-language/docker-compose.tests.yml index 69c7cd43a6..f4e0ca5bf7 100644 --- a/apps/cron-set-media-primary-language/docker-compose.tests.yml +++ b/apps/cron-set-media-primary-language/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/cron-set-media-subject-country/.idea/cron-set-media-subject-country.iml b/apps/cron-set-media-subject-country/.idea/cron-set-media-subject-country.iml index 035f4e65b2..c714455c7d 100644 --- 
a/apps/cron-set-media-subject-country/.idea/cron-set-media-subject-country.iml +++ b/apps/cron-set-media-subject-country/.idea/cron-set-media-subject-country.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/cron-set-media-subject-country/.idea/mediawords.sql b/apps/cron-set-media-subject-country/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/cron-set-media-subject-country/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/cron-set-media-subject-country/.idea/misc.xml b/apps/cron-set-media-subject-country/.idea/misc.xml index 4b93bd35c3..2541630658 100644 --- a/apps/cron-set-media-subject-country/.idea/misc.xml +++ b/apps/cron-set-media-subject-country/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/cron-set-media-subject-country/.idea/sqlDataSources.xml b/apps/cron-set-media-subject-country/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..d57cbe6661 --- /dev/null +++ b/apps/cron-set-media-subject-country/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/cron-set-media-subject-country/docker-compose.tests.yml b/apps/cron-set-media-subject-country/docker-compose.tests.yml index 99c543ee36..01864f5a44 100644 --- a/apps/cron-set-media-subject-country/docker-compose.tests.yml +++ b/apps/cron-set-media-subject-country/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/docker-compose.dist.yml b/apps/docker-compose.dist.yml index 42d0e234f0..c015ab9c87 100644 --- a/apps/docker-compose.dist.yml +++ b/apps/docker-compose.dist.yml @@ -74,6 +74,14 @@ x-common-configuration: &common-configuration # "From:" email address 
when sending emails MC_EMAIL_FROM_ADDRESS: "info@mediacloud.org" + # Email address to point to in List-Unsubscribe email header. + # Technically we don't have a straightforward "unsubscribe" endpoint, but our + # emails are more likely to be marked spam if we don't have such a header, so + # we make the email subject "Delete account and unsubscribe" in + # mediawords/util/config/common.py + # example value = support@example.com + MC_EMAIL_UNSUBSCRIBE: "support@example.com" + # Fail all HTTP requests that match the following pattern, e.g. # "^https?://[^/]*some-website.com" MC_USERAGENT_BLACKLIST_URL_PATTERN: "" @@ -163,37 +171,6 @@ x-brandwatch-api-configuration: &brandwatch-api-configuration MC_BRANDWATCH_PASSWORD: "" -# -# Google Cloud for podcast transcription common configuration -# =========================================================== -# -x-podcast-google-cloud-configuration: &podcast-google-cloud-configuration - - # Base64-encoded Google Cloud authentication JSON file for a service account that - # uploads episodes to Google Cloud Storage and submits Speech API jobs; refer to - # doc/podcasts_gc_auth.markdown for instructions on how to create such an - # account. 
- # - # How to generate Base64 encoded credentials: - # - # $ base64 mediacloud-service-account-credentials.json - # - MC_PODCAST_GC_AUTH_JSON_BASE64: ' - ewogICAgInR5cGUiOiAic2VydmljZV9hY2NvdW50IiwKICAgICJwcm9qZWN0X2lkIjogImV - 4YW1wbGUiLAogICAgInByaXZhdGVfa2V5X2lkIjogIjdmMTY5YTIxZDNmODA5NzQzNjRiY2 - YwOWYyMDQ3ZWEwZWZiNTY4M2EiLAogICAgInByaXZhdGVfa2V5IjogIi0tLS0tQkVHSU4gU - FJJVkFURSBLRVktLS0tLVxuPC4uLj5cbi0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS1cbiIs - CiAgICAiY2xpZW50X2VtYWlsIjogImV4YW1wbGVAZXhhbXBsZS5pYW0uZ3NlcnZpY2VhY2N - vdW50LmNvbSIsCiAgICAiY2xpZW50X2lkIjogIjEyMyIsCiAgICAiYXV0aF91cmkiOiAiaH - R0cHM6Ly9hY2NvdW50cy5nb29nbGUuY29tL28vb2F1dGgyL2F1dGgiLAogICAgInRva2VuX - 3VyaSI6ICJodHRwczovL29hdXRoMi5nb29nbGVhcGlzLmNvbS90b2tlbiIsCiAgICAiYXV0 - aF9wcm92aWRlcl94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29 - tL29hdXRoMi92MS9jZXJ0cyIsCiAgICAiY2xpZW50X3g1MDlfY2VydF91cmwiOiAiaHR0cH - M6Ly93d3cuZ29vZ2xlYXBpcy5jb20vcm9ib3QvdjEvbWV0YWRhdGEveDUwOS9leGFtcGxlJ - TQwZXhhbXBsZS5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQ== - ' - - # # Solr shard base service # ======================= @@ -1052,6 +1029,8 @@ services: depends_on: # Reads data from Munin node - munin-node + environment: + MC_MUNIN_CRON_ALERT_EMAIL: "FIXME@mediacloud.org" volumes: # Shared with "munin-fastcgi-graph" container: - vol_munin_data:/var/lib/munin/ @@ -1218,7 +1197,7 @@ services: # NYTLabels fetch annotation and tag # ----------------------- # - nytlabels-update-story-tags: + nytlabels-fetch-annotation-and-tag: image: gcr.io/mcback/nytlabels-fetch-annotation-and-tag:release init: true networks: @@ -1246,24 +1225,21 @@ services: memory: "512M" # - # Fetch story podcast episode and store it in GCS - # ----------------------------------------------- + # Fetch story podcast episode and store it in GCS (RabbitMQ worker) + # ----------------------------------------------------------------- # - podcast-fetch-episode: - image: gcr.io/mcback/podcast-fetch-episode:release + podcast-transcribe-episode-rabbitmq-worker: + 
image: gcr.io/mcback/podcast-transcribe-episode:release + command: "rabbitmq-worker.py" init: true networks: - default environment: <<: *common-configuration - <<: *podcast-google-cloud-configuration - # Google Cloud Storage bucket name for storing episode audio files - MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: "mediacloud-story-podcast-episodes" - # Google Cloud Storage path prefix for storing episode audio files - MC_PODCAST_FETCH_EPISODE_PATH_PREFIX: "episodes" depends_on: - postgresql-pgbouncer - rabbitmq-server + - temporal-server deploy: <<: *misc-apps_deploy_placement_constraints <<: *endpoint-mode-dnsrr @@ -1282,80 +1258,53 @@ services: max_attempts: 3 # - # Fetch finished transcripts and store them locally - # ------------------------------------------------- - podcast-fetch-transcript: - image: gcr.io/mcback/podcast-fetch-transcript:release - init: true - networks: - - default - environment: - <<: *common-configuration - <<: *podcast-google-cloud-configuration - depends_on: - - postgresql-pgbouncer - - rabbitmq-server - deploy: - <<: *misc-apps_deploy_placement_constraints - <<: *endpoint-mode-dnsrr - # Worker count - replicas: 1 - resources: - limits: - # CPU core limit - cpus: "1" - # RAM limit - memory: "256M" - restart_policy: - # Automatically restart on non-zero exit codes only instead of on any exit - condition: on-failure - # Autorestart up to three times - max_attempts: 3 - - # - # Poll due operations and submit them to "podcast-fetch-transcript" + # Fetch story podcast episode and store it in GCS (Temporal worker) # ----------------------------------------------------------------- - podcast-poll-due-operations: - image: gcr.io/mcback/podcast-poll-due-operations:release - init: true - networks: - - default - environment: - <<: *common-configuration - depends_on: - - postgresql-pgbouncer - - rabbitmq-server - deploy: - <<: *misc-apps_deploy_placement_constraints - <<: *endpoint-mode-dnsrr - # Worker count - replicas: 1 - resources: - limits: - # CPU 
core limit - cpus: "1" - # RAM limit - memory: "256M" - restart_policy: - # Automatically restart on non-zero exit codes only instead of on any exit - condition: on-failure - # Autorestart up to three times - max_attempts: 3 - # - # Submit a Speech API operation for a podcast episode - # --------------------------------------------------- - podcast-submit-operation: - image: gcr.io/mcback/podcast-submit-operation:release + podcast-transcribe-episode-temporal-worker: + image: gcr.io/mcback/podcast-transcribe-episode:release + command: "temporal-worker.py" init: true networks: - default environment: <<: *common-configuration - <<: *podcast-google-cloud-configuration + # GCS bucket name and path prefix for storing raw, untranscoded enclosure files + MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME: "FIXME" + MC_PODCAST_RAW_ENCLOSURES_PATH_PREFIX: "enclosures" + # GCS bucket name and path prefix for storing transcoded episodes + MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME: "FIXME" + MC_PODCAST_TRANSCODED_EPISODES_PATH_PREFIX: "episodes" + # GCS bucket name and path prefix for storing raw JSON transcripts + MC_PODCAST_TRANSCRIPTS_BUCKET_NAME: "FIXME" + MC_PODCAST_TRANSCRIPTS_PATH_PREFIX: "transcripts" + # Base64-encoded Google Cloud authentication JSON file for a service account that + # uploads episodes to Google Cloud Storage and submits Speech API jobs; refer to + # doc/podcasts_gc_auth.markdown for instructions on how to create such an + # account. 
+ # + # How to generate Base64 encoded credentials: + # + # $ base64 mediacloud-service-account-credentials.json + # + MC_PODCAST_AUTH_JSON_BASE64: ' + ewogICAgInR5cGUiOiAic2VydmljZV9hY2NvdW50IiwKICAgICJwcm9qZWN0X2lkIjogImV + 4YW1wbGUiLAogICAgInByaXZhdGVfa2V5X2lkIjogIjdmMTY5YTIxZDNmODA5NzQzNjRiY2 + YwOWYyMDQ3ZWEwZWZiNTY4M2EiLAogICAgInByaXZhdGVfa2V5IjogIi0tLS0tQkVHSU4gU + FJJVkFURSBLRVktLS0tLVxuPC4uLj5cbi0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS1cbiIs + CiAgICAiY2xpZW50X2VtYWlsIjogImV4YW1wbGVAZXhhbXBsZS5pYW0uZ3NlcnZpY2VhY2N + vdW50LmNvbSIsCiAgICAiY2xpZW50X2lkIjogIjEyMyIsCiAgICAiYXV0aF91cmkiOiAiaH + R0cHM6Ly9hY2NvdW50cy5nb29nbGUuY29tL28vb2F1dGgyL2F1dGgiLAogICAgInRva2VuX + 3VyaSI6ICJodHRwczovL29hdXRoMi5nb29nbGVhcGlzLmNvbS90b2tlbiIsCiAgICAiYXV0 + aF9wcm92aWRlcl94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29 + tL29hdXRoMi92MS9jZXJ0cyIsCiAgICAiY2xpZW50X3g1MDlfY2VydF91cmwiOiAiaHR0cH + M6Ly93d3cuZ29vZ2xlYXBpcy5jb20vcm9ib3QvdjEvbWV0YWRhdGEveDUwOS9leGFtcGxlJ + TQwZXhhbXBsZS5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQ== + ' depends_on: - postgresql-pgbouncer - rabbitmq-server + - temporal-server deploy: <<: *misc-apps_deploy_placement_constraints <<: *endpoint-mode-dnsrr @@ -1364,9 +1313,9 @@ services: resources: limits: # CPU core limit - cpus: "1" + cpus: "2" # RAM limit - memory: "256M" + memory: "4G" restart_policy: # Automatically restart on non-zero exit codes only instead of on any exit condition: on-failure @@ -1834,6 +1783,187 @@ services: # RAM limit memory: "1G" + # + # Temporal Elasticsearch (searching for workflows) + # ------------------------------------------------ + # + temporal-elasticsearch: + image: gcr.io/mcback/temporal-elasticsearch:release + init: true + networks: + - default + expose: + - "9200" + - "9300" + volumes: + - vol_temporal_elasticsearch_data:/var/lib/elasticsearch/ + deploy: + <<: *endpoint-mode-dnsrr + placement: + constraints: + # Must run on the host with Temporal Elasticsearch data volume + - node.labels.role-temporal-elasticsearch == true + # 
Worker count + replicas: 1 + resources: + limits: + # CPU core limit + cpus: "4" + # RAM limit + memory: "16G" + + # + # Temporal Grafana (web UI for Temporal's stats) + # ---------------------------------------------- + # + temporal-grafana: + image: gcr.io/mcback/temporal-grafana:release + init: true + networks: + - default + expose: + - "3000" + ports: + # For connecting to through a SSH tunnel + # MAKE SURE to BLOCK THIS PORT in provision/roles/docker/tasks/iptables.yml + - "3000:3000" + volumes: + - vol_temporal_grafana_data:/var/lib/grafana/ + deploy: + <<: *endpoint-mode-dnsrr + placement: + constraints: + # Must run on the host with Temporal Grafana data volume + - node.labels.role-temporal-grafana == true + # Worker count + replicas: 1 + resources: + limits: + cpus: "2" + memory: "2G" + + # + # Temporal PostgreSQL (Temporal's main data store) + # ------------------------------------------------ + # + temporal-postgresql: + image: gcr.io/mcback/temporal-postgresql:release + init: true + networks: + - default + expose: + - 5432 + volumes: + - vol_temporal_postgresql_data:/var/lib/postgresql/ + deploy: + <<: *endpoint-mode-dnsrr + placement: + constraints: + # Must run on the host with Temporal PostgreSQL server data volume + - node.labels.role-temporal-postgresql == true + # Worker count + replicas: 1 + resources: + limits: + # CPU core limit + cpus: "8" + # RAM limit + memory: "32G" + + # + # Temporal Prometheus (Temporal's statistics store) + # ------------------------------------------------- + # + temporal-prometheus: + image: gcr.io/mcback/temporal-prometheus:release + init: true + depends_on: + - temporal-grafana + networks: + - default + expose: + - "9090" + volumes: + - vol_temporal_prometheus_data:/opt/prometheus/data/ + deploy: + <<: *endpoint-mode-dnsrr + placement: + constraints: + # Must run on the host with Temporal Prometheus data volume + - node.labels.role-temporal-prometheus == true + # Worker count + replicas: 1 + resources: + limits: + 
cpus: "2" + memory: "2G" + + # + # Temporal server (running stateful workflows) + # -------------------------------------------- + # + temporal-server: + image: gcr.io/mcback/temporal-server:release + init: true + networks: + - default + depends_on: + - temporal-postgresql + - temporal-elasticsearch + - temporal-prometheus + expose: + - 6933 + - 6934 + - 6935 + - 6939 + - 7233 + - 7234 + - 7235 + - 7239 + volumes: + - vol_temporal_server_archives:/var/lib/temporal/ + deploy: + <<: *endpoint-mode-dnsrr + placement: + constraints: + # Must run on the host with Temporal server data volume + - node.labels.role-temporal-server == true + # Worker count + replicas: 1 + resources: + limits: + # CPU core limit + cpus: "8" + # RAM limit + memory: "32G" + + # + # Temporal webapp (tracking workflow state) + # ----------------------------------------- + # + temporal-webapp: + image: gcr.io/mcback/temporal-webapp:release + init: true + networks: + - default + expose: + - "8088" + ports: + # For connecting to through a SSH tunnel + # MAKE SURE to BLOCK THIS PORT in provision/roles/docker/tasks/iptables.yml + - "8088:8088" + deploy: + <<: *misc-apps_deploy_placement_constraints + <<: *endpoint-mode-dnsrr + # Worker count + replicas: 1 + resources: + limits: + # CPU core limit + cpus: "2" + # RAM limit + memory: "4G" + # # Extract story links for a topic # ------------------------------- @@ -2365,3 +2495,43 @@ volumes: type: none o: bind device: /space/mediacloud/vol_elk_elasticsearch_data + + # Temporal server workflow archives + vol_temporal_server_archives: + driver: local + driver_opts: + type: none + o: bind + device: /space/mediacloud/vol_temporal_server_archives + + # Temporal PostgreSQL server data + vol_temporal_postgresql_data: + driver: local + driver_opts: + type: none + o: bind + device: /space/mediacloud/vol_temporal_postgresql_data + + # Temporal Elasticsearch data + vol_temporal_elasticsearch_data: + driver: local + driver_opts: + type: none + o: bind + device:
/space/mediacloud/vol_temporal_elasticsearch_data + + # Temporal Prometheus data + vol_temporal_prometheus_data: + driver: local + driver_opts: + type: none + o: bind + device: /space/mediacloud/vol_temporal_prometheus_data + + # Temporal Grafana data + vol_temporal_grafana_data: + driver: local + driver_opts: + type: none + o: bind + device: /space/mediacloud/vol_temporal_grafana_data diff --git a/apps/dump-table/.idea/dump-table.iml b/apps/dump-table/.idea/dump-table.iml index b01990d739..8bb1623091 100644 --- a/apps/dump-table/.idea/dump-table.iml +++ b/apps/dump-table/.idea/dump-table.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/dump-table/.idea/mediawords.sql b/apps/dump-table/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/dump-table/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/dump-table/.idea/misc.xml b/apps/dump-table/.idea/misc.xml index a628023bfa..219c0a49e1 100644 --- a/apps/dump-table/.idea/misc.xml +++ b/apps/dump-table/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/dump-table/.idea/sqlDataSources.xml b/apps/dump-table/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..796ff8cf62 --- /dev/null +++ b/apps/dump-table/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/dump-table/docker-compose.tests.yml b/apps/dump-table/docker-compose.tests.yml index 946442019d..7940757394 100644 --- a/apps/dump-table/docker-compose.tests.yml +++ b/apps/dump-table/docker-compose.tests.yml @@ -43,5 +43,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/podcast-fetch-episode/.dockerignore b/apps/elasticsearch-base/.dockerignore similarity 
index 100% rename from apps/podcast-fetch-episode/.dockerignore rename to apps/elasticsearch-base/.dockerignore diff --git a/apps/elasticsearch-base/Dockerfile b/apps/elasticsearch-base/Dockerfile new file mode 100644 index 0000000000..07207b0d51 --- /dev/null +++ b/apps/elasticsearch-base/Dockerfile @@ -0,0 +1,69 @@ +# +# Base image for Elasticsearch +# + +FROM gcr.io/mcback/java-base:latest + +# Install Elasticsearch +# (https://www.elastic.co/downloads/elasticsearch-no-jdk) +ENV MC_ELASTICSEARCH_VERSION=7.10.2 +RUN \ + mkdir -p /opt/elasticsearch/ && \ + curl --fail --location --retry 3 --retry-delay 5 "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${MC_ELASTICSEARCH_VERSION}-no-jdk-linux-x86_64.tar.gz" | \ + tar -zx -C /opt/elasticsearch/ --strip 1 && \ + true + +# Add unprivileged user the service will run as +RUN useradd -ms /bin/bash elasticsearch + +RUN \ + # + # Data directory + mkdir -p /var/lib/elasticsearch/ && \ + mkdir -p /var/lib/elasticsearch/jvm-heapdumps/ && \ + mkdir -p /var/lib/elasticsearch/jvm-gc-logs/ && \ + chown -R elasticsearch:elasticsearch /var/lib/elasticsearch/ && \ + # + # JVM options directory + mkdir -p /opt/elasticsearch/config/jvm.options.d/ && \ + chmod 775 /opt/elasticsearch/config/jvm.options.d/ && \ + # + true + +COPY config/* /opt/elasticsearch/config/ +COPY bin/* /opt/elasticsearch/bin/ + +# Create keystore and move it to data volume +RUN \ + rm -f /opt/elasticsearch/config/elasticsearch.keystore && \ + rm -f /var/lib/elasticsearch/elasticsearch.keystore && \ + /opt/elasticsearch/bin/elasticsearch-keystore create && \ + mv /opt/elasticsearch/config/elasticsearch.keystore /var/lib/elasticsearch/ && \ + ln -s /var/lib/elasticsearch/elasticsearch.keystore /opt/elasticsearch/config/elasticsearch.keystore && \ + chown elasticsearch:elasticsearch /var/lib/elasticsearch/elasticsearch.keystore && \ + # + # Keystore tool will want to write a "temporary" keystore: + # + # ERROR: unable to create temporary keystore 
at + # [/opt/elasticsearch/config/elasticsearch.keystore.tmp], + # write permissions required for [/opt/elasticsearch/config] + # or run [elasticsearch-keystore upgrade] + # + # Plus the S3 plugin insists at writing to other locations too. + # + chown -R elasticsearch:elasticsearch /opt/elasticsearch/config/ && \ + # + true + +USER elasticsearch + +# Elasticsearch HTTP +EXPOSE 9200 + +# Elasticsearch TCP transport +EXPOSE 9300 + +# No "VOLUME /var/lib/elasticsearch" here because sub-images might want to +# pre-init the volume with some data + +CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] diff --git a/apps/elasticsearch-base/bin/elasticsearch.sh b/apps/elasticsearch-base/bin/elasticsearch.sh new file mode 100755 index 0000000000..4dcc391452 --- /dev/null +++ b/apps/elasticsearch-base/bin/elasticsearch.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e +set -u + +# https://www.elastic.co/guide/en/elasticsearch/reference/current/max-number-of-threads.html +if [ "$(ulimit -u)" != "unlimited" ] && [ $(ulimit -u) -lt 4096 ]; then + echo "Process limit (ulimit -u) is too low." + exit 1 +fi + +# https://www.elastic.co/guide/en/elasticsearch/reference/current/file-descriptors.html +if [ "$(ulimit -n -S)" != "unlimited" ] && [ $(ulimit -n -S) -lt 65535 ]; then + echo "Soft open file limit (ulimit -n -S) is too low." + exit 1 +fi +if [ "$(ulimit -n -H)" != "unlimited" ] && [ $(ulimit -n -H) -lt 65535 ]; then + echo "Hard open file limit (ulimit -n -H) is too low." + exit 1 +fi + +# "Set Xmx and Xms to no more than 50% of your physical RAM." 
+MC_RAM_SIZE=$(/container_memory_limit.sh) +MC_ELASTICSEARCH_MS=$((MC_RAM_SIZE / 10 * 4)) +MC_ELASTICSEARCH_MX="${MC_ELASTICSEARCH_MS}" + +export ES_JAVA_OPTS="" + +# Memory limits +export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xms${MC_ELASTICSEARCH_MS}m" +export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xmx${MC_ELASTICSEARCH_MX}m" + +# Run Elasticsearch +exec /opt/elasticsearch/bin/elasticsearch diff --git a/apps/elasticsearch-base/config/.dockerignore b/apps/elasticsearch-base/config/.dockerignore new file mode 100644 index 0000000000..b3c0a37b66 --- /dev/null +++ b/apps/elasticsearch-base/config/.dockerignore @@ -0,0 +1 @@ +elasticsearch.keystore diff --git a/apps/elasticsearch-base/config/.gitignore b/apps/elasticsearch-base/config/.gitignore new file mode 100644 index 0000000000..3eb03f777e --- /dev/null +++ b/apps/elasticsearch-base/config/.gitignore @@ -0,0 +1,3 @@ +# Might get created by a Docker container +elasticsearch.keystore + diff --git a/apps/elk-elasticsearch/config/elasticsearch.yml b/apps/elasticsearch-base/config/elasticsearch-base.yml similarity index 68% rename from apps/elk-elasticsearch/config/elasticsearch.yml rename to apps/elasticsearch-base/config/elasticsearch-base.yml index 12fc1f5b1c..3e7ad2dfea 100644 --- a/apps/elk-elasticsearch/config/elasticsearch.yml +++ b/apps/elasticsearch-base/config/elasticsearch-base.yml @@ -1,5 +1,3 @@ -cluster.name: elk-elasticsearch -node.name: elk-elasticsearch path.data: /var/lib/elasticsearch network.host: 0.0.0.0 http.port: 9200 @@ -8,8 +6,3 @@ transport.port: 9300 # Use single node discovery in order to disable production mode and avoid bootstrap checks # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html discovery.type: single-node - -# Define S3 client for log snapshots -s3.client: - elk_logs: - protocol: https diff --git a/apps/elk-elasticsearch/config/java.policy b/apps/elasticsearch-base/config/java.policy similarity index 100% rename from 
apps/elk-elasticsearch/config/java.policy rename to apps/elasticsearch-base/config/java.policy diff --git a/apps/elk-elasticsearch/config/jvm.options b/apps/elasticsearch-base/config/jvm.options similarity index 95% rename from apps/elk-elasticsearch/config/jvm.options rename to apps/elasticsearch-base/config/jvm.options index c15568722a..3590c3bb45 100644 --- a/apps/elk-elasticsearch/config/jvm.options +++ b/apps/elasticsearch-base/config/jvm.options @@ -18,7 +18,7 @@ # has sufficient space -XX:HeapDumpPath=/var/lib/elasticsearch/jvm-heapdumps/ -# Update policy for S3 plugin to work +# Update policy for plugins to work -Djava.security.policy=/opt/elasticsearch/config/java.policy # Log JVM errors to STDERR diff --git a/apps/elk-elasticsearch/config/log4j2.properties b/apps/elasticsearch-base/config/log4j2.properties similarity index 100% rename from apps/elk-elasticsearch/config/log4j2.properties rename to apps/elasticsearch-base/config/log4j2.properties diff --git a/apps/elk-elasticsearch/Dockerfile b/apps/elk-elasticsearch/Dockerfile index 4dd588dc44..754ccb43b6 100644 --- a/apps/elk-elasticsearch/Dockerfile +++ b/apps/elk-elasticsearch/Dockerfile @@ -2,77 +2,36 @@ # Elasticsearch for ELK logging stack # -FROM gcr.io/mcback/java-base:latest +FROM gcr.io/mcback/elasticsearch-base:latest -# Install Elasticsearch -# (https://www.elastic.co/downloads/elasticsearch-no-jdk) -ENV ELK_ELASTICSEARCH_VERSION=7.10.2 -RUN \ - mkdir -p /opt/elasticsearch/ && \ - curl --fail --location --retry 3 --retry-delay 5 "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ELK_ELASTICSEARCH_VERSION}-no-jdk-linux-x86_64.tar.gz" | \ - tar -zx -C /opt/elasticsearch/ --strip 1 && \ - true +USER root # Install Elasticsearch Amazon S3 plugin for ILS archival # (we use curl to be able to configure retries and such) RUN \ - curl --fail --location --retry 3 --retry-delay 5 
"https://artifacts.elastic.co/downloads/elasticsearch-plugins/repository-s3/repository-s3-${ELK_ELASTICSEARCH_VERSION}.zip" > \ + curl --fail --location --retry 3 --retry-delay 5 "https://artifacts.elastic.co/downloads/elasticsearch-plugins/repository-s3/repository-s3-${MC_ELASTICSEARCH_VERSION}.zip" > \ /var/tmp/es-s3-plugin.zip && \ /opt/elasticsearch/bin/elasticsearch-plugin install --batch file:///var/tmp/es-s3-plugin.zip && \ rm /var/tmp/es-s3-plugin.zip && \ true -# Add unprivileged user the service will run as -RUN useradd -ms /bin/bash elk - -RUN \ - # - # Data directory - mkdir -p /var/lib/elasticsearch/ && \ - mkdir -p /var/lib/elasticsearch/jvm-heapdumps/ && \ - mkdir -p /var/lib/elasticsearch/jvm-gc-logs/ && \ - chown -R elk:elk /var/lib/elasticsearch/ && \ - # - # JVM options directory - mkdir -p /opt/elasticsearch/config/jvm.options.d/ && \ - chmod 775 /opt/elasticsearch/config/jvm.options.d/ && \ - # - true - COPY config/* /opt/elasticsearch/config/ COPY bin/* /opt/elasticsearch/bin/ # Create keystore and move it to data volume RUN \ - rm -f /opt/elasticsearch/config/elasticsearch.keystore && \ - rm -f /var/lib/elasticsearch/elasticsearch.keystore && \ - /opt/elasticsearch/bin/elasticsearch-keystore create && \ - mv /opt/elasticsearch/config/elasticsearch.keystore /var/lib/elasticsearch/ && \ - ln -s /var/lib/elasticsearch/elasticsearch.keystore /opt/elasticsearch/config/elasticsearch.keystore && \ - chown elk:elk /var/lib/elasticsearch/elasticsearch.keystore && \ - # - # Keystore tool will want to write a "temporary" keystore: - # - # ERROR: unable to create temporary keystore at - # [/opt/elasticsearch/config/elasticsearch.keystore.tmp], - # write permissions required for [/opt/elasticsearch/config] - # or run [elasticsearch-keystore upgrade] # - # Plus the S3 plugin insists at writing to other locations too. 
- # - chown -R elk:elk /opt/elasticsearch/config/ && \ + # Merge base and ELK configs into one + cat \ + /opt/elasticsearch/config/elasticsearch-base.yml \ + /opt/elasticsearch/config/elk-elasticsearch.yml \ + > /opt/elasticsearch/config/elasticsearch.yml && \ # true -USER elk - -# Elasticsearch HTTP -EXPOSE 9200 - -# Elasticsearch TCP transport -EXPOSE 9300 +USER elasticsearch # Elasticsearch data VOLUME /var/lib/elasticsearch -CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] +CMD ["/opt/elasticsearch/bin/elk-elasticsearch.sh"] diff --git a/apps/elk-elasticsearch/bin/elasticsearch.sh b/apps/elk-elasticsearch/bin/elk-elasticsearch.sh similarity index 56% rename from apps/elk-elasticsearch/bin/elasticsearch.sh rename to apps/elk-elasticsearch/bin/elk-elasticsearch.sh index 5681a63dc0..677a2dcd4d 100755 --- a/apps/elk-elasticsearch/bin/elasticsearch.sh +++ b/apps/elk-elasticsearch/bin/elk-elasticsearch.sh @@ -24,22 +24,6 @@ fi set -u -# https://www.elastic.co/guide/en/elasticsearch/reference/current/max-number-of-threads.html -if [ "$(ulimit -u)" != "unlimited" ] && [ $(ulimit -u) -lt 4096 ]; then - echo "Process limit (ulimit -u) is too low." - exit 1 -fi - -# https://www.elastic.co/guide/en/elasticsearch/reference/current/file-descriptors.html -if [ "$(ulimit -n -S)" != "unlimited" ] && [ $(ulimit -n -S) -lt 65535 ]; then - echo "Soft open file limit (ulimit -n -S) is too low." - exit 1 -fi -if [ "$(ulimit -n -H)" != "unlimited" ] && [ $(ulimit -n -H) -lt 65535 ]; then - echo "Hard open file limit (ulimit -n -H) is too low." - exit 1 -fi - # Update AWS credentials in a keystore echo "Update AWS credentials in a keystore..." echo -n "${MC_ELK_ELASTICSEARCH_SNAPSHOT_S3_ACCESS_KEY_ID}" | \ @@ -55,16 +39,5 @@ if [ ! -f /var/lib/elasticsearch/s3-snapshots-setup ]; then touch /var/lib/elasticsearch/s3-snapshots-setup fi -# "Set Xmx and Xms to no more than 50% of your physical RAM." 
-MC_RAM_SIZE=$(/container_memory_limit.sh) -MC_ELASTICSEARCH_MS=$((MC_RAM_SIZE / 10 * 4)) -MC_ELASTICSEARCH_MX="${MC_ELASTICSEARCH_MS}" - -export ES_JAVA_OPTS="" - -# Memory limits -export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xms${MC_ELASTICSEARCH_MS}m" -export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xmx${MC_ELASTICSEARCH_MX}m" - -# Run Elasticsearch -exec /opt/elasticsearch/bin/elasticsearch +# Run Elasticsearch wrapper script +exec /opt/elasticsearch/bin/elasticsearch.sh diff --git a/apps/elk-elasticsearch/config/elk-elasticsearch.yml b/apps/elk-elasticsearch/config/elk-elasticsearch.yml new file mode 100644 index 0000000000..68c42c5625 --- /dev/null +++ b/apps/elk-elasticsearch/config/elk-elasticsearch.yml @@ -0,0 +1,7 @@ +cluster.name: elk-elasticsearch +node.name: elk-elasticsearch + +# Define S3 client for log snapshots +s3.client: + elk_logs: + protocol: https diff --git a/apps/elk-kibana/docker-compose.tests.yml b/apps/elk-kibana/docker-compose.tests.yml index 26eb31d2c5..d98bca2078 100644 --- a/apps/elk-kibana/docker-compose.tests.yml +++ b/apps/elk-kibana/docker-compose.tests.yml @@ -49,11 +49,12 @@ services: - "9300:9300" volumes: - type: bind - source: ./../elk-elasticsearch/bin/elasticsearch.sh - target: /opt/elasticsearch/bin/elasticsearch.sh + source: ./../elk-elasticsearch/bin/elk-elasticsearch.sh + target: /opt/elasticsearch/bin/elk-elasticsearch.sh - type: bind - source: ./../elk-elasticsearch/config/ - target: /opt/elasticsearch/config/ + source: ./../elasticsearch-base/bin/elasticsearch.sh + target: /opt/elasticsearch/bin/elasticsearch.sh + # Not mounting config as it gets concatenated into a single file # Limit CPUs and RAM for the process to not get too greedy deploy: resources: diff --git a/apps/export-tables-to-backup-crawler/.idea/export-tables-to-backup-crawler.iml b/apps/export-tables-to-backup-crawler/.idea/export-tables-to-backup-crawler.iml index a44aec3bcd..3d9fff1090 100644 --- 
a/apps/export-tables-to-backup-crawler/.idea/export-tables-to-backup-crawler.iml +++ b/apps/export-tables-to-backup-crawler/.idea/export-tables-to-backup-crawler.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/export-tables-to-backup-crawler/.idea/mediawords.sql b/apps/export-tables-to-backup-crawler/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/export-tables-to-backup-crawler/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/export-tables-to-backup-crawler/.idea/misc.xml b/apps/export-tables-to-backup-crawler/.idea/misc.xml index 4c079ac3cc..612d8fa8cb 100644 --- a/apps/export-tables-to-backup-crawler/.idea/misc.xml +++ b/apps/export-tables-to-backup-crawler/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/export-tables-to-backup-crawler/.idea/sqlDataSources.xml b/apps/export-tables-to-backup-crawler/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..1206505a61 --- /dev/null +++ b/apps/export-tables-to-backup-crawler/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/export-tables-to-backup-crawler/docker-compose.tests.yml b/apps/export-tables-to-backup-crawler/docker-compose.tests.yml index fa6ca3e3dc..bec36c4c46 100644 --- a/apps/export-tables-to-backup-crawler/docker-compose.tests.yml +++ b/apps/export-tables-to-backup-crawler/docker-compose.tests.yml @@ -49,5 +49,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/extract-and-vector/.idea/extract-and-vector.iml b/apps/extract-and-vector/.idea/extract-and-vector.iml index 6c34b252d7..374cee519e 100644 --- a/apps/extract-and-vector/.idea/extract-and-vector.iml +++ 
b/apps/extract-and-vector/.idea/extract-and-vector.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/extract-and-vector/.idea/mediawords.sql b/apps/extract-and-vector/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/extract-and-vector/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/extract-and-vector/.idea/misc.xml b/apps/extract-and-vector/.idea/misc.xml index 2b539d222a..c8744769fb 100644 --- a/apps/extract-and-vector/.idea/misc.xml +++ b/apps/extract-and-vector/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/extract-and-vector/.idea/sqlDataSources.xml b/apps/extract-and-vector/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..e137dc53a1 --- /dev/null +++ b/apps/extract-and-vector/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/extract-and-vector/docker-compose.tests.yml b/apps/extract-and-vector/docker-compose.tests.yml index b13ba25133..dc15a36094 100644 --- a/apps/extract-and-vector/docker-compose.tests.yml +++ b/apps/extract-and-vector/docker-compose.tests.yml @@ -69,8 +69,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py index 21fd9c1078..4b99f4cce9 100755 --- a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py +++ b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py @@ -44,6 +44,9 @@ class ServerHandler(BaseHTTPRequestHandler): + # Allow HTTP/1.1 connections and so don't wait up 
on "Expect:" headers + protocol_version = "HTTP/1.1" + _API_ENDPOINT_PATH = "/extract" def __json_response(self, status: int, response: dict) -> bytes: diff --git a/apps/facebook-fetch-story-stats/.idea/facebook-fetch-story-stats.iml b/apps/facebook-fetch-story-stats/.idea/facebook-fetch-story-stats.iml index 7329c1b21b..1a562335a1 100644 --- a/apps/facebook-fetch-story-stats/.idea/facebook-fetch-story-stats.iml +++ b/apps/facebook-fetch-story-stats/.idea/facebook-fetch-story-stats.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/facebook-fetch-story-stats/.idea/mediawords.sql b/apps/facebook-fetch-story-stats/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/facebook-fetch-story-stats/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/facebook-fetch-story-stats/.idea/misc.xml b/apps/facebook-fetch-story-stats/.idea/misc.xml index a5e76612f4..501124f803 100644 --- a/apps/facebook-fetch-story-stats/.idea/misc.xml +++ b/apps/facebook-fetch-story-stats/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/facebook-fetch-story-stats/.idea/sqlDataSources.xml b/apps/facebook-fetch-story-stats/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..5a0f1f0d7a --- /dev/null +++ b/apps/facebook-fetch-story-stats/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/facebook-fetch-story-stats/docker-compose.tests.yml b/apps/facebook-fetch-story-stats/docker-compose.tests.yml index 52fd7b1b3d..917c869c08 100644 --- a/apps/facebook-fetch-story-stats/docker-compose.tests.yml +++ b/apps/facebook-fetch-story-stats/docker-compose.tests.yml @@ -53,8 +53,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: 
/etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/import-solr-data-for-testing/.idea/import-solr-data-for-testing.iml b/apps/import-solr-data-for-testing/.idea/import-solr-data-for-testing.iml index 418a15a742..4ea74d3211 100644 --- a/apps/import-solr-data-for-testing/.idea/import-solr-data-for-testing.iml +++ b/apps/import-solr-data-for-testing/.idea/import-solr-data-for-testing.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/import-solr-data-for-testing/.idea/mediawords.sql b/apps/import-solr-data-for-testing/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/import-solr-data-for-testing/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/import-solr-data-for-testing/.idea/misc.xml b/apps/import-solr-data-for-testing/.idea/misc.xml index 140198e1e6..6ad961cfaf 100644 --- a/apps/import-solr-data-for-testing/.idea/misc.xml +++ b/apps/import-solr-data-for-testing/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/import-solr-data-for-testing/.idea/sqlDataSources.xml b/apps/import-solr-data-for-testing/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..a1a49e3292 --- /dev/null +++ b/apps/import-solr-data-for-testing/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/import-solr-data-for-testing/docker-compose.tests.yml b/apps/import-solr-data-for-testing/docker-compose.tests.yml index 679bb5b161..27fc041b8c 100644 --- a/apps/import-solr-data-for-testing/docker-compose.tests.yml +++ b/apps/import-solr-data-for-testing/docker-compose.tests.yml @@ -50,8 +50,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: 
image: gcr.io/mcback/solr-shard:latest diff --git a/apps/import-solr-data/.idea/import-solr-data.iml b/apps/import-solr-data/.idea/import-solr-data.iml index a87b798c24..635c58f05c 100644 --- a/apps/import-solr-data/.idea/import-solr-data.iml +++ b/apps/import-solr-data/.idea/import-solr-data.iml @@ -9,7 +9,7 @@ - + diff --git a/apps/import-solr-data/.idea/mediawords.sql b/apps/import-solr-data/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/import-solr-data/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/import-solr-data/.idea/misc.xml b/apps/import-solr-data/.idea/misc.xml index 14bbb3ee03..4454819bf6 100644 --- a/apps/import-solr-data/.idea/misc.xml +++ b/apps/import-solr-data/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/import-solr-data/.idea/sqlDataSources.xml b/apps/import-solr-data/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..0f7b4d038e --- /dev/null +++ b/apps/import-solr-data/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/import-solr-data/docker-compose.tests.yml b/apps/import-solr-data/docker-compose.tests.yml index bb70f29a81..ebf9bfbd6d 100644 --- a/apps/import-solr-data/docker-compose.tests.yml +++ b/apps/import-solr-data/docker-compose.tests.yml @@ -75,8 +75,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/import-stories-feedly/.idea/import-stories-feedly.iml b/apps/import-stories-feedly/.idea/import-stories-feedly.iml index e3b6952c3c..61a389860d 100644 --- a/apps/import-stories-feedly/.idea/import-stories-feedly.iml +++ 
b/apps/import-stories-feedly/.idea/import-stories-feedly.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/import-stories-feedly/.idea/mediawords.sql b/apps/import-stories-feedly/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/import-stories-feedly/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/import-stories-feedly/.idea/misc.xml b/apps/import-stories-feedly/.idea/misc.xml index 884ce73432..62346b8b3c 100644 --- a/apps/import-stories-feedly/.idea/misc.xml +++ b/apps/import-stories-feedly/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/import-stories-feedly/.idea/sqlDataSources.xml b/apps/import-stories-feedly/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..56932b0cf7 --- /dev/null +++ b/apps/import-stories-feedly/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/import-stories-feedly/docker-compose.tests.yml b/apps/import-stories-feedly/docker-compose.tests.yml index a0b0c338b0..dfc441ac98 100644 --- a/apps/import-stories-feedly/docker-compose.tests.yml +++ b/apps/import-stories-feedly/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/import-stories-scrapehtml/.idea/import-stories-scrapehtml.iml b/apps/import-stories-scrapehtml/.idea/import-stories-scrapehtml.iml index c0e0968c5c..9d92328955 100644 --- a/apps/import-stories-scrapehtml/.idea/import-stories-scrapehtml.iml +++ b/apps/import-stories-scrapehtml/.idea/import-stories-scrapehtml.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/import-stories-scrapehtml/.idea/mediawords.sql b/apps/import-stories-scrapehtml/.idea/mediawords.sql new file mode 120000 
index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/import-stories-scrapehtml/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/import-stories-scrapehtml/.idea/misc.xml b/apps/import-stories-scrapehtml/.idea/misc.xml index cd8728a737..9c2430e515 100644 --- a/apps/import-stories-scrapehtml/.idea/misc.xml +++ b/apps/import-stories-scrapehtml/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/import-stories-scrapehtml/.idea/sqlDataSources.xml b/apps/import-stories-scrapehtml/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..a29f210cd2 --- /dev/null +++ b/apps/import-stories-scrapehtml/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/import-stories-scrapehtml/docker-compose.tests.yml b/apps/import-stories-scrapehtml/docker-compose.tests.yml index 7feb9e03b1..f48a47cda7 100644 --- a/apps/import-stories-scrapehtml/docker-compose.tests.yml +++ b/apps/import-stories-scrapehtml/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/mail-postfix-server/Dockerfile b/apps/mail-postfix-server/Dockerfile index 6c475f995c..039bf1b029 100644 --- a/apps/mail-postfix-server/Dockerfile +++ b/apps/mail-postfix-server/Dockerfile @@ -50,13 +50,16 @@ RUN \ # # Filter out "Received:" and some other headers postconf -e header_checks=regexp:/etc/postfix/header_checks && \ - postconf -e mime_header_checks=regexp:/etc/postfix/header_checks && \ postconf -e smtp_header_checks=regexp:/etc/postfix/header_checks && \ # # Don't require TLS as local clients are trusted postconf -e smtp_tls_security_level=may && \ postconf -e smtpd_tls_security_level=none && \ # + # Make sure default 
headers (e.g. Message-Id, date) are present + postconf -e always_add_missing_headers=yes && \ + postconf -e local_header_rewrite_clients=permit_inet_interfaces && \ + # # Disable chroot on all services as it doesn't play well with a mounted # volume, e.g. "smtpd" is unable to access libnss after a chroot and thus # resolve OpenDKIM container. diff --git a/apps/mail-postfix-server/docker-compose.tests.yml b/apps/mail-postfix-server/docker-compose.tests.yml index babdaa376d..1c51ca0677 100644 --- a/apps/mail-postfix-server/docker-compose.tests.yml +++ b/apps/mail-postfix-server/docker-compose.tests.yml @@ -4,15 +4,36 @@ services: # Service to use for testing the mail service # - # Usage: + # Steps to test: # - # host$ ./dev/run.py mail-postfix-server bash - # container$ sendmail "your@email.com" + # 1) host$ ./dev/run.py mail-postfix-server bash + # 2) (new terminal window) host$ docker ps + # 3) find container with name ending in 'mail-postfix-server-actual_1' + # 4) host$ docker exec -it some_string_mail-postfix-server-actual_1 bash + # 5) container$ ./postfix.sh + # 6) open new terminal window on your host machine + # 7) host$ docker exec -it some_string_mail-postfix-server-actual_1 bash + # 8) follow instructions at URL below to create a test mail.txt file and send to your email address from the container + # https://clients.javapipe.com/knowledgebase/132/How-to-Test-Sendmail-From-Command-Line-on-Linux.html # + # Alternatively, if you want to test via the send_email() method (https://github.com/mediacloud/backend/blob/master/apps/common/src/python/mediawords/util/mail.py#L73), + # or test changes to said method, to you can disregard steps 7-8 above and instead: + # 7) host$ docker ps + # 8) Find mail-postfix-server container ID + # 9) host$ docker exec -it some_string_mail-postfix-server + # 10) $container python3 + # 11) >> from mediawords.util.mail import * + # 12) >> test_message = Message(to='your@email.com', subject='test postfix', text_body=None, html_body='

hi

') + # 13) >> send_email(test_message) + # mail-postfix-server: image: gcr.io/mcback/common:latest init: true stop_signal: SIGKILL + volumes: + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ depends_on: - mail-postfix-server-actual @@ -21,7 +42,7 @@ services: image: gcr.io/mcback/mail-postfix-server:latest init: true stop_signal: SIGKILL - # "docker exec" into a container and run Postfix manually (/postfix.sh): + # "docker exec" into a container and run Postfix manually (./postfix.sh): command: sleep infinity # To be able to set /proc/sys/kernel/yama/ptrace_scope: privileged: true diff --git a/apps/mail-postfix-server/header_checks b/apps/mail-postfix-server/header_checks index 0b5347f5fe..d23d0795d6 100644 --- a/apps/mail-postfix-server/header_checks +++ b/apps/mail-postfix-server/header_checks @@ -1,5 +1,4 @@ /^Received:.*with ESMTP / IGNORE /^X-Originating-IP:/ IGNORE /^X-Mailer:/ IGNORE -/^Mime-Version:/ IGNORE /^User-Agent:/ IGNORE \ No newline at end of file diff --git a/apps/munin-cron/Dockerfile b/apps/munin-cron/Dockerfile index 27e617e49f..ab512c923f 100644 --- a/apps/munin-cron/Dockerfile +++ b/apps/munin-cron/Dockerfile @@ -4,6 +4,9 @@ FROM gcr.io/mcback/cron-base:latest +# FIXME +RUN apt-get -y update + # Install packages RUN \ # @@ -43,6 +46,8 @@ COPY munin-conf.d/ /etc/munin/munin-conf.d/ # Overwrite crontab with our own COPY crontab /etc/cron.d/munin +COPY bin/munin-cron.sh / + # Volume for RRD data (shared with munin-fastcgi-graph) VOLUME /var/lib/munin/ @@ -50,3 +55,6 @@ VOLUME /var/lib/munin/ VOLUME /var/cache/munin/www/ # No USER because Cron will run the script as unprivileged user itself + +# Use our own wrapper for +CMD ["/munin-cron.sh"] diff --git a/apps/munin-cron/bin/munin-cron.sh b/apps/munin-cron/bin/munin-cron.sh new file mode 100755 index 0000000000..a44d39d8a4 --- /dev/null +++ b/apps/munin-cron/bin/munin-cron.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e + +if [ -z "$MC_MUNIN_CRON_ALERT_EMAIL" ]; then + 
echo "MC_MUNIN_CRON_ALERT_EMAIL (email address to send email alerts to) is not set." + exit 1 +fi + +set -u + +# Set up alerting +ALERTS_CONF_FILE="/etc/munin/munin-conf.d/alerts.conf" +echo -n > "${ALERTS_CONF_FILE}" +chmod 644 "${ALERTS_CONF_FILE}" + +# Pretty weird way to print a bunch of dollar signs to a file but Munin doesn't make it easy +echo -n 'contact.mediacloud.command ' >> "${ALERTS_CONF_FILE}" +echo -n 'mail -s "[Munin] ' >> "${ALERTS_CONF_FILE}" +echo -n '${if:cfields CRITICAL}${if:wfields WARNING}' >> "${ALERTS_CONF_FILE}" +echo -n '${if:fofields OK}${if:ufields UNKNOWN}' >> "${ALERTS_CONF_FILE}" +echo -n ' -> ${var:graph_title} ' >> "${ALERTS_CONF_FILE}" +echo -n '${if:wfields -> ${loop<,>:wfields ${var:label}=${var:value}}}' >> "${ALERTS_CONF_FILE}" +echo -n '${if:cfields -> ${loop<,>:cfields ${var:label}=${var:value}}}' >> "${ALERTS_CONF_FILE}" +echo -n '${if:fofields -> ${loop<,>:fofields ${var:label}=${var:value}}}' >> "${ALERTS_CONF_FILE}" +echo -n '" ' >> "${ALERTS_CONF_FILE}" + +# Escape "@" +echo -n "${MC_MUNIN_CRON_ALERT_EMAIL}" | sed 's/@/\\@/g' >> "${ALERTS_CONF_FILE}" + +echo >> "${ALERTS_CONF_FILE}" + +# Start Cron daemon wrapper from cron-base +exec /cron.sh diff --git a/apps/munin-cron/docker-compose.tests.yml b/apps/munin-cron/docker-compose.tests.yml index e24e465693..055e88673f 100644 --- a/apps/munin-cron/docker-compose.tests.yml +++ b/apps/munin-cron/docker-compose.tests.yml @@ -6,10 +6,15 @@ services: image: gcr.io/mcback/munin-cron:latest init: true stop_signal: SIGKILL + environment: + MC_MUNIN_CRON_ALERT_EMAIL: "alerts@testmediacloud.ml" volumes: - type: bind - source: ./munin-conf.d/ - target: /etc/munin/munin-conf.d/ + source: ./munin-conf.d/host.conf + target: /etc/munin/munin-conf.d/host.conf + - type: bind + source: ./bin/munin-cron.sh + target: /munin-cron.sh - type: bind source: ./../cron-base/bin/cron.sh target: /cron.sh @@ -58,8 +63,8 @@ services: source: ./../postgresql-server/schema/ target: 
/opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/munin-cron/munin-conf.d/.gitignore b/apps/munin-cron/munin-conf.d/.gitignore new file mode 100644 index 0000000000..7842d2c628 --- /dev/null +++ b/apps/munin-cron/munin-conf.d/.gitignore @@ -0,0 +1,3 @@ +# Gets autogenerated on every start +alerts.conf + diff --git a/apps/munin-cron/munin-conf.d/alerts.conf b/apps/munin-cron/munin-conf.d/alerts.conf deleted file mode 100644 index 583489fc94..0000000000 --- a/apps/munin-cron/munin-conf.d/alerts.conf +++ /dev/null @@ -1,2 +0,0 @@ -contact.hroberts.command mail -s "[Munin] ${if:cfields CRITICAL}${if:wfields WARNING}${if:fofields OK}${if:ufields UNKNOWN} -> ${var:graph_title} ${if:wfields -> ${loop<,>:wfields ${var:label}=${var:value}}}${if:cfields -> ${loop<,>:cfields ${var:label}=${var:value}}}${if:fofields -> ${loop<,>:fofields ${var:label}=${var:value}}}" hroberts\@mediacloud.org -contact.lvaliukas.command mail -s "[Munin] ${if:cfields CRITICAL}${if:wfields WARNING}${if:fofields OK}${if:ufields UNKNOWN} -> ${var:graph_title} ${if:wfields -> ${loop<,>:wfields ${var:label}=${var:value}}}${if:cfields -> ${loop<,>:cfields ${var:label}=${var:value}}}${if:fofields -> ${loop<,>:fofields ${var:label}=${var:value}}}" linas\@mediacloud.org diff --git a/apps/munin-httpd/docker-compose.tests.yml b/apps/munin-httpd/docker-compose.tests.yml index 17c380dfb8..0c8403be69 100644 --- a/apps/munin-httpd/docker-compose.tests.yml +++ b/apps/munin-httpd/docker-compose.tests.yml @@ -36,6 +36,8 @@ services: image: gcr.io/mcback/munin-cron:latest init: true stop_signal: SIGKILL + environment: + MC_MUNIN_CRON_ALERT_EMAIL: "FIXME@mediacloud.org" volumes: - type: bind source: ./../munin-cron/munin-conf.d/ @@ -88,8 +90,8 @@ services: source: ./../postgresql-server/schema/ target: 
/opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/munin-node/docker-compose.tests.yml b/apps/munin-node/docker-compose.tests.yml index 9a99b0065b..d9bdc527e2 100644 --- a/apps/munin-node/docker-compose.tests.yml +++ b/apps/munin-node/docker-compose.tests.yml @@ -43,8 +43,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/munin-node/plugins/mc_postgresql_relfrozenxid_age b/apps/munin-node/plugins/mc_postgresql_relfrozenxid_age new file mode 100755 index 0000000000..67f04d92f2 --- /dev/null +++ b/apps/munin-node/plugins/mc_postgresql_relfrozenxid_age @@ -0,0 +1,51 @@ +#!/usr/bin/env perl +# +# https://blog.crunchydata.com/blog/managing-transaction-id-wraparound-in-postgresql +# + +use strict; +use warnings; + +use FindBin; +use lib $FindBin::Bin; + +use MediaWords::Munin; + +MediaWords::Munin::plugin({ + title => 'Tables with old relfrozenxid', + info => 'Number of tables with old relfrozenxid', + vlabel => 'Tables', + metrics => [ + { + label => 'Tables', + color => $COLOR1, + + # Even a single table can mess up autovacuum + critical => { to => 1 }, + + value => sub { + + my $db = shift; + + my $table_count = $db->query(< 1000000000 +SQL + )->flat->[0]; + + unless ( defined $table_count ) { + print STDERR "Unable to list tables with old relfrozenxid.\n"; + + # Returning a non-zero value to trigger the "CRITICAL" alert + return 1; + } + + return $table_count; + }, + }, + ], +}); diff --git a/apps/munin-node/plugins/mc_solr_sentences_last_day b/apps/munin-node/plugins/mc_solr_sentences_last_day deleted 
file mode 100755 index dd33519b50..0000000000 --- a/apps/munin-node/plugins/mc_solr_sentences_last_day +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -use FindBin; -use lib $FindBin::Bin; - -use MediaWords::Munin; -use JSON; -use URI; -use LWP::Simple qw($ua get); - -MediaWords::Munin::plugin({ - title => 'Solr sentences in last day', - info => 'Solr imported sentences in the last day, as reported by Solr', - vlabel => 'Count', - metrics => [ - { - label => 'Solr sentences', - color => $COLOR2, - warning => { from => sub { - if ( MediaWords::Munin::should_expect_weekend_traffic() ) { - return 2_500_000; - } else { - return 4_500_000; - } - } }, - critical => { from => sub { - if ( MediaWords::Munin::should_expect_weekend_traffic() ) { - return 2_000_000; - } else { - return 4_000_000; - } - } }, - value => sub { - - my $db = shift; - - my $solr_url = $ENV{ 'MC_SOLR_URL' }; - unless ( defined $solr_url ) { - say STDERR "Solr URL is not set, falling back to the default one."; - $solr_url = 'http://127.0.0.1:8983/solr'; - } - - my $stories_id_from_last_day = $db->query(<= NOW() - '1 day'::interval - ORDER BY collect_date - LIMIT 1 -SQL - )->flat->[0]; - unless ( defined $stories_id_from_last_day ) { - print STDERR "No stories since yesterday\n"; - return 0; - } - - my $solr_uri = URI->new( $solr_url )->canonical; - my @solr_path_segments = $solr_uri->path_segments; - push ( @solr_path_segments, 'collection1' ); - push ( @solr_path_segments, 'select' ); - $solr_uri->path_segments( @solr_path_segments ); - - $solr_uri->query_form( - q => 'stories_id:[' . $stories_id_from_last_day . ' TO *]', - rows => 0, - wt => 'json', - indent => 'true', - ); - - $ua->timeout( 10 ); - my $response; - eval { - $response = get( $solr_uri->as_string ); - }; - if ( $@ or (! $response )) { - die "Unable to get response from Solr: $@\n"; - } - - my $json_response; - eval { - $json_response = decode_json( $response ); - }; - if ( $@ or (! 
$json_response )) { - die "Unable to decode JSON response: $@\n"; - } - - my $num_found = $json_response->{ response }->{ numFound }; - unless ( defined $num_found ) { - die "Unable to read /response/numFound key"; - } - - return $num_found; - }, - }, - ], -}); diff --git a/apps/munin-node/plugins/mc_websites_up b/apps/munin-node/plugins/mc_websites_up index 8e66a06db8..869bd1f2fb 100755 --- a/apps/munin-node/plugins/mc_websites_up +++ b/apps/munin-node/plugins/mc_websites_up @@ -30,7 +30,7 @@ MediaWords::Munin::plugin({ my $response = $ua->get('https://api.mediacloud.org/api/v2/stories_public/list'); # Don't test whether request was successful (because it wasn't) - if ( $response->decoded_content =~ /Invalid API key or authentication cookie/i ) { + if ( $response->decoded_content =~ /Invalid API key/i ) { # Up return 1; } else { diff --git a/apps/nytlabels-annotator/.idea/misc.xml b/apps/nytlabels-annotator/.idea/misc.xml index 62c91cd3b1..dd2f82cf96 100644 --- a/apps/nytlabels-annotator/.idea/misc.xml +++ b/apps/nytlabels-annotator/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml b/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml index f1dab97a30..3e3e8c191c 100644 --- a/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml +++ b/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/nytlabels-annotator/Dockerfile b/apps/nytlabels-annotator/Dockerfile index 4cd13da3ba..0d33d8845b 100644 --- a/apps/nytlabels-annotator/Dockerfile +++ b/apps/nytlabels-annotator/Dockerfile @@ -5,6 +5,8 @@ FROM gcr.io/mcback/base:latest RUN \ + # FIXME remove once the base image gets updated + apt-get -y update && \ # # Install model fetch dependencies apt-get -y --no-install-recommends install brotli && \ @@ -25,29 +27,29 @@ RUN /dl_to_stdout.sh "$MODEL_URL/GoogleNews-vectors-negative300.stripped.shelve. 
RUN /dl_to_stdout.sh "$MODEL_URL/scaler.onnx" > scaler.onnx RUN /dl_to_stdout.sh "$MODEL_URL/all_descriptors.onnx.br" | \ - brotli -d > all_descriptors.onnx + brotli -d > allDescriptors.onnx RUN /dl_to_stdout.sh "$MODEL_URL/all_descriptors.txt.br" | \ - brotli -d > all_descriptors.txt + brotli -d > allDescriptors.txt RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_3000.onnx.br" | \ - brotli -d > descriptors_3000.onnx + brotli -d > descriptors3000.onnx RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_3000.txt.br" | \ - brotli -d > descriptors_3000.txt + brotli -d > descriptors3000.txt RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_600.onnx.br" | \ - brotli -d > descriptors_600.onnx + brotli -d > descriptors600.onnx RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_600.txt.br" | \ - brotli -d > descriptors_600.txt + brotli -d > descriptors600.txt RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_with_taxonomies.onnx.br" | \ - brotli -d > descriptors_with_taxonomies.onnx + brotli -d > descriptorsAndTaxonomies.onnx RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_with_taxonomies.txt.br" | \ - brotli -d > descriptors_with_taxonomies.txt + brotli -d > descriptorsAndTaxonomies.txt RUN /dl_to_stdout.sh "$MODEL_URL/just_taxonomies.onnx.br" | \ - brotli -d > just_taxonomies.onnx + brotli -d > taxonomies.onnx RUN /dl_to_stdout.sh "$MODEL_URL/just_taxonomies.txt.br" | \ - brotli -d > just_taxonomies.txt + brotli -d > taxonomies.txt # Install NLTK data RUN \ @@ -84,6 +86,11 @@ RUN \ WORKDIR /usr/src/crappy-predict-news-labels/ COPY src/crappy-predict-news-labels/requirements.txt /usr/src/crappy-predict-news-labels/ RUN \ + # + # OpenMP for onnxruntime speed up + apt-get -y --no-install-recommends install libgomp1 && \ + # + # The rest pip3 install -r requirements.txt && \ rm -rf /root/.cache/ && \ true @@ -103,4 +110,4 @@ STOPSIGNAL SIGTERM USER nobody -CMD ["nytlabels_http_server.py"] +CMD ["nytlabels.sh"] diff --git a/apps/nytlabels-annotator/README.md b/apps/nytlabels-annotator/README.md index 
33df58c79f..f20423ebe3 100644 --- a/apps/nytlabels-annotator/README.md +++ b/apps/nytlabels-annotator/README.md @@ -55,6 +55,14 @@ and then `POST` said file as JSON to the annotator: ```bash echo '{}' | \ jq --arg key0 text --arg value0 "$(cat test.txt)" '. | .[$key0]=$value0' | \ - curl --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json | \ + curl --verbose --silent --trace-time --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json | \ jq ".descriptors600" ``` + +Alternatively, to try out just the `descriptors600` model: + +```bash +echo '{"models": ["descriptors600"]}' | \ + jq --arg key0 text --arg value0 "$(cat test.txt)" '. | .[$key0]=$value0' | \ + curl --verbose --silent --trace-time --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json +``` diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py index c07629c68c..d7a2911689 100644 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py @@ -1,7 +1,8 @@ import dataclasses +import multiprocessing import os import shelve -from typing import List +from typing import List, Optional from nltk.data import load as load_nltk_data from nltk.tokenize.destructive import NLTKWordTokenizer @@ -87,13 +88,28 @@ class MultiLabelPredict(object): '_embedding_size', ] - def __init__(self, model_path: str, labels_path: str): + def __init__(self, model_path: str, labels_path: str, num_threads: Optional[int] = None): if not os.path.isfile(model_path): raise RuntimeError(f"Model was not found in {model_path}") if not os.path.isfile(labels_path): raise RuntimeError(f"Model labels were not found in {labels_path}") - self._model = onnxruntime.InferenceSession(model_path) + if num_threads is None: + 
num_threads = multiprocessing.cpu_count() + + options = onnxruntime.SessionOptions() + options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + + # Seems to be slightly slower: + # options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL + + # Not really used without ORT_PARALLEL: + options.inter_op_num_threads = num_threads + options.intra_op_num_threads = num_threads + + options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + + self._model = onnxruntime.InferenceSession(path_or_bytes=model_path, sess_options=options) self._labels = open(labels_path, 'r').read().splitlines() _, self._sample_length, self._embedding_size = self._model.get_inputs()[0].shape diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.sh b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.sh new file mode 100755 index 0000000000..73263c2105 --- /dev/null +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -u +set -e + +PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +exec "$PWD/nytlabels_http_server.py" --num_threads "$(/container_cpu_limit.sh)" diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py index 0f0205ae41..951de5e6b5 100755 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py @@ -4,41 +4,35 @@ NYTLabels annotator HTTP service. 
""" -import dataclasses +import argparse import json -import operator import os +import pprint from http import HTTPStatus from http.server import HTTPServer, BaseHTTPRequestHandler -from sys import argv -from typing import Union, Dict, List +from typing import Union, Dict, List, Optional, Type from self_test_input import SELF_TEST_INPUT from nytlabels import Text2ScaledVectors, MultiLabelPredict - -@dataclasses.dataclass(frozen=True) -class _ModelDescriptor(object): - basename: str - json_key: str +# For each key there must exist a model ONNX file and a list of labels with a given basename +ALL_MODELS = [ + 'allDescriptors', + 'descriptors3000', + 'descriptors600', + 'descriptorsAndTaxonomies', + 'taxonomies', +] class _Predictor(object): __slots__ = [ - 'text2vectors', - 'models', - ] - - _MODEL_DESCRIPTORS = [ - _ModelDescriptor(basename='all_descriptors', json_key='allDescriptors'), - _ModelDescriptor(basename='descriptors_3000', json_key='descriptors3000'), - _ModelDescriptor(basename='descriptors_600', json_key='descriptors600'), - _ModelDescriptor(basename='descriptors_with_taxonomies', json_key='descriptorsAndTaxonomies'), - _ModelDescriptor(basename='just_taxonomies', json_key='taxonomies'), + '__text2vectors', + '__models', ] - def __init__(self): + def __init__(self, num_threads: Optional[int]): pwd = os.path.dirname(os.path.abspath(__file__)) models_dir = os.path.join(pwd, 'models') @@ -46,24 +40,25 @@ def __init__(self): raise RuntimeError(f"Models path should be directory: {models_dir}") print("Loading scaler and word2vec...") - self.text2vectors = Text2ScaledVectors( + self.__text2vectors = Text2ScaledVectors( word2vec_shelve_path=os.path.join(models_dir, 'GoogleNews-vectors-negative300.stripped.shelve'), scaler_path=os.path.join(models_dir, 'scaler.onnx'), ) print("Scaler and word2vec loaded.") print("Loading models...") - self.models = dict() + self.__models = dict() # Make sure all models have the sample sample length and embedding size as we 
vector text only once sample_length = None embedding_size = None - for model_descriptor in self._MODEL_DESCRIPTORS: - print(f" Loading '{model_descriptor.basename}'...") + for model_name in ALL_MODELS: + print(f" Loading '{model_name}'...") model = MultiLabelPredict( - model_path=os.path.join(models_dir, f"{model_descriptor.basename}.onnx"), - labels_path=os.path.join(models_dir, f"{model_descriptor.basename}.txt"), + model_path=os.path.join(models_dir, f"{model_name}.onnx"), + labels_path=os.path.join(models_dir, f"{model_name}.txt"), + num_threads=num_threads, ) if sample_length and embedding_size: @@ -72,39 +67,82 @@ def __init__(self): sample_length = model.sample_length() embedding_size = model.embedding_size() - self.models[model_descriptor] = model + self.__models[model_name] = model print("Models loaded.") print("Running self-test...\n") - vectors = self.text2vectors.transform( - text=SELF_TEST_INPUT, + test_result = self.predict(text=SELF_TEST_INPUT, enabled_model_names=ALL_MODELS) + pp = pprint.PrettyPrinter(indent=4, width=1024) + pp.pprint(test_result) + print("Done running self-test.") + + def predict(self, text: str, enabled_model_names: List[str]) -> Dict[str, List[Dict[str, str]]]: + + # Sample length / embedding size is the same for all models + first_model = self.__models[list(self.__models.keys())[0]] + sample_length = first_model.sample_length() + embedding_size = first_model.embedding_size() + + vectors = self.__text2vectors.transform( + text=text, sample_length=sample_length, embedding_size=embedding_size, ) - for model_descriptor in sorted(self._MODEL_DESCRIPTORS, key=operator.attrgetter('basename')): - print(f"Model '{model_descriptor.basename}':") - model = self.models[model_descriptor] + + result = dict() + + for model_name in enabled_model_names: + model = self.__models[model_name] predictions = model.predict(x_matrix=vectors) - for prediction in predictions: - print(f" * Label: {prediction.label}, score: {prediction.score:.6f}") - assert 
len(predictions), f"Some predictions should be returned by {model.__class__.__name__}" - print() - print("Done running self-test.") + result[model_name] = [ + {'label': x.label, 'score': "{0:.5f}".format(x.score)} for x in predictions + ] + + return result # noinspection PyPep8Naming class NYTLabelsRequestHandler(BaseHTTPRequestHandler): - _PREDICTOR = _Predictor() + # Allow HTTP/1.1 connections and so don't wait up on "Expect:" headers + protocol_version = "HTTP/1.1" + + _PREDICTOR = None + + @classmethod + def initialize_predictor(cls, num_threads: Optional[int]) -> None: + assert not cls._PREDICTOR, "Predictor is already initialized." + cls._PREDICTOR = _Predictor(num_threads=num_threads) + + def __init__(self, *args, **kwargs): + assert self._PREDICTOR, "You need to initialize the predictor before setting this class as a request handler." + super(NYTLabelsRequestHandler, self).__init__(*args, **kwargs) def __respond(self, http_status: int, response: Union[dict, list]): + raw_response = json.dumps(response).encode('utf-8') self.send_response(http_status) self.send_header('Content-Type', 'application/json; charset=UTF-8') + self.send_header('Content-Length', str(len(raw_response))) self.end_headers() - self.wfile.write(json.dumps(response).encode('utf-8')) + self.wfile.write(raw_response) def __respond_with_error(self, http_status: int, message: str): self.__respond(http_status=http_status, response={'error': message}) + # If the request handler's protocol_version is set to "HTTP/1.0" (the default) and the client tries connecting via + # HTTP/1.1 and sends an "Expect: 100-continue" header, the client will then wait for a bit (curl waits for a second) + # for "100 Continue" which the server will never send (due to it being configured to support HTTP/1.0 only), + # therefore the whole request will take a one whole second more. + # + # Please note that when enabling HTTP/1.1, one has to send Content-Length in their responses. 
+ def __check_expect_header(self): + expect = self.headers.get('Expect', "") + if expect.lower() == "100-continue": + if not (self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): + print(( + "WARNING: due to server / client misconfiguration, client sent Expect: header " + "and is waiting for a response, possibly delaying the whole request.""" + )) + def do_GET(self): # noinspection PyUnresolvedReferences self.__respond_with_error(http_status=HTTPStatus.BAD_REQUEST.value, message='GET requests are not supported.') @@ -113,30 +151,10 @@ def do_HEAD(self): # noinspection PyUnresolvedReferences self.__respond_with_error(http_status=HTTPStatus.BAD_REQUEST.value, message='HEAD requests are not supported.') - def _predict(self, text: str) -> Dict[str, List[Dict[str, str]]]: - - # Sample length / embedding size is the same for all models - first_model = self._PREDICTOR.models[list(self._PREDICTOR.models.keys())[0]] - sample_length = first_model.sample_length() - embedding_size = first_model.embedding_size() - - vectors = self._PREDICTOR.text2vectors.transform( - text=text, - sample_length=sample_length, - embedding_size=embedding_size, - ) - - result = dict() - - for model_descriptor, model in self._PREDICTOR.models.items(): - predictions = model.predict(x_matrix=vectors) - result[model_descriptor.json_key] = [ - {'label': x.label, 'score': "{0:.5f}".format(x.score)} for x in predictions - ] + def do_POST(self): - return result + self.__check_expect_header() - def do_POST(self): content_length = int(self.headers.get('Content-Length', 0)) if not content_length: # noinspection PyUnresolvedReferences @@ -174,8 +192,39 @@ def do_POST(self): ) return + models = payload.get('models', None) + if models is None: + enabled_model_names = ALL_MODELS + else: + enabled_model_names = [] + for model_name in models: + if model_name not in ALL_MODELS: + # noinspection PyUnresolvedReferences + self.__respond_with_error( + http_status=HTTPStatus.BAD_REQUEST.value, + 
message=f"Model '{model_name}' was not found.", + ) + return + if model_name in enabled_model_names: + # noinspection PyUnresolvedReferences + self.__respond_with_error( + http_status=HTTPStatus.BAD_REQUEST.value, + message=f"Model '{model_name}' is duplicate.", + ) + return + + enabled_model_names.append(model_name) + + if not enabled_model_names: + # noinspection PyUnresolvedReferences + self.__respond_with_error( + http_status=HTTPStatus.BAD_REQUEST.value, + message="List of enabled models is empty.", + ) + return + try: - result = self._predict(text) + result = self._PREDICTOR.predict(text=text, enabled_model_names=enabled_model_names) except Exception as ex: # noinspection PyUnresolvedReferences self.__respond_with_error( @@ -187,15 +236,29 @@ def do_POST(self): self.__respond(http_status=HTTPStatus.OK, response=result) -def run(port: int = 8080): - server_address = ('', port) - httpd = HTTPServer(server_address, NYTLabelsRequestHandler) - print(f'Starting NYTLabels annotator on port {port}...') +def make_nytlabels_request_handler_class(num_threads: Optional[int]) -> Type[NYTLabelsRequestHandler]: + class CustomNYTLabelsRequestHandler(NYTLabelsRequestHandler): + pass + + CustomNYTLabelsRequestHandler.initialize_predictor(num_threads=num_threads) + + return CustomNYTLabelsRequestHandler + + +def main(): + parser = argparse.ArgumentParser(description="Start NYTLabels annotator web service.") + parser.add_argument("-p", "--port", type=int, required=False, default=8080, + help="Port to listen to") + parser.add_argument("-t", "--num_threads", type=int, required=False, + help="Threads that the model runtime should spawn") + args = parser.parse_args() + + server_address = ('', args.port) + handler_class = make_nytlabels_request_handler_class(num_threads=args.num_threads) + httpd = HTTPServer(server_address, handler_class) + print(f'Starting NYTLabels annotator on port {args.port}...') httpd.serve_forever() if __name__ == "__main__": - if len(argv) == 2: - 
run(port=int(argv[1])) - else: - run() + main() diff --git a/apps/nytlabels-fetch-annotation-and-tag/.dockerignore b/apps/nytlabels-fetch-annotation-and-tag/.dockerignore index 752414ae9c..9b2c362a80 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/.dockerignore +++ b/apps/nytlabels-fetch-annotation-and-tag/.dockerignore @@ -89,3 +89,4 @@ sdist Temporary Items wheels _Inline + diff --git a/apps/nytlabels-fetch-annotation-and-tag/.idea/mediawords.sql b/apps/nytlabels-fetch-annotation-and-tag/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/nytlabels-fetch-annotation-and-tag/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml b/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml index bd61294576..2ac35808ab 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml +++ b/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml b/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml index 10163454cb..9a2244a452 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml +++ b/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/nytlabels-fetch-annotation-and-tag/.idea/sqlDataSources.xml b/apps/nytlabels-fetch-annotation-and-tag/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..db4a8197d2 --- /dev/null +++ b/apps/nytlabels-fetch-annotation-and-tag/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml b/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml index 
bcabb80022..785e7aacb4 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml +++ b/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/nytlabels-fetch-annotation-and-tag/src/python/nytlabels_fetch_annotation_and_tag/nytlabels_tags_from_annotation.py b/apps/nytlabels-fetch-annotation-and-tag/src/python/nytlabels_fetch_annotation_and_tag/nytlabels_tags_from_annotation.py index afae2e395f..848991973a 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/src/python/nytlabels_fetch_annotation_and_tag/nytlabels_tags_from_annotation.py +++ b/apps/nytlabels-fetch-annotation-and-tag/src/python/nytlabels_fetch_annotation_and_tag/nytlabels_tags_from_annotation.py @@ -13,6 +13,9 @@ class NYTLabelsTagsFromAnnotation(TagsFromJSONAnnotation): """Fetches NYT labels annotation and uses it to generate/store story tags.""" + # Specific model to run the input text against + _ENABLED_MODEL = 'descriptors600' + # NYTLabels version tag set __NYTLABELS_VERSION_TAG_SET = 'nyt_labels_version' @@ -36,7 +39,7 @@ def _request_for_text(self, text: str) -> Request: # Create JSON request log.debug("Converting text to JSON request...") try: - text_json = encode_json({'text': text}) + text_json = encode_json({'text': text, 'models': [self._ENABLED_MODEL]}) except Exception as ex: # Not critical, might happen to some stories, no need to shut down the annotator raise McTagsFromJSONAnnotationException( @@ -66,8 +69,8 @@ def _fetched_annotation_is_valid(self, annotation: Union[dict, list]) -> bool: log.warning("Annotation is not dict: %s" % str(annotation)) return False - if 'descriptors600' not in annotation: - log.warning("Annotation doesn't have 'descriptors600' key: %s" % str(annotation)) + 
if self._ENABLED_MODEL not in annotation: + log.warning(f"Annotation doesn't have '{self._ENABLED_MODEL}' key: {annotation}") return False return True @@ -93,10 +96,10 @@ def _tags_for_annotation(self, annotation: Union[dict, list]) -> List[TagsFromJS tags_label=nytlabels_version_tag, tags_description="Story was tagged with '%s'" % nytlabels_version_tag)) - descriptors600 = annotation.get('descriptors600', None) - if descriptors600 is not None and len(descriptors600) > 0: + descriptors = annotation.get(self._ENABLED_MODEL, None) + if descriptors is not None and len(descriptors) > 0: - for descriptor in descriptors600: + for descriptor in descriptors: label = descriptor['label'] score = float(descriptor['score']) diff --git a/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py b/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py index 2ee0c9d654..b8027b64e9 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py +++ b/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py @@ -6,6 +6,7 @@ from mediawords.util.network import random_unused_port from mediawords.util.parse_json import encode_json from mediawords.util.sql import sql_now + from nytlabels_fetch_annotation_and_tag.config import NYTLabelsTagsFromAnnotationConfig from nytlabels_fetch_annotation_and_tag.nytlabels_tags_from_annotation import NYTLabelsTagsFromAnnotation from nytlabels_fetch_annotation_and_tag.sample_data import sample_nytlabels_response, expected_nytlabels_tags @@ -14,7 +15,6 @@ class TestNYTLabelsTagsFromAnnotation(TestCase): def test_tagging(self): - db = connect_to_db() media = db.create(table='media', insert_hash={ diff --git a/apps/podcast-fetch-episode/.idea/misc.xml b/apps/podcast-fetch-episode/.idea/misc.xml deleted file mode 100644 index 64bb3a0baa..0000000000 --- a/apps/podcast-fetch-episode/.idea/misc.xml +++ /dev/null @@ 
-1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/podcast-fetch-episode.iml b/apps/podcast-fetch-episode/.idea/podcast-fetch-episode.iml deleted file mode 100644 index 526ab95d93..0000000000 --- a/apps/podcast-fetch-episode/.idea/podcast-fetch-episode.iml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/sqldialects.xml b/apps/podcast-fetch-episode/.idea/sqldialects.xml deleted file mode 100644 index 790b3f37f8..0000000000 --- a/apps/podcast-fetch-episode/.idea/sqldialects.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/webResources.xml b/apps/podcast-fetch-episode/.idea/webResources.xml deleted file mode 100644 index c30bda4153..0000000000 --- a/apps/podcast-fetch-episode/.idea/webResources.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/Dockerfile b/apps/podcast-fetch-episode/Dockerfile deleted file mode 100644 index 6bb28d4eb4..0000000000 --- a/apps/podcast-fetch-episode/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -# -# Fetch podcast episode from story, read metadata, store it to GCS -# - -FROM gcr.io/mcback/common:latest - -# Install FFmpeg for manipulating audio files -RUN apt-get -y --no-install-recommends install ffmpeg - -# Install Python dependencies -COPY src/requirements.txt /var/tmp/ -RUN \ - cd /var/tmp/ && \ - pip3 install -r requirements.txt && \ - rm requirements.txt && \ - rm -rf /root/.cache/ && \ - true - -# Copy sources -COPY src/ /opt/mediacloud/src/podcast-fetch-episode/ -ENV PERL5LIB="/opt/mediacloud/src/podcast-fetch-episode/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-fetch-episode/python:${PYTHONPATH}" - -# Copy worker script -COPY bin /opt/mediacloud/bin - -USER mediacloud - -CMD ["podcast_fetch_episode_worker.py"] diff --git 
a/apps/podcast-fetch-episode/bin/podcast_fetch_episode_worker.py b/apps/podcast-fetch-episode/bin/podcast_fetch_episode_worker.py deleted file mode 100755 index e8fc7ec433..0000000000 --- a/apps/podcast-fetch-episode/bin/podcast_fetch_episode_worker.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 - -from mediawords.db import connect_to_db -from mediawords.job import JobBroker -from mediawords.util.log import create_logger -from mediawords.util.perl import decode_object_from_bytes_if_needed -from mediawords.util.process import fatal_error - -from podcast_fetch_episode.exceptions import McPodcastFetchEpisodeSoftException -from podcast_fetch_episode.fetch_and_store import fetch_and_store_episode - -log = create_logger(__name__) - - -def run_podcast_fetch_episode(stories_id: int) -> None: - """Fetch podcast episode for story, upload it to GCS.""" - - if isinstance(stories_id, bytes): - stories_id = decode_object_from_bytes_if_needed(stories_id) - stories_id = int(stories_id) - - db = connect_to_db() - - log.info(f"Fetching podcast episode for story {stories_id}...") - - try: - fetch_and_store_episode(db=db, stories_id=stories_id) - - JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation').add_to_queue(stories_id=stories_id) - - except McPodcastFetchEpisodeSoftException as ex: - # Soft exceptions - log.error(f"Unable to fetch podcast episode for story {stories_id}: {ex}") - raise ex - except Exception as ex: - # Hard and other exceptions - fatal_error(f"Fatal / unknown error while fetching podcast episode for story {stories_id}: {ex}") - - log.info(f"Done fetching podcast episode for story {stories_id}") - - -if __name__ == '__main__': - app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchEpisode') - app.start_worker(handler=run_podcast_fetch_episode) diff --git a/apps/podcast-fetch-episode/docker-compose.tests.yml b/apps/podcast-fetch-episode/docker-compose.tests.yml deleted file mode 100644 index 599c00c076..0000000000 --- 
a/apps/podcast-fetch-episode/docker-compose.tests.yml +++ /dev/null @@ -1,59 +0,0 @@ -version: "3.7" - -services: - - podcast-fetch-episode: - image: gcr.io/mcback/podcast-fetch-episode:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: "${MC_PODCAST_FETCH_EPISODE_BUCKET_NAME}" - # Dev/test environments don't use "MC_PODCAST_FETCH_EPISODE_PATH_PREFIX" environment - # variable as they create a different, timestamped prefix for every test run. - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-fetch-episode/ - - type: bind - source: ./tests/ - target: /opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - # We don't need "rabbitmq-server" to run tests - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/config.py b/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/config.py deleted file mode 100644 index 05c7d028af..0000000000 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/config.py +++ /dev/null @@ -1,22 +0,0 @@ -from mediawords.util.config import env_value, 
file_with_env_value - - -class PodcastFetchEpisodeConfig(object): - """ - Podcast episode fetcher configuration. - """ - - @staticmethod - def gc_auth_json_file() -> str: - """Return path to Google Cloud authentication JSON file.""" - return file_with_env_value(name='MC_PODCAST_GC_AUTH_JSON_BASE64', encoded_with_base64=True) - - @staticmethod - def gc_storage_bucket_name() -> str: - """Return Google Cloud Storage bucket name.""" - return env_value(name='MC_PODCAST_FETCH_EPISODE_BUCKET_NAME') - - @staticmethod - def gc_storage_path_prefix() -> str: - """Return Google Cloud Storage path prefix under which objects will be stored.""" - return env_value(name='MC_PODCAST_FETCH_EPISODE_PATH_PREFIX') diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/exceptions.py b/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/exceptions.py deleted file mode 100644 index 9c95054ffd..0000000000 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/exceptions.py +++ /dev/null @@ -1,79 +0,0 @@ -import abc - - -class _AbstractMcPodcastFetchEpisodeException(Exception, metaclass=abc.ABCMeta): - """Abstract exception.""" - pass - - -class McPodcastFetchEpisodeSoftException(_AbstractMcPodcastFetchEpisodeException): - """Soft errors exception.""" - pass - - -class McStoryNotFoundException(McPodcastFetchEpisodeSoftException): - """Exception raised when story was not found.""" - pass - - -class McPodcastNoViableStoryEnclosuresException(McPodcastFetchEpisodeSoftException): - """Exception thrown when story has no viable enclosures to choose from.""" - pass - - -class McPodcastEnclosureTooBigException(McPodcastFetchEpisodeSoftException): - """Exception thrown when story's best viable enclosure is too big.""" - pass - - -class McPodcastFileFetchFailureException(McPodcastFetchEpisodeSoftException): - """Exception thrown when we're unable to fetch the downloaded file for whatever reason.""" - pass - - -class 
McPodcastFileIsInvalidException(McPodcastFetchEpisodeSoftException): - """Exception thrown when the fetched file is not something that we can process for whatever reason.""" - pass - - -# --- - -class McPodcastFetchEpisodeHardException(_AbstractMcPodcastFetchEpisodeException): - """Hard errors exception.""" - pass - - -class McPodcastFileStoreFailureException(McPodcastFetchEpisodeHardException): - """ - Exception thrown when we're unable to store the downloaded file for whatever reason. - - This is a hard exception as not being able to store a file means that we might be out of disk space or something - like that. - """ - pass - - -class McPodcastGCSStoreFailureException(McPodcastFetchEpisodeHardException): - """ - Exception thrown when we're unable to store an object to Google Cloud Storage. - - GCS problems, if any, are probably temporary, but still, in those cases we should retry a few times and then give up - permanently because not being able to store stuff to GCS might mean that we ran out of some sort of a limit, - credentials are wrong, etc. 
- """ - pass - - -class McPodcastMisconfiguredTranscoderException(McPodcastFetchEpisodeHardException): - """Exception thrown when something happens with the transcoder that we didn't anticipate before.""" - pass - - -class McPodcastMisconfiguredGCSException(McPodcastFetchEpisodeHardException): - """Exception thrown when something happens with Google Cloud Storage that we didn't anticipate before.""" - pass - - -class McPodcastPostgreSQLException(McPodcastFetchEpisodeHardException): - """Exception thrown on PostgreSQL errors.""" - pass diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_and_store.py b/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_and_store.py deleted file mode 100644 index 82836d1dbf..0000000000 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_and_store.py +++ /dev/null @@ -1,204 +0,0 @@ -import os -import shutil -import tempfile -from typing import Optional - -from mediawords.db import DatabaseHandler -from mediawords.util.identify_language import language_code_for_text, identification_would_be_reliable -from mediawords.util.log import create_logger -from mediawords.util.parse_html import html_strip - -from podcast_fetch_episode.bcp47_lang import iso_639_1_code_to_bcp_47_identifier -from podcast_fetch_episode.config import PodcastFetchEpisodeConfig -from podcast_fetch_episode.enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE -from podcast_fetch_episode.exceptions import ( - McStoryNotFoundException, - McPodcastNoViableStoryEnclosuresException, - McPodcastEnclosureTooBigException, - McPodcastFileStoreFailureException, - McPodcastFileFetchFailureException, - McPodcastGCSStoreFailureException, - McPodcastPostgreSQLException, -) -from podcast_fetch_episode.fetch_url import fetch_big_file -from podcast_fetch_episode.gcs_store import GCSStore -from podcast_fetch_episode.media_file import TranscodeTempDirAndFile, transcode_media_file_if_needed, media_file_info - 
-log = create_logger(__name__) - - -def _cleanup_temp_dir(temp: TranscodeTempDirAndFile) -> None: - """Clean up temporary directory or raise a hard exception.""" - try: - shutil.rmtree(temp.temp_dir) - except Exception as ex: - # Not being able to clean up after ourselves is a "hard" error as we might run out of disk space that way - raise McPodcastFileStoreFailureException(f"Unable to remove temporary directory: {ex}") - - -def fetch_and_store_episode(db: DatabaseHandler, - stories_id: int, - config: Optional[PodcastFetchEpisodeConfig] = None) -> None: - """ - Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB. - - 1) Determines the episode's likely language by looking into its title and description, converts the language code to - BCP 47; - 1) Using enclosures from "story_enclosures", chooses the one that looks like a podcast episode the most; - 2) Fetches the chosen enclosure; - 3) Transcodes the file (if needed) by: - a) converting it to an audio format that the Speech API can support, and / or - b) discarding video stream from the media file, and / or - c) discarding other audio streams from the media file; - 5) Reads the various parameters, e.g. sample rate, of the episode audio file; - 4) Uploads the episode audio file to Google Cloud Storage; - 5) Adds a row to "podcast_episodes". - - Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller. - - :param db: Database handler. - :param stories_id: Story ID for the story to operate on. - :param config: (optional) Podcast fetcher configuration object (useful for testing). 
- """ - - if not config: - config = PodcastFetchEpisodeConfig() - - story = db.find_by_id(table='stories', object_id=stories_id) - if not story: - raise McStoryNotFoundException(f"Story {stories_id} was not found.") - - # Try to determine language of the story - story_title = story['title'] - story_description = html_strip(story['description']) - sample_text = f"{story_title}\n{story_description}" - - iso_639_1_language_code = None - if identification_would_be_reliable(text=sample_text): - iso_639_1_language_code = language_code_for_text(text=sample_text) - - if not iso_639_1_language_code: - iso_639_1_language_code = 'en' - - # Convert to BCP 47 identifier - bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier( - iso_639_1_code=iso_639_1_language_code, - url_hint=story['url'], - ) - - # Find the enclosure that might work the best - best_enclosure = podcast_viable_enclosure_for_story(db=db, stories_id=stories_id) - if not best_enclosure: - raise McPodcastNoViableStoryEnclosuresException(f"There were no viable enclosures found for story {stories_id}") - - if best_enclosure.length: - if best_enclosure.length > MAX_ENCLOSURE_SIZE: - raise McPodcastEnclosureTooBigException(f"Chosen enclosure {best_enclosure} is too big.") - - try: - temp_dir = tempfile.mkdtemp('fetch_and_store') - except Exception as ex: - raise McPodcastFileStoreFailureException(f"Unable to create temporary directory: {ex}") - - # Fetch enclosure - input_filename = 'input_file' - input_file_path = os.path.join(temp_dir, input_filename) - log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...") - fetch_big_file(url=best_enclosure.url, dest_file=input_file_path, max_size=MAX_ENCLOSURE_SIZE) - log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}") - - if os.stat(input_file_path).st_size == 0: - # Might happen with misconfigured webservers - raise McPodcastFileFetchFailureException(f"Fetched file {input_file_path} is empty.") - - # Transcode if needed - 
input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir, filename=input_filename) - transcoded_file_obj = transcode_media_file_if_needed(input_media_file=input_file_obj) - - # Unset the variable so that we don't accidentally use it later - del input_filename, temp_dir - - if input_file_obj != transcoded_file_obj: - # Function did some transcoding and stored everything in yet another file - - # Remove the input file - _cleanup_temp_dir(temp=input_file_obj) - - # Consider the transcoded file the new input file - input_file_obj = transcoded_file_obj - - # (Re)read the properties of either the original or the transcoded file - media_info = media_file_info(media_file_path=input_file_obj.temp_full_path) - best_audio_stream = media_info.best_supported_audio_stream() - - # Store input file to GCS - try: - gcs = GCSStore(config=config) - gcs_uri = gcs.store_object( - local_file_path=input_file_obj.temp_full_path, - object_id=str(stories_id), - mime_type=best_audio_stream.audio_codec_class.mime_type(), - ) - - except Exception as ex: - - log.error(f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}") - - # Clean up, then raise further - _cleanup_temp_dir(temp=input_file_obj) - - raise ex - - # Clean up the locally stored file as we don't need it anymore - _cleanup_temp_dir(temp=input_file_obj) - - # Insert everything to the database - try: - db.query(""" - INSERT INTO podcast_episodes ( - stories_id, - story_enclosures_id, - gcs_uri, - duration, - codec, - sample_rate, - bcp47_language_code - ) VALUES ( - %(stories_id)s, - %(story_enclosures_id)s, - %(gcs_uri)s, - %(duration)s, - %(codec)s, - %(sample_rate)s, - %(bcp47_language_code)s - ) ON CONFLICT (stories_id) DO UPDATE SET - story_enclosures_id = %(story_enclosures_id)s, - gcs_uri = %(gcs_uri)s, - duration = %(duration)s, - codec = %(codec)s, - sample_rate = %(sample_rate)s, - bcp47_language_code = %(bcp47_language_code)s - """, { - 'stories_id': stories_id, - 
'story_enclosures_id': best_enclosure.story_enclosures_id, - 'gcs_uri': gcs_uri, - 'duration': best_audio_stream.duration, - 'codec': best_audio_stream.audio_codec_class.postgresql_enum_value(), - 'sample_rate': best_audio_stream.sample_rate, - 'bcp47_language_code': bcp_47_language_code, - }) - - except Exception as ex_db: - - # Try to delete object on GCS first - try: - gcs.delete_object(object_id=str(stories_id)) - except Exception as ex_gcs: - # We should be able to delete it as we've just uploaded it - raise McPodcastGCSStoreFailureException(( - f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; " - f"database insert exception: {ex_db}; " - f"GCS exception: {ex_gcs}") - ) - - raise McPodcastPostgreSQLException(f"Failed inserting episode for story {stories_id}: {ex_db}") diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/gcs_store.py b/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/gcs_store.py deleted file mode 100644 index 579ceb3afb..0000000000 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/gcs_store.py +++ /dev/null @@ -1,179 +0,0 @@ -import os -from typing import Optional - -# noinspection PyPackageRequirements -from google.cloud import storage -# noinspection PyPackageRequirements -from google.cloud.exceptions import NotFound -# noinspection PyPackageRequirements -from google.cloud.storage import Blob, Bucket - -from mediawords.util.log import create_logger - -from podcast_fetch_episode.config import PodcastFetchEpisodeConfig -from podcast_fetch_episode.exceptions import ( - McPodcastGCSStoreFailureException, - McPodcastMisconfiguredGCSException, -) - -log = create_logger(__name__) - - -class GCSStore(object): - """Google Cloud Storage store.""" - - __slots__ = [ - '__bucket_internal', - '__config', - ] - - def __init__(self, config: Optional[PodcastFetchEpisodeConfig] = None): - if not config: - config = PodcastFetchEpisodeConfig() - - self.__config = config - 
self.__bucket_internal = None - - @property - def _bucket(self) -> Bucket: - """Lazy-loaded bucket.""" - if not self.__bucket_internal: - - try: - storage_client = storage.Client.from_service_account_json(self.__config.gc_auth_json_file()) - self.__bucket_internal = storage_client.get_bucket(self.__config.gc_storage_bucket_name()) - except Exception as ex: - raise McPodcastGCSStoreFailureException( - f"Unable to get GCS bucket '{self.__config.gc_storage_bucket_name()}': {ex}" - ) - - return self.__bucket_internal - - @classmethod - def _remote_path(cls, path_prefix: str, object_id: str): - if not object_id: - raise McPodcastMisconfiguredGCSException("Object ID is unset.") - - path = os.path.join(path_prefix, object_id) - - # GCS doesn't like double slashes... - path = os.path.normpath(path) - - # ...nor is a fan of slashes at the start of path - while path.startswith('/'): - path = path[1:] - - return path - - def _blob_from_object_id(self, object_id: str) -> Blob: - if not object_id: - raise McPodcastMisconfiguredGCSException("Object ID is unset.") - - remote_path = self._remote_path(path_prefix=self.__config.gc_storage_path_prefix(), object_id=object_id) - blob = self._bucket.blob(remote_path) - return blob - - def object_exists(self, object_id: str) -> bool: - """ - Test if object exists at remote location. - - :param object_id: Object ID that should be tested. - :return: True if object already exists under a given object ID. 
- """ - - if not object_id: - raise McPodcastMisconfiguredGCSException("Object ID is unset.") - - log.debug(f"Testing if object ID {object_id} exists...") - - blob = self._blob_from_object_id(object_id=object_id) - - log.debug(f"Testing blob for existence: {blob}") - - try: - # blob.reload() returns metadata too - blob.reload() - - except NotFound as ex: - log.debug(f"Object '{object_id}' was not found: {ex}") - exists = False - - except Exception as ex: - raise McPodcastGCSStoreFailureException(f"Unable to test whether GCS object {object_id} exists: {ex}") - - else: - exists = True - - return exists - - def store_object(self, local_file_path: str, object_id: str, mime_type: Optional[str] = None) -> str: - """ - Store a local file to a remote location. - - Will overwrite existing objects with a warning. - - :param local_file_path: Local file that should be stored. - :param object_id: Object ID under which the object should be stored. - :param mime_type: MIME type which, if set, will be stored as "Content-Type". - :return: Full Google Cloud Storage URI of the object, e.g. "gs:////". - """ - - if not os.path.isfile(local_file_path): - raise McPodcastMisconfiguredGCSException(f"Local file '{local_file_path}' does not exist.") - - if not object_id: - raise McPodcastMisconfiguredGCSException("Object ID is unset.") - - log.debug(f"Storing file '{local_file_path}' as object ID {object_id}...") - - if self.object_exists(object_id=object_id): - log.warning(f"Object {object_id} already exists, will overwrite.") - - blob = self._blob_from_object_id(object_id=object_id) - - blob.upload_from_filename(filename=local_file_path, content_type=mime_type) - - return self.object_uri(object_id=object_id) - - def delete_object(self, object_id: str) -> None: - """ - Delete object from remote location. - - Doesn't raise if object doesn't exist. - - :param object_id: Object ID that should be deleted. 
- """ - - if not object_id: - raise McPodcastMisconfiguredGCSException("Object ID is unset.") - - log.debug(f"Deleting object ID {object_id}...") - - blob = self._blob_from_object_id(object_id=object_id) - - try: - blob.delete() - - except NotFound: - log.warning(f"Object {object_id} doesn't exist.") - - except Exception as ex: - raise McPodcastGCSStoreFailureException(f"Unable to delete GCS object {object_id}: {ex}") - - def object_uri(self, object_id: str) -> str: - """ - Generate Google Cloud Storage URI for the object. - - :param object_id: Object ID to return the URI for. - :return: Full Google Cloud Storage URI of the object, e.g. "gs:////". - """ - - if not object_id: - raise McPodcastMisconfiguredGCSException("Object ID is unset.") - - uri = "gs://{host}/{remote_path}".format( - host=self.__config.gc_storage_bucket_name(), - remote_path=self._remote_path(path_prefix=self.__config.gc_storage_path_prefix(), object_id=object_id), - ) - - return uri diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/media_file.py b/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/media_file.py deleted file mode 100644 index debd695878..0000000000 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/media_file.py +++ /dev/null @@ -1,298 +0,0 @@ -import dataclasses -import subprocess -import math -import os -import shutil -import tempfile -from typing import Type, Optional, List - -import ffmpeg - -from mediawords.util.log import create_logger - -from podcast_fetch_episode.audio_codecs import ( - AbstractAudioCodec, - Linear16AudioCodec, - FLACAudioCodec, - MULAWAudioCodec, - OggOpusAudioCodec, - MP3AudioCodec, -) -from podcast_fetch_episode.exceptions import ( - McPodcastMisconfiguredTranscoderException, - McPodcastFileIsInvalidException, - McPodcastFileStoreFailureException, -) - -log = create_logger(__name__) - -_SUPPORTED_CODEC_CLASSES = { - Linear16AudioCodec, - FLACAudioCodec, - MULAWAudioCodec, - OggOpusAudioCodec, - 
MP3AudioCodec, -} -"""Supported native audio codec classes.""" - - -@dataclasses.dataclass -class MediaFileInfoAudioStream(object): - """Information about a single audio stream in a media file.""" - - ffmpeg_stream_index: int - """FFmpeg internal stream index.""" - - audio_codec_class: Optional[Type[AbstractAudioCodec]] - """Audio codec class if the stream is one of the supported types and has single (mono) channel, None otherwise.""" - - duration: int - """Duration (in seconds).""" - - audio_channel_count: int - """Audio channel count.""" - - sample_rate: int - """Audio sample rate.""" - - -@dataclasses.dataclass -class MediaFileInfo(object): - """Information about media file.""" - - audio_streams: List[MediaFileInfoAudioStream] - """List of audio streams found in the media file.""" - - has_video_streams: bool - """True if the media file has video streams.""" - - def best_supported_audio_stream(self) -> Optional[MediaFileInfoAudioStream]: - """Return the first supported audio stream, if any.""" - for stream in self.audio_streams: - if stream.audio_codec_class: - return stream - return None - - -def media_file_info(media_file_path: str) -> MediaFileInfo: - """ - Read audio / video media file information, or raise if it can't be read. - - :param media_file_path: Full path to media file. - :return: MediaFileInfo object. 
- """ - if not os.path.isfile(media_file_path): - # Input file should exist at this point; it it doesn't, we have probably messed up something in the code - raise McPodcastMisconfiguredTranscoderException(f"Input file {media_file_path} does not exist.") - - try: - file_info = ffmpeg.probe(media_file_path) - if not file_info: - raise Exception("Returned metadata is empty.") - except Exception as ex: - raise McPodcastFileIsInvalidException( - f"Unable to read metadata from file {media_file_path}: {ex}" - ) - - if 'streams' not in file_info: - # FFmpeg should come up with some sort of a stream in any case - raise McPodcastMisconfiguredTranscoderException("Returned probe doesn't have 'streams' key.") - - # Test if one of the audio streams is of one of the supported codecs - audio_streams = [] - has_video_streams = False - for stream in file_info['streams']: - if stream['codec_type'] == 'audio': - - try: - audio_channel_count = int(stream['channels']) - if audio_channel_count == 0: - raise Exception("Audio channel count is 0") - except Exception as ex: - log.warning(f"Unable to read audio channel count from stream {stream}: {ex}") - # Just skip this stream if we can't figure it out - continue - - audio_codec_class = None - - # We'll need to transcode audio files with more than one channel count anyway - if audio_channel_count == 1: - for codec in _SUPPORTED_CODEC_CLASSES: - if codec.ffmpeg_stream_is_this_codec(ffmpeg_stream=stream): - audio_codec_class = codec - break - - try: - - if 'duration' in stream: - # 'duration': '3.766621' - duration = math.floor(float(stream['duration'])) - - elif 'DURATION' in stream.get('tags', {}): - # 'DURATION': '00:00:03.824000000' - duration_parts = stream['tags']['DURATION'].split(':') - if len(duration_parts) != 3: - raise McPodcastFileIsInvalidException(f"Unable to parse 'DURATION': {duration_parts}") - - hh = int(duration_parts[0]) - mm = int(duration_parts[1]) - ss_ms = duration_parts[2].split('.') - - if len(ss_ms) == 1: - ss = 
int(ss_ms[0]) - ms = 0 - elif len(ss_ms) == 2: - ss = int(ss_ms[0]) - ms = int(ss_ms[1]) - else: - raise McPodcastFileIsInvalidException(f"Unable to parse 'DURATION': {duration_parts}") - - duration = hh * 3600 + mm * 60 + ss + (1 if ms > 0 else 0) - - else: - raise McPodcastFileIsInvalidException(f"Stream doesn't have duration: {stream}") - - audio_stream = MediaFileInfoAudioStream( - ffmpeg_stream_index=stream['index'], - audio_codec_class=audio_codec_class, - duration=duration, - audio_channel_count=audio_channel_count, - sample_rate=int(stream['sample_rate']), - ) - audio_streams.append(audio_stream) - - except Exception as ex: - # Just skip this stream if we can't figure it out - log.warning(f"Unable to read audio stream data for stream {stream}: {ex}") - - elif stream['codec_type'] == 'video': - has_video_streams = True - - return MediaFileInfo( - audio_streams=audio_streams, - has_video_streams=has_video_streams, - ) - - -@dataclasses.dataclass -class TranscodeTempDirAndFile(object): - """ - Temporary directory and filename for transcoding. - - It is assumed that caller is free to recursively remove 'temp_directory' after making use of the transcoded file. - """ - temp_dir: str - filename: str - - @property - def temp_full_path(self) -> str: - """Return full path to file.""" - return os.path.join(self.temp_dir, self.filename) - - -def transcode_media_file_if_needed(input_media_file: TranscodeTempDirAndFile) -> TranscodeTempDirAndFile: - """ - Transcode file (if needed) to something that Speech API will support. - - * If input has a video stream, it will be discarded; - * If input has more than one audio stream, others will be discarded leaving only one (preferably the one that Speech - API can support); - * If input doesn't have an audio stream in Speech API-supported codec, it will be transcoded to lossless - FLAC 16 bit in order to preserve quality; - * If the chosen audio stream has multiple channels (e.g. 
stereo or 5.1), it will be mixed into a single (mono) - channel as Speech API supports multi-channel recognition only when different voices speak into each of the - channels. - - :param input_media_file: Temporary directory and input media file to consider transcoding. - :return: Either the same 'input_media_file' if file wasn't transcoded, or new TranscodeTempDirAndFile() if it was. - """ - - if not os.path.isdir(input_media_file.temp_dir): - # Directory should exist; if it doesn't, it's a critical problem either in the filesystem or the code - raise McPodcastMisconfiguredTranscoderException(f"Directory '{input_media_file.temp_dir}' does not exist.") - - if not os.path.isfile(input_media_file.temp_full_path): - raise McPodcastMisconfiguredTranscoderException(f"File '{input_media_file}' does not exist.") - - # Independently from what has told us, identify the file type again ourselves - media_info = media_file_info(media_file_path=input_media_file.temp_full_path) - - if not media_info.audio_streams: - raise McPodcastFileIsInvalidException("Downloaded file doesn't appear to have any audio streams.") - - ffmpeg_args = [] - - supported_audio_stream = media_info.best_supported_audio_stream() - if supported_audio_stream: - log.info(f"Found a supported audio stream") - - # Test if there is more than one audio stream - if len(media_info.audio_streams) > 1: - log.info(f"Found other audio streams besides the supported one, will discard those") - - ffmpeg_args.extend(['-f', supported_audio_stream.audio_codec_class.ffmpeg_container_format()]) - - # Select all audio streams - ffmpeg_args.extend(['-map', '0:a']) - - for stream in media_info.audio_streams: - # Deselect the unsupported streams - if stream != supported_audio_stream: - ffmpeg_args.extend(['-map', f'-0:a:{stream.ffmpeg_stream_index}']) - - # If a stream of a supported codec was not found, transcode it to FLAC 16 bit in order to not lose any quality - else: - log.info(f"None of the audio streams are supported by the 
Speech API, will transcode to FLAC") - - # Map first audio stream to input 0 - ffmpeg_args.extend(['-map', '0:a:0']) - - # Transcode to FLAC (16 bit) in order to not lose any quality - ffmpeg_args.extend(['-acodec', 'flac']) - ffmpeg_args.extend(['-f', 'flac']) - ffmpeg_args.extend(['-sample_fmt', 's16']) - - # Ensure that we end up with mono audio - ffmpeg_args.extend(['-ac', '1']) - - # If there's video in the file (e.g. video), remove it - if media_info.has_video_streams: - # Discard all video streams - ffmpeg_args.extend(['-map', '-0:v']) - - if ffmpeg_args: - - temp_filename = 'transcoded_file' - - try: - temp_dir = tempfile.mkdtemp('media_file') - except Exception as ex: - raise McPodcastFileStoreFailureException(f"Unable to create temporary directory: {ex}") - - temp_file_path = os.path.join(temp_dir, temp_filename) - - try: - log.info(f"Transcoding {input_media_file.temp_full_path} to {temp_file_path}...") - - # I wasn't sure how to map outputs in "ffmpeg-python" library so here we call ffmpeg directly - ffmpeg_command = ['ffmpeg', '-nostdin', '-hide_banner', - '-i', input_media_file.temp_full_path] + ffmpeg_args + [temp_file_path] - log.debug(f"FFmpeg command: {ffmpeg_command}") - subprocess.check_call(ffmpeg_command) - - log.info(f"Done transcoding {input_media_file.temp_full_path} to {temp_file_path}") - - except Exception as ex: - - shutil.rmtree(temp_dir) - - raise McPodcastFileIsInvalidException(f"Unable to transcode {input_media_file.temp_full_path}: {ex}") - - result_media_file = TranscodeTempDirAndFile(temp_dir=temp_dir, filename=temp_filename) - - else: - - # Return the same file as it wasn't touched - result_media_file = input_media_file - - return result_media_file diff --git a/apps/podcast-fetch-episode/src/requirements.txt b/apps/podcast-fetch-episode/src/requirements.txt deleted file mode 100644 index 061d400634..0000000000 --- a/apps/podcast-fetch-episode/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -ffmpeg-python==0.2.0 
-google-cloud-storage==1.35.0 diff --git a/apps/podcast-fetch-episode/tests/python/config_random_gcs_prefix.py b/apps/podcast-fetch-episode/tests/python/config_random_gcs_prefix.py deleted file mode 100644 index f01de8e910..0000000000 --- a/apps/podcast-fetch-episode/tests/python/config_random_gcs_prefix.py +++ /dev/null @@ -1,16 +0,0 @@ -import datetime - -from podcast_fetch_episode.config import PodcastFetchEpisodeConfig - - -class RandomPathPrefixConfig(PodcastFetchEpisodeConfig): - """Configuration which stores GCS objects under a timestamped prefix.""" - _RANDOM_PREFIX = None - - @staticmethod - def gc_storage_path_prefix() -> str: - if not RandomPathPrefixConfig._RANDOM_PREFIX: - date = datetime.datetime.utcnow().isoformat() - date = date.replace(':', '_') - RandomPathPrefixConfig._RANDOM_PREFIX = f'tests-{date}' - return RandomPathPrefixConfig._RANDOM_PREFIX diff --git a/apps/podcast-fetch-episode/tests/python/test_fetch_and_store.py b/apps/podcast-fetch-episode/tests/python/test_fetch_and_store.py deleted file mode 100644 index 326bdd4d1c..0000000000 --- a/apps/podcast-fetch-episode/tests/python/test_fetch_and_store.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -from typing import Union - -from mediawords.db import connect_to_db -from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story -from mediawords.test.hash_server import HashServer -from mediawords.util.network import random_unused_port - -from podcast_fetch_episode.fetch_and_store import fetch_and_store_episode -from podcast_fetch_episode.gcs_store import GCSStore - -from .config_random_gcs_prefix import RandomPathPrefixConfig - -TEST_MP3_PATH = '/opt/mediacloud/tests/data/media-samples/samples/kim_kardashian-mp3-mono.mp3' -assert os.path.isfile(TEST_MP3_PATH), f"Test MP3 file '{TEST_MP3_PATH}' should exist." 
- - -def test_fetch_and_store_episode(): - db = connect_to_db() - - test_medium = create_test_medium(db=db, label='test') - test_feed = create_test_feed(db=db, label='test', medium=test_medium) - - # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be - # used to guess the probable language of the podcast episode - test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed) - - stories_id = test_story['stories_id'] - - with open(TEST_MP3_PATH, mode='rb') as f: - test_mp3_data = f.read() - - # noinspection PyUnusedLocal - def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]: - response = "".encode('utf-8') - response += "HTTP/1.0 200 OK\r\n".encode('utf-8') - response += "Content-Type: audio/mpeg\r\n".encode('utf-8') - response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8') - response += "\r\n".encode('utf-8') - response += test_mp3_data - return response - - port = random_unused_port() - pages = { - '/test.mp3': { - 'callback': __mp3_callback, - } - } - - hs = HashServer(port=port, pages=pages) - hs.start() - - mp3_url = f'http://127.0.0.1:{port}/test.mp3' - - story_enclosure = db.insert(table='story_enclosures', insert_hash={ - 'stories_id': stories_id, - 'url': mp3_url, - 'mime_type': 'audio/mpeg', - 'length': len(test_mp3_data), - }) - - conf = RandomPathPrefixConfig() - fetch_and_store_episode(db=db, stories_id=stories_id, config=conf) - - episodes = db.select(table='podcast_episodes', what_to_select='*').hashes() - assert len(episodes), f"Only one episode is expected." 
- - episode = episodes[0] - assert episode['stories_id'] == stories_id - assert episode['story_enclosures_id'] == story_enclosure['story_enclosures_id'] - assert episode['gcs_uri'] == f"gs://{conf.gc_storage_bucket_name()}/{conf.gc_storage_path_prefix()}/{stories_id}" - assert episode['duration'] > 0 - assert episode['codec'] == 'MP3' - assert episode['sample_rate'] == 44100 - assert episode['bcp47_language_code'] == 'en-US' - - # Try removing test object - gcs = GCSStore(config=conf) - gcs.delete_object(object_id=str(stories_id)) diff --git a/apps/podcast-fetch-transcript/.idea/externalDependencies.xml b/apps/podcast-fetch-transcript/.idea/externalDependencies.xml deleted file mode 100644 index 7872ffbcf2..0000000000 --- a/apps/podcast-fetch-transcript/.idea/externalDependencies.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/misc.xml b/apps/podcast-fetch-transcript/.idea/misc.xml deleted file mode 100644 index b31733e855..0000000000 --- a/apps/podcast-fetch-transcript/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/modules.xml b/apps/podcast-fetch-transcript/.idea/modules.xml deleted file mode 100644 index 4ff9c4812f..0000000000 --- a/apps/podcast-fetch-transcript/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/runConfigurations/Dockerfile.xml b/apps/podcast-fetch-transcript/.idea/runConfigurations/Dockerfile.xml deleted file mode 100644 index 3f86f834cf..0000000000 --- a/apps/podcast-fetch-transcript/.idea/runConfigurations/Dockerfile.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/sqldialects.xml b/apps/podcast-fetch-transcript/.idea/sqldialects.xml deleted file mode 100644 index 790b3f37f8..0000000000 --- 
a/apps/podcast-fetch-transcript/.idea/sqldialects.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/vcs.xml b/apps/podcast-fetch-transcript/.idea/vcs.xml deleted file mode 100644 index a4647a1c0e..0000000000 --- a/apps/podcast-fetch-transcript/.idea/vcs.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/Dockerfile b/apps/podcast-fetch-transcript/Dockerfile deleted file mode 100644 index 0a7acb7f8f..0000000000 --- a/apps/podcast-fetch-transcript/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# -# Collect due transcripts from Google Speech API, store them locally as both raw JSON and download text -# - -FROM gcr.io/mcback/common:latest - -# Install Python dependencies -COPY src/requirements.txt /var/tmp/ -RUN \ - cd /var/tmp/ && \ - pip3 install -r requirements.txt && \ - rm requirements.txt && \ - rm -rf /root/.cache/ && \ - true - -# Copy sources -COPY src/ /opt/mediacloud/src/podcast-fetch-transcript/ -ENV PERL5LIB="/opt/mediacloud/src/podcast-fetch-transcript/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-fetch-transcript/python:${PYTHONPATH}" - -# Copy worker script -COPY bin /opt/mediacloud/bin - -USER mediacloud - -CMD ["podcast_fetch_transcript_worker.py"] diff --git a/apps/podcast-fetch-transcript/bin/podcast_fetch_transcript_worker.py b/apps/podcast-fetch-transcript/bin/podcast_fetch_transcript_worker.py deleted file mode 100755 index ae25385834..0000000000 --- a/apps/podcast-fetch-transcript/bin/podcast_fetch_transcript_worker.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 - -from mediawords.db import connect_to_db -from mediawords.job import JobBroker -from mediawords.util.log import create_logger -from mediawords.util.perl import decode_object_from_bytes_if_needed -from mediawords.util.process import fatal_error - -from podcast_fetch_transcript.exceptions import 
McPodcastFetchTranscriptSoftException - -from podcast_fetch_transcript.fetch_store import fetch_store_transcript - -log = create_logger(__name__) - - -def run_podcast_fetch_transcript(podcast_episode_transcript_fetches_id: int) -> None: - """Fetch a completed episode transcripts from Speech API for story.""" - - if isinstance(podcast_episode_transcript_fetches_id, bytes): - podcast_episode_transcript_fetches_id = decode_object_from_bytes_if_needed( - podcast_episode_transcript_fetches_id) - podcast_episode_transcript_fetches_id = int(podcast_episode_transcript_fetches_id) - - if not podcast_episode_transcript_fetches_id: - fatal_error("'podcast_episode_transcript_fetches_id' is unset.") - - db = connect_to_db() - - log.info(f"Fetching transcript for fetch ID {podcast_episode_transcript_fetches_id}...") - - try: - stories_id = fetch_store_transcript( - db=db, - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id, - ) - - if stories_id: - JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id) - - except McPodcastFetchTranscriptSoftException as ex: - # Soft exceptions - log.error(f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}") - raise ex - - except Exception as ex: - # Hard and other exceptions - fatal_error(( - f"Fatal / unknown error while fetching transcript " - f"for ID {podcast_episode_transcript_fetches_id}: {ex}" - )) - - log.info(f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}") - - -if __name__ == '__main__': - app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript') - app.start_worker(handler=run_podcast_fetch_transcript) diff --git a/apps/podcast-fetch-transcript/docker-compose.tests.yml b/apps/podcast-fetch-transcript/docker-compose.tests.yml deleted file mode 100644 index ea93f92b0a..0000000000 --- a/apps/podcast-fetch-transcript/docker-compose.tests.yml +++ /dev/null @@ -1,117 +0,0 @@ -version: "3.7" - 
-services: - - podcast-fetch-transcript: - image: gcr.io/mcback/podcast-fetch-transcript:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST: "${MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST}" - expose: - # "test_full_chain.py" test server's port - - 8080 - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-fetch-transcript/ - - type: bind - source: ./tests/ - target: /opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - podcast-fetch-episode - - podcast-submit-operation - # No "podcast-poll-due-operations" as we'll just go ahead and fetch it ourselves - - postgresql-pgbouncer - - rabbitmq-server - - podcast-fetch-episode: - image: gcr.io/mcback/podcast-fetch-episode:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: "${MC_PODCAST_FETCH_EPISODE_BUCKET_NAME}" - MC_PODCAST_FETCH_EPISODE_PATH_PREFIX: "audio-files/" - volumes: - - type: bind - source: ./../podcast-fetch-episode/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../podcast-fetch-episode/src/ - target: /opt/mediacloud/src/podcast-fetch-episode/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - - rabbitmq-server - - podcast-submit-operation: - image: gcr.io/mcback/podcast-submit-operation:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - volumes: - - type: bind - source: ./../podcast-submit-operation/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../podcast-submit-operation/src/ - target: /opt/mediacloud/src/podcast-submit-operation/ - - 
type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - - rabbitmq-server - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ - - rabbitmq-server: - image: gcr.io/mcback/rabbitmq-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5672 - - 15672 - volumes: - - type: bind - source: ./../rabbitmq-server/conf/ - target: /etc/rabbitmq/ diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/config.py b/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/config.py deleted file mode 100644 index 782ed619c3..0000000000 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/config.py +++ /dev/null @@ -1,12 +0,0 @@ -from mediawords.util.config import file_with_env_value - - -class PodcastFetchTranscriptConfig(object): - """ - Podcast transcript fetcher configuration. 
- """ - - @staticmethod - def gc_auth_json_file() -> str: - """Return path to Google Cloud authentication JSON file.""" - return file_with_env_value(name='MC_PODCAST_GC_AUTH_JSON_BASE64', encoded_with_base64=True) diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/exceptions.py b/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/exceptions.py deleted file mode 100644 index 0b64b540b6..0000000000 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/exceptions.py +++ /dev/null @@ -1,52 +0,0 @@ -import abc - - -class _AbstractMcPodcastFetchTranscriptException(Exception, metaclass=abc.ABCMeta): - """Abstract exception.""" - pass - - -# --- - - -class McPodcastFetchTranscriptSoftException(_AbstractMcPodcastFetchTranscriptException): - """Soft errors exception.""" - pass - - -class McOperationNotFoundException(McPodcastFetchTranscriptSoftException): - """Exception thrown when a transcription operation was not found for a particular operation ID.""" - # Not a "hard" failure as sometimes these operations expire - pass - - -# --- - -class McPodcastFetchTranscriptHardException(_AbstractMcPodcastFetchTranscriptException): - """Hard errors exception.""" - pass - - -class McDatabaseNotFoundException(McPodcastFetchTranscriptHardException): - """Exception thrown when we can't find something in the database that we've expected to find.""" - pass - - -class McDatabaseErrorException(McPodcastFetchTranscriptHardException): - """Exception thrown when a database raises an error.""" - pass - - -class McMisconfiguredSpeechAPIException(McPodcastFetchTranscriptHardException): - """Exception thrown when we receive something we didn't expect from Speech API.""" - pass - - -class McTranscriptionReturnedErrorException(McPodcastFetchTranscriptHardException): - """ - Exception thrown when Speech API explicitly returns an error state. 
- - When Speech API returns with an error, it's unclear whether it was us who have messed up or - something is (temporarily) wrong on their end, so on the safe side we throw a "hard" exception. - """ - pass diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/fetch_store.py b/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/fetch_store.py deleted file mode 100644 index 72db28a80b..0000000000 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/fetch_store.py +++ /dev/null @@ -1,118 +0,0 @@ -from typing import Optional - -from mediawords.db import DatabaseHandler -from mediawords.util.log import create_logger - -from podcast_fetch_transcript.exceptions import ( - McDatabaseErrorException, - McDatabaseNotFoundException, -) -from podcast_fetch_transcript.handler import AbstractHandler, DefaultHandler - -log = create_logger(__name__) - -NOT_READY_RETRY_INTERVAL = 60 * 10 -"""If the transcript is not ready yet, how many seconds to wait until retrying the fetch.""" - - -def fetch_store_transcript( - db: DatabaseHandler, - podcast_episode_transcript_fetches_id: int, - handler: Optional[AbstractHandler] = None, -) -> Optional[int]: - """ - Try fetching and storing the transcript and update "podcast_episode_transcript_fetches" depending on how well it - went. - - :param db: Database handler. - :param podcast_episode_transcript_fetches_id: Transcript fetch ID. - :param handler: Object of a AbstractHandler subclass which implements fetching and storing (useful for testing). - :return: Story ID if transcript was fetched and stored, None otherwise. 
- """ - - if not handler: - handler = DefaultHandler() - - transcript_fetch = db.query(""" - UPDATE podcast_episode_transcript_fetches - SET fetched_at = NOW() - WHERE podcast_episode_transcript_fetches_id = %(podcast_episode_transcript_fetches_id)s - RETURNING * - """, { - 'podcast_episode_transcript_fetches_id': podcast_episode_transcript_fetches_id, - }).hash() - if not transcript_fetch: - raise McDatabaseNotFoundException( - f"Transcript fetch for ID {podcast_episode_transcript_fetches_id} was not found." - ) - - try: - - transcript = handler.fetch_transcript( - db=db, - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id, - ) - - if transcript: - log.info(f"Transcript fetched, storing...") - - handler.store_transcript(db=db, transcript=transcript) - - db.query(""" - UPDATE podcast_episode_transcript_fetches - SET result = 'success' - WHERE podcast_episode_transcript_fetches_id = %(podcast_episode_transcript_fetches_id)s - """, { - 'podcast_episode_transcript_fetches_id': podcast_episode_transcript_fetches_id, - }) - - else: - log.info(f"Transcript is not done yet, will retry in {NOT_READY_RETRY_INTERVAL} seconds...") - - db.query(""" - INSERT INTO podcast_episode_transcript_fetches ( - podcast_episodes_id, - add_to_queue_at - ) VALUES ( - %(podcast_episodes_id)s, - NOW() + INTERVAL %(add_to_queue_interval)s - ) - """, { - 'podcast_episodes_id': transcript_fetch['podcast_episodes_id'], - 'add_to_queue_interval': f"{NOT_READY_RETRY_INTERVAL} seconds", - }) - - db.query(""" - UPDATE podcast_episode_transcript_fetches - SET result = 'in_progress' - WHERE podcast_episode_transcript_fetches_id = %(podcast_episode_transcript_fetches_id)s - """, { - 'podcast_episode_transcript_fetches_id': podcast_episode_transcript_fetches_id, - }) - - except Exception as ex: - - # Try logging exception to the database - try: - db.query(""" - UPDATE podcast_episode_transcript_fetches - SET - result = 'error', - error_message = %(error_message)s - WHERE 
podcast_episode_transcript_fetches_id = %(podcast_episode_transcript_fetches_id)s - """, { - 'podcast_episode_transcript_fetches_id': podcast_episode_transcript_fetches_id, - 'error_message': str(ex), - }) - except Exception as ex2: - raise McDatabaseErrorException(( - f"Error while executing transcript fetch for ID {podcast_episode_transcript_fetches_id}: {ex}; " - f"further, I wasn't able to log it to database because: {ex2}" - )) - - raise ex - - if transcript: - return transcript.stories_id - else: - return None diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/handler.py b/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/handler.py deleted file mode 100644 index 8951f7c9fe..0000000000 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/handler.py +++ /dev/null @@ -1,203 +0,0 @@ -import abc -from typing import Optional - -# noinspection PyPackageRequirements -from google.api_core.exceptions import InvalidArgument, NotFound, GoogleAPICallError -# noinspection PyPackageRequirements -from google.api_core.operation import from_gapic, Operation -# noinspection PyPackageRequirements -from google.api_core.operations_v1 import OperationsClient -# noinspection PyPackageRequirements -from google.cloud.speech_v1p1beta1 import SpeechClient, LongRunningRecognizeResponse, LongRunningRecognizeMetadata - -from mediawords.db import DatabaseHandler -from mediawords.dbi.downloads import create_download_for_new_story -from mediawords.dbi.downloads.store import store_content -from mediawords.util.log import create_logger - -from podcast_fetch_transcript.config import PodcastFetchTranscriptConfig -from podcast_fetch_transcript.exceptions import ( - McDatabaseNotFoundException, - McMisconfiguredSpeechAPIException, - McOperationNotFoundException, - McTranscriptionReturnedErrorException, -) -from podcast_fetch_transcript.transcript import UtteranceAlternative, Utterance, Transcript - -log = create_logger(__name__) - - 
-class AbstractHandler(object, metaclass=abc.ABCMeta): - """ - Abstract class that fetches and stores a transcript. - - Useful for testing as we can create a mock class which pretends to do it. - """ - - @classmethod - @abc.abstractmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - """ - Attempt fetching a Speech API transcript for a given operation ID. - - :param db: Database handler. - :param podcast_episode_transcript_fetches_id: Transcript fetch attempt ID. - :return: None if transcript is not finished yet, a Transcript object otherwise. - """ - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - """ - Store transcript to raw download store. - - We could write this directly to "download_texts", but if we decide to reextract everything (after, say, updating - an extractor), that "download_texts" row might disappear, so it's safer to just store a raw download on the - key-value store as if it was a HTML file or something. - - :param db: Database handler. - :param transcript: Transcript object. - :return: Download ID for a download that was created. 
- """ - raise NotImplemented("Abstract method") - - -class DefaultHandler(AbstractHandler): - - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - transcript_fetch = db.find_by_id( - table='podcast_episode_transcript_fetches', - object_id=podcast_episode_transcript_fetches_id, - ) - if not transcript_fetch: - raise McDatabaseNotFoundException( - f"Unable to find transcript fetch with ID {podcast_episode_transcript_fetches_id}" - ) - podcast_episodes_id = transcript_fetch['podcast_episodes_id'] - - episode = db.find_by_id(table='podcast_episodes', object_id=podcast_episodes_id) - if not episode: - raise McDatabaseNotFoundException( - f"Unable to find podcast episode with ID {podcast_episodes_id}" - ) - - stories_id = episode['stories_id'] - speech_operation_id = episode['speech_operation_id'] - - if not speech_operation_id: - raise McMisconfiguredSpeechAPIException(f"Speech ID for podcast episode {podcast_episodes_id} is unset.") - - try: - config = PodcastFetchTranscriptConfig() - client = SpeechClient.from_service_account_json(config.gc_auth_json_file()) - operations_client = OperationsClient(channel=client._transport._grpc_channel) - except Exception as ex: - raise McMisconfiguredSpeechAPIException(f"Unable to initialize Speech API operations client: {ex}") - - try: - operation = operations_client.get_operation(name=speech_operation_id) - except InvalidArgument as ex: - raise McMisconfiguredSpeechAPIException(f"Invalid operation ID '{speech_operation_id}': {ex}") - except NotFound as ex: - raise McOperationNotFoundException(f"Operation ID '{speech_operation_id}' was not found: {ex}") - except Exception as ex: - # On any other errors, raise a hard exception - raise McMisconfiguredSpeechAPIException(f"Error while fetching operation ID '{speech_operation_id}': {ex}") - - if not operation: - raise McMisconfiguredSpeechAPIException(f"Operation is unset.") - - try: - gapic_operation: 
Operation = from_gapic( - operation, - operations_client, - LongRunningRecognizeResponse, - metadata_type=LongRunningRecognizeMetadata, - ) - except Exception as ex: - raise McMisconfiguredSpeechAPIException(f"Unable to create GAPIC operation: {ex}") - - log.debug(f"GAPIC operation: {gapic_operation}") - log.debug(f"Operation metadata: {gapic_operation.metadata}") - log.debug(f"Operation is done: {gapic_operation.done()}") - log.debug(f"Operation error: {gapic_operation.done()}") - - try: - operation_is_done = gapic_operation.done() - except Exception as ex: - # 'done' attribute might be gone in a newer version of the Speech API client - raise McMisconfiguredSpeechAPIException( - f"Unable to test whether operation '{speech_operation_id}' is done: {ex}" - ) - - if not operation_is_done: - log.info(f"Operation '{speech_operation_id}' is still not done.") - return None - - utterances = [] - - try: - for result in gapic_operation.result().results: - - alternatives = [] - for alternative in result.alternatives: - alternatives.append( - UtteranceAlternative( - text=alternative.transcript.strip(), - confidence=alternative.confidence, - ) - ) - - utterances.append( - Utterance( - alternatives=alternatives, - bcp47_language_code=result.language_code, - ) - ) - - except GoogleAPICallError as ex: - raise McTranscriptionReturnedErrorException( - f"Unable to read transcript for operation '{speech_operation_id}': {ex}" - ) - - except Exception as ex: - raise McMisconfiguredSpeechAPIException( - f"Unable to read transcript for operation '{speech_operation_id}': {ex}" - ) - - return Transcript(stories_id=stories_id, utterances=utterances) - - @classmethod - def _download_text_from_transcript(cls, transcript: Transcript) -> str: - best_utterance_alternatives = [] - for utterance in transcript.utterances: - best_utterance_alternatives.append(utterance.best_alternative.text) - text = "\n\n".join(best_utterance_alternatives) - return text - - @classmethod - def store_transcript(cls, 
db: DatabaseHandler, transcript: Transcript) -> int: - story = db.find_by_id(table='stories', object_id=transcript.stories_id) - - feed = db.query(""" - SELECT * - FROM feeds - WHERE feeds_id = ( - SELECT feeds_id - FROM feeds_stories_map - WHERE stories_id = %(stories_id)s - ) - """, { - 'stories_id': transcript.stories_id, - }).hash() - - download = create_download_for_new_story(db=db, story=story, feed=feed) - - text = cls._download_text_from_transcript(transcript=transcript) - - # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later - store_content(db=db, download=download, content=text) - - return download['downloads_id'] diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/transcript.py b/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/transcript.py deleted file mode 100644 index edfbd257a4..0000000000 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/transcript.py +++ /dev/null @@ -1,40 +0,0 @@ -import dataclasses -from typing import List - - -@dataclasses.dataclass -class UtteranceAlternative(object): - """One of the alternatives of what might have been said in an utterance.""" - - text: str - """Utterance text.""" - - confidence: float - """How confident Speech API is that it got it right.""" - - -@dataclasses.dataclass -class Utterance(object): - """A single transcribed utterance (often but not always a single sentence).""" - - alternatives: List[UtteranceAlternative] - """Alternatives of what might have been said in an utterance, ordered from the best to the worst guess.""" - - bcp47_language_code: str - """BCP 47 language code; might be different from what we've passed as the input.""" - - @property - def best_alternative(self) -> UtteranceAlternative: - """Return best alternative for what might have been said in an utterance.""" - return self.alternatives[0] - - -@dataclasses.dataclass -class Transcript(object): - """A single transcript.""" - - 
stories_id: int - """Story ID.""" - - utterances: List[Utterance] - """List of ordered utterances in a transcript.""" diff --git a/apps/podcast-fetch-transcript/src/requirements.txt b/apps/podcast-fetch-transcript/src/requirements.txt deleted file mode 100644 index 59e80a7b73..0000000000 --- a/apps/podcast-fetch-transcript/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -google-cloud-speech==2.0.1 diff --git a/apps/podcast-fetch-transcript/tests/data/media-samples b/apps/podcast-fetch-transcript/tests/data/media-samples deleted file mode 160000 index 45b179fd86..0000000000 --- a/apps/podcast-fetch-transcript/tests/data/media-samples +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 45b179fd867b6031c803cdbb7eddafa7e204d5bd diff --git a/apps/podcast-fetch-transcript/tests/python/setup_fetch.py b/apps/podcast-fetch-transcript/tests/python/setup_fetch.py deleted file mode 100644 index 9569d89587..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/setup_fetch.py +++ /dev/null @@ -1,182 +0,0 @@ -import abc -import os -import random -import socket -import time -from typing import Union -from unittest import TestCase - -from mediawords.db import connect_to_db -from mediawords.job import JobBroker -from mediawords.test.db.create import create_test_medium, create_test_feed -from mediawords.test.hash_server import HashServer -from mediawords.util.log import create_logger - -log = create_logger(__name__) - - -class AbstractFetchTranscriptTestCase(TestCase, metaclass=abc.ABCMeta): - __slots__ = [ - 'db', - 'hs', - 'stories_id', - 'transcript_fetches', - ] - - @classmethod - @abc.abstractmethod - def input_media_path(cls) -> str: - """Return full path to input media file.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def input_media_mime_type(cls) -> str: - """Return input media file's MIME type.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def story_title_description(cls) -> str: - """Return 
a string to store as both story title and description.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def retries_per_step(cls) -> int: - """How many retries to do per each local step.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def seconds_between_retries(cls) -> float: - """How many seconds to wait between retries.""" - raise NotImplemented("Abstract method") - - def setUp(self) -> None: - super().setUp() - - self.db = connect_to_db() - - test_medium = create_test_medium(db=self.db, label='test') - test_feed = create_test_feed(db=self.db, label='test', medium=test_medium) - - # Add a story with a random ID to decrease the chance that object in GCS will collide with another test running - # at the same time - self.stories_id = random.randint(1, 1000000) - - self.db.query(""" - INSERT INTO stories ( - stories_id, - media_id, - url, - guid, - title, - description, - publish_date, - collect_date, - full_text_rss - ) VALUES ( - %(stories_id)s, - %(media_id)s, - 'http://story.test/', - 'guid://story.test/', - 'story', - 'description', - '2016-10-15 08:00:00', - '2016-10-15 10:00:00', - true - ) - """, { - 'stories_id': self.stories_id, - 'media_id': test_feed['media_id'], - }) - - # Create missing partitions for "feeds_stories_map" - self.db.query('SELECT create_missing_partitions()') - - self.db.create( - table='feeds_stories_map', - insert_hash={ - 'feeds_id': int(test_feed['feeds_id']), - 'stories_id': self.stories_id, - } - ) - - assert os.path.isfile(self.input_media_path()), f"Test media file '{self.input_media_path()}' should exist." 
- - with open(self.input_media_path(), mode='rb') as f: - test_data = f.read() - - # noinspection PyUnusedLocal - def __media_callback(request: HashServer.Request) -> Union[str, bytes]: - response = "".encode('utf-8') - response += "HTTP/1.0 200 OK\r\n".encode('utf-8') - response += f"Content-Type: {self.input_media_mime_type()}\r\n".encode('utf-8') - response += f"Content-Length: {len(test_data)}\r\n".encode('utf-8') - response += "\r\n".encode('utf-8') - response += test_data - return response - - port = 8080 # Port exposed on docker-compose.tests.yml - media_path = '/test_media_file' - pages = { - media_path: { - 'callback': __media_callback, - } - } - - self.hs = HashServer(port=port, pages=pages) - self.hs.start() - - # Using our hostname as it will be another container that will be connecting to us - media_url = f'http://{socket.gethostname()}:{port}{media_path}' - - self.db.insert(table='story_enclosures', insert_hash={ - 'stories_id': self.stories_id, - 'url': media_url, - 'mime_type': self.input_media_mime_type(), - 'length': len(test_data), - }) - - # Add a "podcast-fetch-episode" job - JobBroker(queue_name='MediaWords::Job::Podcast::FetchEpisode').add_to_queue(stories_id=self.stories_id) - - total_time = int(self.retries_per_step() * self.seconds_between_retries()) - - # Wait for "podcast-fetch-episode" to transcode, upload to Google Storage, and write it to "podcast_episodes" - episodes = None - for x in range(1, self.retries_per_step() + 1): - log.info(f"Waiting for episode to appear (#{x})...") - - episodes = self.db.select(table='podcast_episodes', what_to_select='*').hashes() - if episodes: - log.info(f"Episode is here!") - break - - time.sleep(self.seconds_between_retries()) - - assert episodes, f"Episode didn't show up in {total_time} seconds." 
- - # Wait for "podcast-submit-operation" to submit Speech API operation - self.transcript_fetches = None - for x in range(1, self.retries_per_step() + 1): - log.info(f"Waiting for transcript fetch to appear (#{x})...") - - self.transcript_fetches = self.db.select( - table='podcast_episode_transcript_fetches', - what_to_select='*' - ).hashes() - - if self.transcript_fetches: - log.info(f"Transcript fetch is here!") - break - - time.sleep(self.seconds_between_retries()) - - assert self.transcript_fetches, f"Operation didn't show up in {total_time} seconds." - - def tearDown(self) -> None: - super().tearDown() - - self.hs.stop() diff --git a/apps/podcast-fetch-transcript/tests/python/setup_mock_fetch_store.py b/apps/podcast-fetch-transcript/tests/python/setup_mock_fetch_store.py deleted file mode 100644 index bf8065f670..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/setup_mock_fetch_store.py +++ /dev/null @@ -1,57 +0,0 @@ -import abc -from unittest import TestCase - -from mediawords.db import connect_to_db -from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story -from mediawords.util.log import create_logger - -log = create_logger(__name__) - - -class AbstractMockFetchStoreTestCase(TestCase, metaclass=abc.ABCMeta): - MOCK_SPEECH_OPERATION_ID = 'foo' - - __slots__ = [ - 'db', - 'enclosure', - 'episode', - 'transcript_fetch', - 'podcast_episode_transcript_fetches_id', - ] - - def setUp(self) -> None: - super().setUp() - - self.db = connect_to_db() - - test_medium = create_test_medium(db=self.db, label='test') - test_feed = create_test_feed(db=self.db, label='test', medium=test_medium) - test_story = create_test_story(db=self.db, feed=test_feed, label='test') - - self.enclosure = self.db.insert(table='story_enclosures', insert_hash={ - 'stories_id': test_story['stories_id'], - 'url': 'foo', - 'mime_type': 'foo', - 'length': 3, - }) - - self.episode = self.db.insert(table='podcast_episodes', insert_hash={ - 
'stories_id': test_story['stories_id'], - 'story_enclosures_id': self.enclosure['story_enclosures_id'], - 'gcs_uri': 'gs://test', - 'duration': 3, - 'codec': 'FLAC', - 'sample_rate': 44100, - 'bcp47_language_code': 'en-US', - 'speech_operation_id': self.MOCK_SPEECH_OPERATION_ID, - }) - - self.transcript_fetch = self.db.query(""" - INSERT INTO podcast_episode_transcript_fetches (podcast_episodes_id, add_to_queue_at) - VALUES (%(podcast_episodes_id)s, NOW()) - RETURNING * - """, { - 'podcast_episodes_id': self.episode['podcast_episodes_id'], - }).hash() - - self.podcast_episode_transcript_fetches_id = self.transcript_fetch['podcast_episode_transcript_fetches_id'] diff --git a/apps/podcast-fetch-transcript/tests/python/test_fetch_long_audio.py b/apps/podcast-fetch-transcript/tests/python/test_fetch_long_audio.py deleted file mode 100644 index ef246d5bf8..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_fetch_long_audio.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import time - -import pytest - -from mediawords.util.log import create_logger - -from podcast_fetch_transcript.handler import DefaultHandler - -from .setup_fetch import AbstractFetchTranscriptTestCase - -log = create_logger(__name__) - - -@pytest.mark.skipif('MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST' not in os.environ, - reason="Costly; each run costs about 60 / 4 * 0.009 = $0.04") -class LongAudioTestCase(AbstractFetchTranscriptTestCase): - """Test the full chain against a long audio file to try out whether podcast-fetch-transcript manages to back off.""" - - @classmethod - def input_media_path(cls) -> str: - return '/opt/mediacloud/tests/data/media-samples/samples/nixon_speech-vorbis-1m.ogg' - - @classmethod - def input_media_mime_type(cls) -> str: - return 'audio/ogg' - - @classmethod - def story_title_description(cls) -> str: - return 'Resignation speech of United States President Richard Nixon' - - @classmethod - def retries_per_step(cls) -> int: - # Try more often and wait for longer 
as this is a bigger file - return 60 - - @classmethod - def seconds_between_retries(cls) -> float: - return 1.0 - - def test_long_audio(self): - transcript = None - - handler = DefaultHandler() - - # Input audio file is 1m0s, so wait for at least two minutes - for x in range(1, 12 + 1): - log.info(f"Waiting for transcript to be finished (#{x})...") - - podcast_episode_transcript_fetches_id = self.transcript_fetches[0]['podcast_episode_transcript_fetches_id'] - transcript = handler.fetch_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id - ) - - if transcript: - log.info("Transcript is here!") - break - - time.sleep(5) - - print(transcript) - - assert transcript - assert transcript.stories_id - assert len(transcript.utterances) > 0 - assert len(transcript.utterances[0].alternatives) > 0 - assert 'evening' in transcript.utterances[0].alternatives[0].text.lower() diff --git a/apps/podcast-fetch-transcript/tests/python/test_fetch_store_full_chain.py b/apps/podcast-fetch-transcript/tests/python/test_fetch_store_full_chain.py deleted file mode 100644 index 1e6933a36d..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_fetch_store_full_chain.py +++ /dev/null @@ -1,70 +0,0 @@ -import time - -from mediawords.dbi.downloads.store import fetch_content -from mediawords.util.log import create_logger -from podcast_fetch_transcript.handler import DefaultHandler - -from .setup_fetch import AbstractFetchTranscriptTestCase - -log = create_logger(__name__) - - -class FullChainTestCase(AbstractFetchTranscriptTestCase): - """Test the full chain against a small audio file.""" - - @classmethod - def input_media_path(cls) -> str: - # Run the test with AAC file to test out both transcoding to FLAC and whether Speech API can transcribe audio - # files after lossy -> lossless transcoding - return '/opt/mediacloud/tests/data/media-samples/samples/kim_kardashian-aac.m4a' - - @classmethod - def input_media_mime_type(cls) -> str: 
- return 'audio/mp4' - - @classmethod - def story_title_description(cls) -> str: - # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be - # used to guess the probable language of the podcast episode - return 'keeping up with Kardashians' - - @classmethod - def retries_per_step(cls) -> int: - return 120 - - @classmethod - def seconds_between_retries(cls) -> float: - return 0.5 - - def test_full_chain(self): - transcript = None - - handler = DefaultHandler() - - for x in range(1, 60 + 1): - log.info(f"Waiting for transcript to be finished (#{x})...") - - podcast_episode_transcript_fetches_id = self.transcript_fetches[0]['podcast_episode_transcript_fetches_id'] - transcript = handler.fetch_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id - ) - if transcript: - log.info("Transcript is here!") - break - - time.sleep(2) - - assert transcript - assert transcript.stories_id - assert len(transcript.utterances) == 1 - assert len(transcript.utterances[0].alternatives) == 1 - assert 'kim kardashian' in transcript.utterances[0].alternatives[0].text.lower() - - downloads_id = handler.store_transcript(db=self.db, transcript=transcript) - - download = self.db.find_by_id(table='downloads', object_id=downloads_id) - - raw_download = fetch_content(db=self.db, download=download) - assert raw_download - assert 'kim kardashian' in raw_download.lower() diff --git a/apps/podcast-fetch-transcript/tests/python/test_mock_error.py b/apps/podcast-fetch-transcript/tests/python/test_mock_error.py deleted file mode 100644 index 31033cbf86..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_mock_error.py +++ /dev/null @@ -1,45 +0,0 @@ -from typing import Optional - -import pytest - -from mediawords.db import DatabaseHandler - -from podcast_fetch_transcript.exceptions import McPodcastFetchTranscriptHardException -from podcast_fetch_transcript.fetch_store import 
fetch_store_transcript -from podcast_fetch_transcript.handler import AbstractHandler -from podcast_fetch_transcript.transcript import Transcript - -from .setup_mock_fetch_store import AbstractMockFetchStoreTestCase - - -class MockTranscriptErrorWithExceptionHandler(AbstractHandler): - """Mock handler that fails the transcription with soft error.""" - - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - raise McPodcastFetchTranscriptHardException("Some sort of a permanent problem") - - @classmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - raise NotImplemented("Shouldn't be called.") - - -class MockErrorTestCase(AbstractMockFetchStoreTestCase): - - def test_error(self): - handler = MockTranscriptErrorWithExceptionHandler() - - with pytest.raises(McPodcastFetchTranscriptHardException): - fetch_store_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=self.podcast_episode_transcript_fetches_id, - handler=handler, - ) - - transcript_fetches = self.db.select(table='podcast_episode_transcript_fetches', what_to_select='*').hashes() - assert len(transcript_fetches) == 1 - - transcript_fetch = transcript_fetches[0] - assert transcript_fetch['fetched_at'] - assert transcript_fetch['result'] == 'error' - assert 'permanent problem' in transcript_fetch['error_message'] diff --git a/apps/podcast-fetch-transcript/tests/python/test_mock_not_done.py b/apps/podcast-fetch-transcript/tests/python/test_mock_not_done.py deleted file mode 100644 index 1d4e7cf1f9..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_mock_not_done.py +++ /dev/null @@ -1,53 +0,0 @@ -from typing import Optional - -from mediawords.db import DatabaseHandler - -from podcast_fetch_transcript.fetch_store import fetch_store_transcript -from podcast_fetch_transcript.handler import AbstractHandler -from podcast_fetch_transcript.transcript import Transcript - -from 
.setup_mock_fetch_store import AbstractMockFetchStoreTestCase - - -class MockTranscriptNotDoneHandler(AbstractHandler): - """Mock handler that reports that the transcript is not yet done.""" - - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - return None - - @classmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - raise NotImplemented("Shouldn't be called.") - - -class MockFailedTestCase(AbstractMockFetchStoreTestCase): - - def test_not_done(self): - handler = MockTranscriptNotDoneHandler() - - stories_id = fetch_store_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=self.podcast_episode_transcript_fetches_id, - handler=handler, - ) - assert stories_id is None - - transcript_fetches = self.db.query(""" - SELECT * - FROM podcast_episode_transcript_fetches - ORDER BY podcast_episode_transcript_fetches_id - """).hashes() - assert len(transcript_fetches) == 2, "One fetch that's still in progress, another one added for the future." 
- - transcript_fetch_in_progress = transcript_fetches[0] - assert transcript_fetch_in_progress['fetched_at'] - assert transcript_fetch_in_progress['result'] == 'in_progress' - assert not transcript_fetch_in_progress['error_message'] - - transcript_fetch_readded = transcript_fetches[1] - assert transcript_fetch_readded['add_to_queue_at'] - assert not transcript_fetch_readded['added_to_queue_at'] - assert not transcript_fetch_readded['fetched_at'] - assert not transcript_fetch_readded['result'] - assert not transcript_fetch_readded['error_message'] diff --git a/apps/podcast-fetch-transcript/tests/python/test_mock_success.py b/apps/podcast-fetch-transcript/tests/python/test_mock_success.py deleted file mode 100644 index d388b58f0b..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_mock_success.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Optional - -from mediawords.db import DatabaseHandler -from podcast_fetch_transcript.fetch_store import fetch_store_transcript - -from podcast_fetch_transcript.handler import AbstractHandler -from podcast_fetch_transcript.transcript import Transcript, Utterance, UtteranceAlternative - -from .setup_mock_fetch_store import AbstractMockFetchStoreTestCase - - -class MockTranscriptSuccessHandler(AbstractHandler): - """Mock handler that fetches the transcription successfully.""" - - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - return Transcript( - stories_id=42, - utterances=[ - Utterance( - alternatives=[ - UtteranceAlternative( - text='Kim Kardashian.', - confidence=1.00, - ) - ], - bcp47_language_code='en-US', - ), - ] - ) - - @classmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - return transcript.stories_id - - -class MockSuccessTestCase(AbstractMockFetchStoreTestCase): - - def test_success(self): - handler = MockTranscriptSuccessHandler() - - stories_id = fetch_store_transcript( - 
db=self.db, - podcast_episode_transcript_fetches_id=self.podcast_episode_transcript_fetches_id, - handler=handler, - ) - assert stories_id - - transcript_fetches = self.db.select(table='podcast_episode_transcript_fetches', what_to_select='*').hashes() - assert len(transcript_fetches) == 1 - - transcript_fetch = transcript_fetches[0] - assert transcript_fetch['fetched_at'] - assert transcript_fetch['result'] == 'success' - assert not transcript_fetch['error_message'] diff --git a/apps/podcast-poll-due-operations/.idea/externalDependencies.xml b/apps/podcast-poll-due-operations/.idea/externalDependencies.xml deleted file mode 100644 index 7872ffbcf2..0000000000 --- a/apps/podcast-poll-due-operations/.idea/externalDependencies.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/inspectionProfiles/profiles_settings.xml b/apps/podcast-poll-due-operations/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2da2d..0000000000 --- a/apps/podcast-poll-due-operations/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/misc.xml b/apps/podcast-poll-due-operations/.idea/misc.xml deleted file mode 100644 index 46a8a5a238..0000000000 --- a/apps/podcast-poll-due-operations/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/modules.xml b/apps/podcast-poll-due-operations/.idea/modules.xml deleted file mode 100644 index d113be0932..0000000000 --- a/apps/podcast-poll-due-operations/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/podcast-poll-due-operations.iml b/apps/podcast-poll-due-operations/.idea/podcast-poll-due-operations.iml deleted file mode 100644 index 83a606a6bd..0000000000 --- 
a/apps/podcast-poll-due-operations/.idea/podcast-poll-due-operations.iml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/runConfigurations/Dockerfile.xml b/apps/podcast-poll-due-operations/.idea/runConfigurations/Dockerfile.xml deleted file mode 100644 index e6a39721ac..0000000000 --- a/apps/podcast-poll-due-operations/.idea/runConfigurations/Dockerfile.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/Dockerfile b/apps/podcast-poll-due-operations/Dockerfile deleted file mode 100644 index 9de70b053f..0000000000 --- a/apps/podcast-poll-due-operations/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -# -# Poll database for operations which should be done by now, add transcription fetch for due operations -# - -FROM gcr.io/mcback/common:latest - -# Copy sources -COPY src/ /opt/mediacloud/src/podcast-poll-due-operations/ -ENV PERL5LIB="/opt/mediacloud/src/podcast-poll-due-operations/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-poll-due-operations/python:${PYTHONPATH}" - -# Copy worker script -COPY bin /opt/mediacloud/bin - -USER mediacloud - -CMD ["podcast_poll_due_operations_worker.py"] diff --git a/apps/podcast-poll-due-operations/bin/podcast_poll_due_operations_worker.py b/apps/podcast-poll-due-operations/bin/podcast_poll_due_operations_worker.py deleted file mode 100755 index a87a2763ad..0000000000 --- a/apps/podcast-poll-due-operations/bin/podcast_poll_due_operations_worker.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -from mediawords.job import JobBroker -from mediawords.util.process import fatal_error - -from podcast_poll_due_operations.due_operations import poll_for_due_operations, AbstractFetchTranscriptQueue - - -class JobBrokerFetchTranscriptQueue(AbstractFetchTranscriptQueue): - """Add fetch transcript jobs to job broker's queue.""" - - def add_to_queue(self, 
podcast_episode_transcript_fetches_id: int) -> None: - JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript').add_to_queue( - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id, - ) - - -if __name__ == '__main__': - try: - fetch_transcript_queue = JobBrokerFetchTranscriptQueue() - poll_for_due_operations(fetch_transcript_queue=fetch_transcript_queue) - except Exception as ex: - # Hard and unknown errors (no soft errors here) - fatal_error(f"Unable to poll for due operations: {ex}") diff --git a/apps/podcast-poll-due-operations/docker-compose.tests.yml b/apps/podcast-poll-due-operations/docker-compose.tests.yml deleted file mode 100644 index 912d4cc95e..0000000000 --- a/apps/podcast-poll-due-operations/docker-compose.tests.yml +++ /dev/null @@ -1,54 +0,0 @@ -version: "3.7" - -services: - - podcast-poll-due-operations: - image: gcr.io/mcback/podcast-poll-due-operations:latest - init: true - stop_signal: SIGKILL - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-poll-due-operations/ - - type: bind - source: ./tests/ - target: /opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - # We don't need "rabbitmq-server" to run tests - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-server/conf/ - target: 
/etc/postgresql/11/main/ diff --git a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/due_operations.py b/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/due_operations.py deleted file mode 100644 index ea82dc8949..0000000000 --- a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/due_operations.py +++ /dev/null @@ -1,112 +0,0 @@ -import abc -import time - -from mediawords.db import connect_to_db -from mediawords.util.log import create_logger - -from podcast_poll_due_operations.exceptions import McJobBrokerErrorException - -log = create_logger(__name__) - - -class AbstractFetchTranscriptQueue(object, metaclass=abc.ABCMeta): - """ - Abstract class for adding a story ID to the "podcast-fetch-transcript" queue. - - Useful for testing as having such a class can help us find out whether stories get added to the actual job queue. - """ - - @abc.abstractmethod - def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None: - """ - Add story ID to "podcast-fetch-transcript" job queue. - - :param podcast_episode_transcript_fetches_id: Transcript fetch ID. - """ - raise NotImplemented("Abstract method") - - -def poll_for_due_operations(fetch_transcript_queue: AbstractFetchTranscriptQueue, - stop_after_first_empty_chunk: bool = False, - wait_after_empty_poll: int = 30, - stories_chunk_size: int = 100) -> None: - """ - Continuously poll for due operations, add such operations to "podcast-fetch-transcript" queue. - - Never returns, unless 'stop_after_first_empty_chunk' is set. - - :param fetch_transcript_queue: Queue helper object to use for adding a story ID to "podcast-fetch-transcript" - queue (useful for testing). - :param stop_after_first_empty_chunk: If True, stop after the first attempt to fetch a chunk of due story IDs comes - out empty (useful for testing). - :param wait_after_empty_poll: Seconds to wait after there were no due story IDs found. - :param stories_chunk_size: Max. 
due story IDs to fetch in one go; the chunk will be deleted + returned in a - transaction, which will get reverted if RabbitMQ fails, so we don't want to - hold that transaction for too long. - """ - - if not fetch_transcript_queue: - raise McJobBrokerErrorException(f"Fetch transcript queue object is unset.") - - while True: - - db = connect_to_db() - - log.info("Polling...") - due_operations = db.query(""" - SELECT - podcast_episode_transcript_fetches_id, - add_to_queue_at - FROM podcast_episode_transcript_fetches - - -- Transcript fetch is due - WHERE add_to_queue_at <= NOW() - - -- Transcript fetch wasn't added to the job broker's queue yet - AND podcast_episode_transcript_was_added_to_queue(added_to_queue_at) = 'f' - - -- Get the oldest operations first - ORDER BY add_to_queue_at - - -- Don't fetch too much of stories at once - LIMIT %(stories_chunk_size)s - """, { - 'stories_chunk_size': stories_chunk_size, - }).hashes() - - if due_operations: - - try: - log.info(f"Adding {len(due_operations)} due operations to the transcription fetch queue...") - - for operation in due_operations: - podcast_episode_transcript_fetches_id = operation['podcast_episode_transcript_fetches_id'] - log.debug( - f"Adding fetch ID {podcast_episode_transcript_fetches_id} to the transcription fetch queue..." 
- ) - fetch_transcript_queue.add_to_queue( - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id, - ) - - # Update "added_to_queue_at" individually in case RabbitMQ decides to fail on us - db.query(""" - UPDATE podcast_episode_transcript_fetches - SET added_to_queue_at = NOW() - WHERE podcast_episode_transcript_fetches_id = %(podcast_episode_transcript_fetches_id)s - """, { - 'podcast_episode_transcript_fetches_id': podcast_episode_transcript_fetches_id, - }) - - log.info(f"Done adding {len(due_operations)} due operations to the transcription fetch queue") - except Exception as ex: - - raise McJobBrokerErrorException(f"Unable to add one or more stories the the job queue: {ex}") - - else: - - if stop_after_first_empty_chunk: - log.info(f"No due story IDs found, stopping...") - break - else: - log.info(f"No due story IDs found, waiting for {wait_after_empty_poll} seconds...") - time.sleep(wait_after_empty_poll) diff --git a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/exceptions.py b/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/exceptions.py deleted file mode 100644 index 1bc47e477a..0000000000 --- a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/exceptions.py +++ /dev/null @@ -1,13 +0,0 @@ -class McPodcastPollDueOperationsHardException(Exception): - """Hard errors exception.""" - pass - - -class McDatabaseErrorException(McPodcastPollDueOperationsHardException): - """Exception thrown when we encounter a database error.""" - pass - - -class McJobBrokerErrorException(McPodcastPollDueOperationsHardException): - """Exception thrown when we encounter a job broker (RabbitMQ) error.""" - pass diff --git a/apps/podcast-poll-due-operations/tests/python/setup_due_operation.py b/apps/podcast-poll-due-operations/tests/python/setup_due_operation.py deleted file mode 100644 index 5fcc58e52d..0000000000 --- a/apps/podcast-poll-due-operations/tests/python/setup_due_operation.py +++ 
/dev/null @@ -1,55 +0,0 @@ -import abc -from unittest import TestCase - -from mediawords.db import connect_to_db -from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story - - -class SetupTestOperation(TestCase, metaclass=abc.ABCMeta): - __slots__ = [ - 'db', - 'test_medium', - 'test_feed', - 'story', - 'stories_id', - ] - - def setUp(self): - self.db = connect_to_db() - - self.test_medium = create_test_medium(db=self.db, label='test') - self.test_feed = create_test_feed(db=self.db, label='test', medium=self.test_medium) - self.story = create_test_story(db=self.db, label='test', feed=self.test_feed) - - stories_id = self.story['stories_id'] - - enclosure = self.db.insert(table='story_enclosures', insert_hash={ - 'stories_id': stories_id, - # URL doesn't really matter as we won't be fetching it - 'url': 'http://example.com/', - 'mime_type': 'audio/mpeg', - 'length': 100000, - }) - - episode = self.db.insert(table='podcast_episodes', insert_hash={ - 'stories_id': stories_id, - 'story_enclosures_id': enclosure['story_enclosures_id'], - 'gcs_uri': 'gs://whatever', - 'duration': 1, - 'codec': 'MP3', - 'sample_rate': 44100, - 'bcp47_language_code': 'en-US', - 'speech_operation_id': 'foo', - }) - - self.db.query(""" - INSERT INTO podcast_episode_transcript_fetches ( - podcast_episodes_id, - add_to_queue_at - ) VALUES ( - %(podcast_episodes_id)s, - NOW() - ) - """, { - 'podcast_episodes_id': episode['podcast_episodes_id'], - }) diff --git a/apps/podcast-poll-due-operations/tests/python/test_due_operations.py b/apps/podcast-poll-due-operations/tests/python/test_due_operations.py deleted file mode 100644 index d121178e03..0000000000 --- a/apps/podcast-poll-due-operations/tests/python/test_due_operations.py +++ /dev/null @@ -1,40 +0,0 @@ -from podcast_poll_due_operations.due_operations import poll_for_due_operations, AbstractFetchTranscriptQueue - -from .setup_due_operation import SetupTestOperation - - -class 
MockCounterFetchTranscriptQueue(AbstractFetchTranscriptQueue): - __slots__ = [ - 'story_count', - ] - - def __init__(self): - self.story_count = 0 - - def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None: - self.story_count += 1 - - -class TestPollForDueOperations(SetupTestOperation): - - def test_poll_for_due_operations(self): - """Simple test.""" - - fetch_transcript_queue = MockCounterFetchTranscriptQueue() - - poll_for_due_operations( - fetch_transcript_queue=fetch_transcript_queue, - stop_after_first_empty_chunk=True, - ) - - all_fetches = self.db.select( - table='podcast_episode_transcript_fetches', - what_to_select='*', - ).hashes() - - assert len(all_fetches) == 1, "The fetch should have been kept in the table." - fetch = all_fetches[0] - - assert fetch['added_to_queue_at'], "Timestamp for when the fetch as added to the queue should be set." - - assert fetch_transcript_queue.story_count == 1, "A single story should have been added to the fetch queue." diff --git a/apps/podcast-poll-due-operations/tests/python/test_failing_job_broker.py b/apps/podcast-poll-due-operations/tests/python/test_failing_job_broker.py deleted file mode 100644 index a0e6897f80..0000000000 --- a/apps/podcast-poll-due-operations/tests/python/test_failing_job_broker.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest - -from podcast_poll_due_operations.due_operations import poll_for_due_operations, AbstractFetchTranscriptQueue -from podcast_poll_due_operations.exceptions import McJobBrokerErrorException - -from .setup_due_operation import SetupTestOperation - - -class MockFailingFetchTranscriptQueue(AbstractFetchTranscriptQueue): - - def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None: - raise Exception("Job broker is down") - - -class TestFailingJobBroker(SetupTestOperation): - - def test_failing_job_broker(self): - """Test what happens if the job broker fails.""" - - fetch_transcript_queue = MockFailingFetchTranscriptQueue() - - with 
pytest.raises(McJobBrokerErrorException): - poll_for_due_operations( - fetch_transcript_queue=fetch_transcript_queue, - stop_after_first_empty_chunk=True, - ) - - all_fetches = self.db.select( - table='podcast_episode_transcript_fetches', - what_to_select='*', - ).hashes() - - assert len(all_fetches) == 1, "The fetch should have been kept in the table." - fetch = all_fetches[0] - - assert not fetch['added_to_queue_at'], "Timestamp for when the fetch as added to the queue should be empty." diff --git a/apps/podcast-submit-operation/.idea/externalDependencies.xml b/apps/podcast-submit-operation/.idea/externalDependencies.xml deleted file mode 100644 index 7872ffbcf2..0000000000 --- a/apps/podcast-submit-operation/.idea/externalDependencies.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/inspectionProfiles/profiles_settings.xml b/apps/podcast-submit-operation/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2da2d..0000000000 --- a/apps/podcast-submit-operation/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/misc.xml b/apps/podcast-submit-operation/.idea/misc.xml deleted file mode 100644 index 06b8bbff3f..0000000000 --- a/apps/podcast-submit-operation/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/podcast-submit-operation.iml b/apps/podcast-submit-operation/.idea/podcast-submit-operation.iml deleted file mode 100644 index 1c3aa105bd..0000000000 --- a/apps/podcast-submit-operation/.idea/podcast-submit-operation.iml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/runConfigurations/Dockefile.xml b/apps/podcast-submit-operation/.idea/runConfigurations/Dockefile.xml deleted file mode 
100644 index b5e047b7b0..0000000000 --- a/apps/podcast-submit-operation/.idea/runConfigurations/Dockefile.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/sqldialects.xml b/apps/podcast-submit-operation/.idea/sqldialects.xml deleted file mode 100644 index 790b3f37f8..0000000000 --- a/apps/podcast-submit-operation/.idea/sqldialects.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/vcs.xml b/apps/podcast-submit-operation/.idea/vcs.xml deleted file mode 100644 index b2bdec2d71..0000000000 --- a/apps/podcast-submit-operation/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/Dockerfile b/apps/podcast-submit-operation/Dockerfile deleted file mode 100644 index acf06f32e4..0000000000 --- a/apps/podcast-submit-operation/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# -# Submit a long running operation to Google Speech to Text API for it to transcribe the episode -# - -FROM gcr.io/mcback/common:latest - -# Install Python dependencies -COPY src/requirements.txt /var/tmp/ -RUN \ - cd /var/tmp/ && \ - pip3 install -r requirements.txt && \ - rm requirements.txt && \ - rm -rf /root/.cache/ && \ - true - -# Copy sources -COPY src/ /opt/mediacloud/src/podcast-submit-operation/ -ENV PERL5LIB="/opt/mediacloud/src/podcast-submit-operation/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-submit-operation/python:${PYTHONPATH}" - -# Copy worker script -COPY bin /opt/mediacloud/bin - -USER mediacloud - -CMD ["podcast_submit_operation_worker.py"] diff --git a/apps/podcast-submit-operation/bin/podcast_submit_operation_worker.py b/apps/podcast-submit-operation/bin/podcast_submit_operation_worker.py deleted file mode 100755 index ae3b712e80..0000000000 --- a/apps/podcast-submit-operation/bin/podcast_submit_operation_worker.py +++ /dev/null @@ -1,75 +0,0 
@@ -#!/usr/bin/env python3 - -from mediawords.db import connect_to_db -from mediawords.job import JobBroker -from mediawords.util.log import create_logger -from mediawords.util.perl import decode_object_from_bytes_if_needed -from mediawords.util.process import fatal_error - -from podcast_submit_operation.exceptions import McPodcastSubmitOperationSoftException -from podcast_submit_operation.submit_operation import get_podcast_episode, submit_transcribe_operation - -log = create_logger(__name__) - -ADD_TO_QUEUE_AT_DURATION_MULTIPLIER = 1.1 -""" -How soon to expect the transcription results to become available in relation to episode's duration. - -For example, if the episode's duration is 60 minutes, and the multiplier is 1.1, the transcription results fetch will -first be attempted after 60 * 1.1 = 66 minutes. -""" - - -def run_podcast_submit_operation(stories_id: int) -> None: - """Submit a podcast episode to the Speech API.""" - - if isinstance(stories_id, bytes): - stories_id = decode_object_from_bytes_if_needed(stories_id) - stories_id = int(stories_id) - - db = connect_to_db() - - log.info(f"Submitting story's {stories_id} podcast episode for transcription...") - - try: - episode = get_podcast_episode(db=db, stories_id=stories_id) - speech_operation_id = submit_transcribe_operation(episode=episode) - - db.query(""" - UPDATE podcast_episodes - SET speech_operation_id = %(speech_operation_id)s - WHERE podcast_episodes_id = %(podcast_episodes_id)s - """, { - 'podcast_episodes_id': episode.podcast_episodes_id, - 'speech_operation_id': speech_operation_id, - }) - - add_to_queue_interval = f"{int(episode.duration + ADD_TO_QUEUE_AT_DURATION_MULTIPLIER)} seconds" - db.query(""" - INSERT INTO podcast_episode_transcript_fetches ( - podcast_episodes_id, - add_to_queue_at - ) VALUES ( - %(podcast_episodes_id)s, - NOW() + INTERVAL %(add_to_queue_interval)s - ) - """, { - 'podcast_episodes_id': episode.podcast_episodes_id, - 'add_to_queue_interval': add_to_queue_interval, - 
}) - - except McPodcastSubmitOperationSoftException as ex: - # Soft exceptions - log.error(f"Unable to submit podcast episode for story {stories_id}: {ex}") - raise ex - - except Exception as ex: - # Hard and other exceptions - fatal_error(f"Fatal / unknown error while submitting podcast episode for story {stories_id}: {ex}") - - log.info(f"Done submitting story's {stories_id} podcast episode for transcription") - - -if __name__ == '__main__': - app = JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation') - app.start_worker(handler=run_podcast_submit_operation) diff --git a/apps/podcast-submit-operation/docker-compose.tests.yml b/apps/podcast-submit-operation/docker-compose.tests.yml deleted file mode 100644 index 349eaa1c3b..0000000000 --- a/apps/podcast-submit-operation/docker-compose.tests.yml +++ /dev/null @@ -1,56 +0,0 @@ -version: "3.7" - -services: - - podcast-submit-operation: - image: gcr.io/mcback/podcast-submit-operation:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-submit-operation/ - - type: bind - source: ./tests/ - target: /opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - # We don't need "rabbitmq-server" to run tests - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: 
./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/config.py b/apps/podcast-submit-operation/src/python/podcast_submit_operation/config.py deleted file mode 100644 index 0d20337c98..0000000000 --- a/apps/podcast-submit-operation/src/python/podcast_submit_operation/config.py +++ /dev/null @@ -1,12 +0,0 @@ -from mediawords.util.config import file_with_env_value - - -class PodcastSubmitOperationConfig(object): - """ - Podcast submit transcription operation configuration. - """ - - @staticmethod - def gc_auth_json_file() -> str: - """Return path to Google Cloud authentication JSON file.""" - return file_with_env_value(name='MC_PODCAST_GC_AUTH_JSON_BASE64', encoded_with_base64=True) diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/exceptions.py b/apps/podcast-submit-operation/src/python/podcast_submit_operation/exceptions.py deleted file mode 100644 index 2ed79eb39a..0000000000 --- a/apps/podcast-submit-operation/src/python/podcast_submit_operation/exceptions.py +++ /dev/null @@ -1,54 +0,0 @@ -import abc - - -class _AbstractMcPodcastSubmitOperationException(Exception, metaclass=abc.ABCMeta): - """Abstract exception.""" - pass - - -class McPodcastSubmitOperationSoftException(_AbstractMcPodcastSubmitOperationException): - """Soft errors exception.""" - pass - - -class McPodcastNoEpisodesException(McPodcastSubmitOperationSoftException): - """Exception thrown when there are no episodes for a story.""" - pass - - -class McPodcastEpisodeTooLongException(McPodcastSubmitOperationSoftException): - """Exception raised when podcast's episode is too long.""" - pass - - -# --- - -class McPodcastSubmitOperationHardException(_AbstractMcPodcastSubmitOperationException): - """Hard errors exception.""" - pass - - -class 
McPodcastDatabaseErrorException(McPodcastSubmitOperationHardException): - """Exception thrown on database errors.""" - pass - - -class McPodcastInvalidInputException(McPodcastSubmitOperationHardException): - """Exception thrown on invalid inputs.""" - pass - - -class McPodcastMisconfiguredSpeechAPIException(McPodcastSubmitOperationHardException): - """Exception thrown on misconfigured Google Speech API.""" - pass - - -class McPodcastSpeechAPIRequestFailedException(McPodcastSubmitOperationHardException): - """ - Exception that is thrown when we're unable to submit a new job to Speech API. - - This is a hard exception because we should be able to handle "soft" failures (e.g. temporary network errors) of - Speech API in the code, and on any other, previously unseen, problems (service downtime, ran out of money, blocked, - outdated API version, etc.) it's better just to shut down the worker - """ - pass diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/submit_operation.py b/apps/podcast-submit-operation/src/python/podcast_submit_operation/submit_operation.py deleted file mode 100644 index 32e754ad97..0000000000 --- a/apps/podcast-submit-operation/src/python/podcast_submit_operation/submit_operation.py +++ /dev/null @@ -1,216 +0,0 @@ -import time -from typing import Dict, Any - -# noinspection PyPackageRequirements -from google.api_core.exceptions import ServiceUnavailable -# noinspection PyPackageRequirements -from google.cloud.speech_v1p1beta1 import SpeechClient, RecognitionConfig - -from mediawords.db import DatabaseHandler -from mediawords.util.log import create_logger - -from podcast_submit_operation.config import PodcastSubmitOperationConfig -from podcast_submit_operation.exceptions import ( - McPodcastNoEpisodesException, - McPodcastDatabaseErrorException, - McPodcastInvalidInputException, - McPodcastMisconfiguredSpeechAPIException, - McPodcastEpisodeTooLongException, - McPodcastSpeechAPIRequestFailedException, -) - -log = 
create_logger(__name__) - -MAX_DURATION = 60 * 60 * 2 -"""Max. podcast episode duration (in seconds) to submit for transcription.""" - -MAX_RETRIES = 10 -"""Max. number of retries for submitting a Speech API long running operation.""" - -DELAY_BETWEEN_RETRIES = 5 -"""How long to wait (in seconds) between retries.""" - - -class PodcastEpisode(object): - """ - Podcast episode object. - - Postprocesses database row from "podcast_episodes" and does some extra checks. - """ - __slots__ = [ - '__stories_id', - '__podcast_episodes_id', - '__gcs_uri', - '__duration', - '__codec', - '__sample_rate', - '__bcp47_language_code', - ] - - def __init__(self, stories_id: int, db_row: Dict[str, Any]): - self.__stories_id = stories_id - self.__podcast_episodes_id = db_row['podcast_episodes_id'] - self.__gcs_uri = db_row['gcs_uri'] - self.__duration = db_row['duration'] - self.__codec = db_row['codec'] - self.__sample_rate = db_row['sample_rate'] - self.__bcp47_language_code = db_row['bcp47_language_code'] - - @property - def stories_id(self) -> int: - return self.__stories_id - - @property - def podcast_episodes_id(self) -> int: - return self.__podcast_episodes_id - - @property - def gcs_uri(self) -> str: - if not self.__gcs_uri.startswith('gs://'): - raise McPodcastInvalidInputException("Google Cloud Storage URI doesn't have gs:// prefix.") - return self.__gcs_uri - - @property - def duration(self) -> int: - if not self.__duration: - raise McPodcastInvalidInputException("Duration is unset or zero.") - return self.__duration - - @property - def codec(self) -> RecognitionConfig.AudioEncoding: - try: - encoding_obj = getattr(RecognitionConfig.AudioEncoding, self.__codec) - except Exception as ex: - raise McPodcastInvalidInputException(f"Invalid codec '{self.__codec}': {ex}") - - return encoding_obj - - @property - def sample_rate(self) -> int: - if not self.__sample_rate: - raise McPodcastInvalidInputException("Sample rate is unset or zero.") - return self.__sample_rate - - @property 
- def bcp47_language_code(self) -> str: - if '-' not in self.__bcp47_language_code and self.__bcp47_language_code != 'zh': - raise McPodcastInvalidInputException(f"Invalid BCP 47 language code '{self.__bcp47_language_code}'.") - return self.__bcp47_language_code - - -def get_podcast_episode(db: DatabaseHandler, stories_id: int) -> PodcastEpisode: - """ - Get podcast episode object for story ID. - - :param db: Database handler. - :param stories_id: Story ID. - :return: Podcast episode object. - """ - try: - podcast_episodes = db.select( - table='podcast_episodes', - what_to_select='*', - condition_hash={'stories_id': stories_id}, - ).hashes() - - except Exception as ex: - raise McPodcastDatabaseErrorException(f"Unable to fetch story's {stories_id} podcast episodes: {ex}") - - if not podcast_episodes: - raise McPodcastNoEpisodesException(f"There are no podcast episodes for story {stories_id}") - - if len(podcast_episodes) > 1: - # That's very weird, there should be only one episode per story - raise McPodcastDatabaseErrorException(f"There's more than one podcast episode for story {stories_id}") - - try: - episode = PodcastEpisode(stories_id=stories_id, db_row=podcast_episodes[0]) - except Exception as ex: - raise McPodcastInvalidInputException(f"Invalid episode for story {stories_id}: {ex}") - - if episode.duration > MAX_DURATION: - raise McPodcastEpisodeTooLongException( - f"Story's {stories_id} podcast episode is too long ({episode.duration} seconds)." - ) - - return episode - - -def submit_transcribe_operation(episode: PodcastEpisode) -> int: - """ - Submit a Speech API long running operation to transcribe a podcast episode. - - :param episode: Podcast episode object. - :return Operation's ID to use for fetching operation results. 
- """ - - try: - config = PodcastSubmitOperationConfig() - client = SpeechClient.from_service_account_json(config.gc_auth_json_file()) - except Exception as ex: - raise McPodcastMisconfiguredSpeechAPIException(f"Unable to create Speech API client: {ex}") - - try: - config = RecognitionConfig( - encoding=episode.codec, - sample_rate_hertz=episode.sample_rate, - # We always set the channel count to 1 and disable separate recognition per channel as our inputs are all - # mono audio files and do not have separate speakers per audio channel. - audio_channel_count=1, - enable_separate_recognition_per_channel=False, - language_code=episode.bcp47_language_code, - alternative_language_codes=[ - # FIXME add all Chinese variants - # FIXME add Mexican Spanish variants - ], - - speech_contexts=[ - # Speech API works pretty well without custom contexts - ], - # Don't care that much about word confidence - enable_word_confidence=False, - # Punctuation doesn't work that well but we still enable it here - enable_automatic_punctuation=True, - # Not setting 'model' as 'use_enhanced' will then choose the best model for us - # Using enhanced (more expensive) model, where available - use_enhanced=True, - ) - except Exception as ex: - raise McPodcastMisconfiguredSpeechAPIException(f"Unable to initialize Speech API configuration: {ex}") - - log.info(f"Submitting a Speech API operation for story {episode.stories_id}...") - speech_operation = None - for attempt in range(1, MAX_RETRIES + 1): - - if attempt > 1: - log.warning(f"Waiting for {DELAY_BETWEEN_RETRIES} seconds and retrying #{attempt}...") - time.sleep(DELAY_BETWEEN_RETRIES) - - try: - speech_operation = client.long_running_recognize(config=config, audio={"uri": episode.gcs_uri}) - except ServiceUnavailable as ex: - # Speech API sometimes throws: - # - # google.api_core.exceptions.ServiceUnavailable: 503 failed to connect to all addresses - # - log.error(f"Unable to submit an operation because service is unavailable: {ex}") - except 
Exception as ex: - raise McPodcastSpeechAPIRequestFailedException(f"Unable to submit a Speech API operation: {ex}") - else: - break - - if not speech_operation: - raise McPodcastSpeechAPIRequestFailedException(f"Ran out of retries while submitting Speech API operation.") - - try: - # We get the operation name in a try-except block because accessing it is not that well documented, so Google - # might change the property names whenever they please and we wouldn't necessarily notice otherwise - operation_id = speech_operation.operation.name - if not operation_id: - raise McPodcastMisconfiguredSpeechAPIException(f"Operation name is empty.") - except Exception as ex: - raise McPodcastMisconfiguredSpeechAPIException(f"Unable to get operation name: {ex}") - - log.info(f"Submitted Speech API operation '{operation_id}' for story {episode.stories_id}") - - return operation_id diff --git a/apps/podcast-submit-operation/src/requirements.txt b/apps/podcast-submit-operation/src/requirements.txt deleted file mode 100644 index 59e80a7b73..0000000000 --- a/apps/podcast-submit-operation/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -google-cloud-speech==2.0.1 diff --git a/apps/podcast-submit-operation/tests/python/test_submit_operation.py b/apps/podcast-submit-operation/tests/python/test_submit_operation.py deleted file mode 100644 index 013037530c..0000000000 --- a/apps/podcast-submit-operation/tests/python/test_submit_operation.py +++ /dev/null @@ -1,40 +0,0 @@ -from mediawords.db import connect_to_db -from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story - -from podcast_submit_operation.submit_operation import get_podcast_episode, submit_transcribe_operation - - -def test_submit_transcribe_operation(): - test_gcs_uri = "gs://mc-podcast-sample-audio-files/samples/kim_kardashian-mp3.mp3" - - db = connect_to_db() - test_medium = create_test_medium(db=db, label='test') - test_feed = create_test_feed(db=db, label='test', medium=test_medium) - 
story = create_test_story(db=db, label='test', feed=test_feed) - - stories_id = story['stories_id'] - - enclosure = db.insert(table='story_enclosures', insert_hash={ - 'stories_id': stories_id, - # URL doesn't really matter as we won't be fetching it - 'url': 'http://example.com/', - 'mime_type': 'audio/mpeg', - 'length': 100000, - }) - - db.insert(table='podcast_episodes', insert_hash={ - 'stories_id': stories_id, - 'story_enclosures_id': enclosure['story_enclosures_id'], - 'gcs_uri': test_gcs_uri, - - # We lie about the duration because we want to test whether 'add_to_queue_at' will be set way into the future - 'duration': 60 * 60, - - 'codec': 'MP3', - 'sample_rate': 44100, - 'bcp47_language_code': 'en-US', - }) - - episode = get_podcast_episode(db=db, stories_id=stories_id) - speech_operation_id = submit_transcribe_operation(episode=episode) - assert speech_operation_id diff --git a/apps/podcast-fetch-transcript/.dockerignore b/apps/podcast-transcribe-episode/.dockerignore similarity index 100% rename from apps/podcast-fetch-transcript/.dockerignore rename to apps/podcast-transcribe-episode/.dockerignore diff --git a/apps/podcast-transcribe-episode/.idea/.gitignore b/apps/podcast-transcribe-episode/.idea/.gitignore new file mode 100644 index 0000000000..73f69e0958 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/apps/podcast-fetch-episode/.idea/externalDependencies.xml b/apps/podcast-transcribe-episode/.idea/externalDependencies.xml similarity index 100% rename from apps/podcast-fetch-episode/.idea/externalDependencies.xml rename to apps/podcast-transcribe-episode/.idea/externalDependencies.xml diff --git a/apps/podcast-transcribe-episode/.idea/inspectionProfiles/Project_Default.xml 
b/apps/podcast-transcribe-episode/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000000..fe9d3b7548 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,15 @@ + + + + \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/inspectionProfiles/profiles_settings.xml b/apps/podcast-transcribe-episode/.idea/inspectionProfiles/profiles_settings.xml similarity index 100% rename from apps/podcast-fetch-episode/.idea/inspectionProfiles/profiles_settings.xml rename to apps/podcast-transcribe-episode/.idea/inspectionProfiles/profiles_settings.xml diff --git a/apps/podcast-transcribe-episode/.idea/mediawords.sql b/apps/podcast-transcribe-episode/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/podcast-transcribe-episode/.idea/misc.xml b/apps/podcast-transcribe-episode/.idea/misc.xml new file mode 100644 index 0000000000..d89177f747 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/modules.xml b/apps/podcast-transcribe-episode/.idea/modules.xml similarity index 51% rename from apps/podcast-submit-operation/.idea/modules.xml rename to apps/podcast-transcribe-episode/.idea/modules.xml index 26bb21f27e..9023537213 100644 --- a/apps/podcast-submit-operation/.idea/modules.xml +++ b/apps/podcast-transcribe-episode/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/podcast-fetch-transcript.iml b/apps/podcast-transcribe-episode/.idea/podcast-transcribe-episode.iml similarity index 80% rename from apps/podcast-fetch-transcript/.idea/podcast-fetch-transcript.iml rename to 
apps/podcast-transcribe-episode/.idea/podcast-transcribe-episode.iml index ffc8ff3cc9..16f0d9a079 100644 --- a/apps/podcast-fetch-transcript/.idea/podcast-fetch-transcript.iml +++ b/apps/podcast-transcribe-episode/.idea/podcast-transcribe-episode.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/podcast-fetch-episode/.idea/runConfigurations/Dockerfile.xml b/apps/podcast-transcribe-episode/.idea/runConfigurations/Dockerfile.xml similarity index 83% rename from apps/podcast-fetch-episode/.idea/runConfigurations/Dockerfile.xml rename to apps/podcast-transcribe-episode/.idea/runConfigurations/Dockerfile.xml index 83b1a58573..85f79e0693 100644 --- a/apps/podcast-fetch-episode/.idea/runConfigurations/Dockerfile.xml +++ b/apps/podcast-transcribe-episode/.idea/runConfigurations/Dockerfile.xml @@ -2,12 +2,12 @@ - diff --git a/apps/podcast-transcribe-episode/.idea/sqlDataSources.xml b/apps/podcast-transcribe-episode/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..688da92e7e --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/sqldialects.xml b/apps/podcast-transcribe-episode/.idea/sqldialects.xml similarity index 62% rename from apps/podcast-poll-due-operations/.idea/sqldialects.xml rename to apps/podcast-transcribe-episode/.idea/sqldialects.xml index 790b3f37f8..f8c2c59528 100644 --- a/apps/podcast-poll-due-operations/.idea/sqldialects.xml +++ b/apps/podcast-transcribe-episode/.idea/sqldialects.xml @@ -1,7 +1,7 @@ - + \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/vcs.xml b/apps/podcast-transcribe-episode/.idea/vcs.xml similarity index 100% rename from apps/podcast-fetch-episode/.idea/vcs.xml rename to apps/podcast-transcribe-episode/.idea/vcs.xml diff --git a/apps/podcast-transcribe-episode/Dockerfile b/apps/podcast-transcribe-episode/Dockerfile new file mode 100644 index 0000000000..9cec878422 --- /dev/null 
+++ b/apps/podcast-transcribe-episode/Dockerfile @@ -0,0 +1,36 @@ +# +# Fetch podcast episode, convert it (if needed), transcribe and store to the database +# + +FROM gcr.io/mcback/common:latest + +# Install FFmpeg for manipulating audio files +RUN apt-get -y --no-install-recommends install ffmpeg + +# Install Python dependencies +COPY src/requirements.txt /var/tmp/ +RUN \ + cd /var/tmp/ && \ + pip3 install -r requirements.txt && \ + rm requirements.txt && \ + rm -rf /root/.cache/ && \ + true + +# Copy sources +COPY src/ /opt/mediacloud/src/podcast-transcribe-episode/ +ENV PERL5LIB="/opt/mediacloud/src/podcast-transcribe-episode/perl:${PERL5LIB}" \ + PYTHONPATH="/opt/mediacloud/src/podcast-transcribe-episode/python:${PYTHONPATH}" + +# Copy worker script +COPY bin /opt/mediacloud/bin + +USER mediacloud + +# Set a failing CMD because we'll be using the same image to run: +# +# * "rabbitmq_worker.py" - processes Celery jobs, starts Temporal workflows for those; +# * "workflow_worker.py" - runs Temporal workflows. +# +# so the user is expected to set "command" in docker-compose.yml to run a specific worker. 
+# +CMD ["SET_CONTAINER_COMMAND_TO_ONE_OF_THE_WORKERS"] diff --git a/apps/podcast-transcribe-episode/README.md b/apps/podcast-transcribe-episode/README.md new file mode 100644 index 0000000000..a975e3f612 --- /dev/null +++ b/apps/podcast-transcribe-episode/README.md @@ -0,0 +1,18 @@ +# Podcast transcription + +## TODO + +* [Upload transcriptions directly to GCS](https://cloud.google.com/speech-to-text/docs/async-recognize#speech_transcribe_async_gcs-python) + once that's no longer a demo feature +* Add all Chinese variants to `alternative_language_codes` +* Add all Mexican Spanish variants to `alternative_language_codes` +* Post-init [validation of dataclasses](https://docs.python.org/3/library/dataclasses.html#post-init-processing) +* When operation ID can't be found, resubmit the podcast for transcription as that might mean that the operation results + weren't fetched in time and so the operation has expired +* Add heartbeats to transcoding activity +* Test running the same activity multiple times +* If an activity throws an exception, its message should get printed out to the console as well (in addition to + Temporal's log) +* Track failed workflows / activities in Munin +* Instead (in addition to) of setting `workflow_run_timeout` in `test_workflow.py`, limit retries of the individual + activities too so that when they fail, we'd get a nice error message printed to the test log diff --git a/apps/podcast-transcribe-episode/bin/rabbitmq_worker.py b/apps/podcast-transcribe-episode/bin/rabbitmq_worker.py new file mode 100755 index 0000000000..52f6863072 --- /dev/null +++ b/apps/podcast-transcribe-episode/bin/rabbitmq_worker.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +import asyncio + +from mediawords.job import JobBroker +from mediawords.util.log import create_logger +from mediawords.util.perl import decode_object_from_bytes_if_needed +from mediawords.workflow.client import workflow_client + +# noinspection PyPackageRequirements +from temporal.workflow import 
WorkflowClient, WorkflowOptions + +from podcast_transcribe_episode.workflow_interface import PodcastTranscribeWorkflow + +log = create_logger(__name__) + + +async def _start_workflow(stories_id: int) -> None: + log.info(f"Starting a workflow for story {stories_id}...") + + client = workflow_client() + workflow: PodcastTranscribeWorkflow = client.new_workflow_stub( + cls=PodcastTranscribeWorkflow, + workflow_options=WorkflowOptions(workflow_id=str(stories_id)), + ) + + # Fire and forget as the workflow will do everything (including adding a extraction job) itself + await WorkflowClient.start(workflow.transcribe_episode, stories_id) + + log.info(f"Started a workflow for story {stories_id}...") + + +def run_podcast_fetch_episode(stories_id: int) -> None: + if isinstance(stories_id, bytes): + stories_id = decode_object_from_bytes_if_needed(stories_id) + stories_id = int(stories_id) + + asyncio.run(_start_workflow(stories_id=stories_id)) + + +if __name__ == '__main__': + app = JobBroker(queue_name='MediaWords::Job::Podcast::TranscribeEpisode') + app.start_worker(handler=run_podcast_fetch_episode) diff --git a/apps/podcast-transcribe-episode/bin/workflow_worker.py b/apps/podcast-transcribe-episode/bin/workflow_worker.py new file mode 100755 index 0000000000..6859753a8e --- /dev/null +++ b/apps/podcast-transcribe-episode/bin/workflow_worker.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import asyncio + +# noinspection PyPackageRequirements +from temporal.workerfactory import WorkerFactory + +from mediawords.util.log import create_logger +from mediawords.workflow.client import workflow_client + +from podcast_transcribe_episode.workflow import PodcastTranscribeWorkflowImpl, PodcastTranscribeActivitiesImpl +from podcast_transcribe_episode.workflow_interface import TASK_QUEUE, PodcastTranscribeActivities + +log = create_logger(__name__) + + +async def _start_worker(): + client = workflow_client() + factory = WorkerFactory(client=client, namespace=client.namespace) + worker = 
factory.new_worker(task_queue=TASK_QUEUE) + worker.register_activities_implementation( + activities_instance=PodcastTranscribeActivitiesImpl(), + activities_cls_name=PodcastTranscribeActivities.__name__, + ) + worker.register_workflow_implementation_type(impl_cls=PodcastTranscribeWorkflowImpl) + factory.start() + + +if __name__ == '__main__': + loop = asyncio.get_event_loop() + asyncio.ensure_future(_start_worker()) + loop.run_forever() diff --git a/apps/podcast-transcribe-episode/docker-compose.tests.yml b/apps/podcast-transcribe-episode/docker-compose.tests.yml new file mode 100644 index 0000000000..dff828c7c0 --- /dev/null +++ b/apps/podcast-transcribe-episode/docker-compose.tests.yml @@ -0,0 +1,148 @@ +version: "3.7" + +services: + + podcast-transcribe-episode: + image: gcr.io/mcback/podcast-transcribe-episode:latest + init: true + stop_signal: SIGKILL + environment: + MC_PODCAST_AUTH_JSON_BASE64: "${MC_PODCAST_AUTH_JSON_BASE64}" + MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME: "${MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME}" + MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME: "${MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME}" + MC_PODCAST_TRANSCRIPTS_BUCKET_NAME: "${MC_PODCAST_TRANSCRIPTS_BUCKET_NAME}" + # Dev/test environments don't use path prefixes: + # + # * MC_PODCAST_RAW_ENCLOSURES_PATH_PREFIX + # * MC_PODCAST_TRANSCODED_EPISODES_PATH_PREFIX + # * MC_PODCAST_TRANSCRIPTS_PATH_PREFIX + # + # as they create a different, timestamped prefix for every test run. 
+ + volumes: + - type: bind + source: ./bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./src/ + target: /opt/mediacloud/src/podcast-transcribe-episode/ + - type: bind + source: ./tests/ + target: /opt/mediacloud/tests/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - rabbitmq-server + - temporal-server + + # Not needed for running the test but useful for debugging, demos + # and such + - temporal-webapp + + postgresql-pgbouncer: + image: gcr.io/mcback/postgresql-pgbouncer:latest + init: true + stop_signal: SIGKILL + expose: + - 6432 + volumes: + - type: bind + source: ./../postgresql-pgbouncer/conf/ + target: /etc/pgbouncer/ + depends_on: + - postgresql-server + + postgresql-server: + image: gcr.io/mcback/postgresql-server:latest + init: true + stop_signal: SIGKILL + expose: + - 5432 + volumes: + - type: bind + source: ./../postgresql-server/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../postgresql-server/schema/ + target: /opt/mediacloud/schema/ + - type: bind + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ + + rabbitmq-server: + image: gcr.io/mcback/rabbitmq-server:latest + init: true + stop_signal: SIGKILL + expose: + - 5672 + - 15672 + volumes: + - type: bind + source: ./../rabbitmq-server/conf/ + target: /etc/rabbitmq/ + + temporal-server: + image: gcr.io/mcback/temporal-server:latest + init: true + stop_signal: SIGKILL + depends_on: + - temporal-postgresql + - temporal-elasticsearch + expose: + - 6933 + - 6934 + - 6935 + - 6939 + - 7233 + - 7234 + - 7235 + - 7239 + volumes: + - type: bind + source: ./../temporal-server/bin/ + target: /opt/temporal-server/bin/ + - type: bind + source: ./../temporal-server/config/dynamicconfig.yaml + target: /opt/temporal-server/config/dynamicconfig.yaml + - type: bind + source: ./../temporal-server/config/mediacloud_template.yaml + target: /opt/temporal-server/config/mediacloud_template.yaml + + 
temporal-postgresql: + image: gcr.io/mcback/temporal-postgresql:latest + init: true + stop_signal: SIGKILL + expose: + - 5432 + volumes: + - type: bind + source: ./../temporal-postgresql/bin/ + target: /opt/temporal-postgresql/bin/ + - type: bind + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ + + temporal-elasticsearch: + image: gcr.io/mcback/temporal-elasticsearch:latest + init: true + stop_signal: SIGKILL + expose: + - "9200" + - "9300" + volumes: + - type: bind + source: ./../elasticsearch-base/bin/elasticsearch.sh + target: /opt/elasticsearch/bin/elasticsearch.sh + # Not mounting config as it gets concatenated into a single file + + temporal-webapp: + image: gcr.io/mcback/temporal-webapp:latest + init: true + stop_signal: SIGKILL + expose: + - "8088" + ports: + # Expose to host for debugging + - "8088:8088" diff --git a/apps/podcast-fetch-episode/tests/python/__init__.py b/apps/podcast-transcribe-episode/src/__init__.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/__init__.py rename to apps/podcast-transcribe-episode/src/__init__.py diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/__init__.py b/apps/podcast-transcribe-episode/src/python/__init__.py similarity index 100% rename from apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/__init__.py rename to apps/podcast-transcribe-episode/src/python/__init__.py diff --git a/apps/podcast-fetch-transcript/tests/python/__init__.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/__init__.py similarity index 100% rename from apps/podcast-fetch-transcript/tests/python/__init__.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/__init__.py diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/audio_codecs.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/audio_codecs.py similarity index 84% rename from 
apps/podcast-fetch-episode/src/python/podcast_fetch_episode/audio_codecs.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/audio_codecs.py index 4dafee4146..1529b3b34c 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/audio_codecs.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/audio_codecs.py @@ -10,36 +10,32 @@ class AbstractAudioCodec(object, metaclass=abc.ABCMeta): - @classmethod - @abc.abstractmethod - def postgresql_enum_value(cls) -> str: - """Return value from 'podcast_episodes_audio_codec' PostgreSQL enum.""" - raise NotImplemented("Abstract method") - @classmethod @abc.abstractmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: """Return True if ffmpeg.probe()'s one of the streams ('streams' key) is of this codec.""" - raise NotImplemented("Abstract method") + raise NotImplementedError @classmethod @abc.abstractmethod def ffmpeg_container_format(cls) -> str: """Return FFmpeg container format (-f argument).""" - raise NotImplemented("Abstract method") + raise NotImplementedError @classmethod @abc.abstractmethod def mime_type(cls) -> str: """Return MIME type to store as GCS object metadata.""" - raise NotImplemented("Abstract method") + raise NotImplementedError + @classmethod + @abc.abstractmethod + def speech_api_codec(cls) -> str: + """Return codec enum value to pass to Speech API when submitting the transcription operation.""" + raise NotImplementedError -class Linear16AudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'LINEAR16' +class Linear16AudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -53,12 +49,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/wav' + @classmethod + def speech_api_codec(cls) -> str: + return 'LINEAR16' -class FLACAudioCodec(AbstractAudioCodec): - 
@classmethod - def postgresql_enum_value(cls) -> str: - return 'FLAC' +class FLACAudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -73,12 +69,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/flac' + @classmethod + def speech_api_codec(cls) -> str: + return 'FLAC' -class MULAWAudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'MULAW' +class MULAWAudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -92,12 +88,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/basic' + @classmethod + def speech_api_codec(cls) -> str: + return 'MULAW' -class OggOpusAudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'OGG_OPUS' +class OggOpusAudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -111,12 +107,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/ogg' + @classmethod + def speech_api_codec(cls) -> str: + return 'OGG_OPUS' -class MP3AudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'MP3' +class MP3AudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -129,3 +125,7 @@ def ffmpeg_container_format(cls) -> str: @classmethod def mime_type(cls) -> str: return 'audio/mpeg' + + @classmethod + def speech_api_codec(cls) -> str: + return 'MP3' diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/bcp47_lang.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/bcp47_lang.py similarity index 100% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/bcp47_lang.py rename to 
apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/bcp47_lang.py diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py new file mode 100644 index 0000000000..cd41237dcc --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py @@ -0,0 +1,108 @@ +import abc + +from mediawords.util.config import env_value, file_with_env_value + + +class AbstractGCBucketConfig(object, metaclass=abc.ABCMeta): + """ + Configuration of a single GCS bucket. + """ + + __slots__ = [ + '__bucket_name', + '__path_prefix', + ] + + def __init__(self, bucket_name: str = None, path_prefix: str = None): + """ + Constructor. + + Test classes might decide to override those. + """ + self.__bucket_name = bucket_name or self._default_bucket_name() + self.__path_prefix = path_prefix or self._default_path_prefix() + + def bucket_name(self) -> str: + return self.__bucket_name + + def path_prefix(self) -> str: + return self.__path_prefix + + @abc.abstractmethod + def _default_bucket_name(self) -> str: + """Default bucket name to upload objects to / download from.""" + raise NotImplementedError + + @abc.abstractmethod + def _default_path_prefix(self) -> str: + """Default path prefix under which the objects are to be found.""" + raise NotImplementedError + + +class RawEnclosuresGCBucketConfig(AbstractGCBucketConfig): + + def _default_bucket_name(self) -> str: + return env_value(name='MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME') + + def _default_path_prefix(self) -> str: + return env_value(name='MC_PODCAST_RAW_ENCLOSURES_PATH_PREFIX') + + +class TranscodedEpisodesGCBucketConfig(AbstractGCBucketConfig): + + def _default_bucket_name(self) -> str: + return env_value(name='MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME') + + def _default_path_prefix(self) -> str: + return env_value(name='MC_PODCAST_TRANSCODED_EPISODES_PATH_PREFIX') + + +class 
TranscriptsGCBucketConfig(AbstractGCBucketConfig): + + def _default_bucket_name(self) -> str: + return env_value(name='MC_PODCAST_TRANSCRIPTS_BUCKET_NAME') + + def _default_path_prefix(self) -> str: + return env_value(name='MC_PODCAST_TRANSCRIPTS_PATH_PREFIX') + + +class GCAuthConfig(object): + + # noinspection PyMethodMayBeStatic + def json_file(self) -> str: + """Path to Google Cloud authentication JSON file.""" + return file_with_env_value(name='MC_PODCAST_AUTH_JSON_BASE64', encoded_with_base64=True) + + +class PodcastTranscribeEpisodeConfig(object): + """Podcast transcription configuration.""" + + # noinspection PyMethodMayBeStatic + def max_enclosure_size(self) -> int: + """Max. enclosure size (in bytes) that we're willing to download.""" + return 1024 * 1024 * 500 + + # noinspection PyMethodMayBeStatic + def max_duration(self) -> int: + """Max. podcast episode duration (in seconds) to submit for transcription.""" + return 60 * 60 * 2 + + # noinspection PyMethodMayBeStatic + def gc_auth(self) -> GCAuthConfig: + """Google Cloud (both Storage and Speech API) authentication configuration.""" + return GCAuthConfig() + + # noinspection PyMethodMayBeStatic + def raw_enclosures(self) -> AbstractGCBucketConfig: + """Configuration for GCS bucket where raw enclosures will be stored.""" + return RawEnclosuresGCBucketConfig() + + # noinspection PyMethodMayBeStatic + def transcoded_episodes(self) -> AbstractGCBucketConfig: + """Configuration for GCS bucket where transcoded, Speech API-ready episodes will be stored.""" + return TranscodedEpisodesGCBucketConfig() + + # noinspection PyMethodMayBeStatic + def transcripts(self) -> AbstractGCBucketConfig: + """Configuration for GCS bucket where JSON transcripts will be stored.""" + return TranscriptsGCBucketConfig() diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/enclosure.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/enclosure.py similarity index 72% rename from 
apps/podcast-fetch-episode/src/python/podcast_fetch_episode/enclosure.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/enclosure.py index 6d734c4d8b..f368de5ece 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/enclosure.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/enclosure.py @@ -1,6 +1,7 @@ import dataclasses from typing import Optional, Dict, Any +# noinspection PyPackageRequirements from furl import furl from mediawords.db import DatabaseHandler @@ -9,25 +10,32 @@ log = create_logger(__name__) -_MP3_MIME_TYPES = {'audio/mpeg', 'audio/mpeg3', 'audio/mp3', 'audio/x-mpeg-3'} -"""MIME types which MP3 files might have.""" - -MAX_ENCLOSURE_SIZE = 1024 * 1024 * 500 -"""Max. enclosure size (in bytes) that we're willing to download.""" +StoryEnclosureDict = Dict[str, Any] @dataclasses.dataclass class StoryEnclosure(object): """Single story enclosure derived from feed's element.""" + + __MP3_MIME_TYPES = {'audio/mpeg', 'audio/mpeg3', 'audio/mp3', 'audio/x-mpeg-3'} + """MIME types which MP3 files might have.""" + story_enclosures_id: int + """ID from 'story_enclosures' table.""" + url: str + """Enclosure's URL, e.g. 'https://www.example.com/episode.mp3'.""" + mime_type: Optional[str] + """Enclosure's reported MIME type, or None if it wasn't reported; e.g. 
'audio/mpeg'.""" + length: Optional[int] + """Enclosure's reported length in bytes, or None if it wasn't reported.""" def mime_type_is_mp3(self) -> bool: """Return True if declared MIME type is one of the MP3 ones.""" if self.mime_type: - if self.mime_type.lower() in _MP3_MIME_TYPES: + if self.mime_type.lower() in self.__MP3_MIME_TYPES: return True return False @@ -62,8 +70,15 @@ def from_db_row(cls, db_row: Dict[str, Any]) -> 'StoryEnclosure': length=db_row['length'], ) + def to_dict(self) -> StoryEnclosureDict: + return dataclasses.asdict(self) + + @classmethod + def from_dict(cls, input_dict: StoryEnclosureDict) -> 'StoryEnclosure': + return cls(**input_dict) + -def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> Optional[StoryEnclosure]: +def viable_story_enclosure(db: DatabaseHandler, stories_id: int) -> Optional[StoryEnclosure]: """Fetch all enclosures, find and return the one that looks like a podcast episode the most (or None).""" story_enclosures_dicts = db.query(""" SELECT * @@ -85,14 +100,14 @@ def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> for enclosure_dict in story_enclosures_dicts: if is_http_url(enclosure_dict['url']): - story_enclosures.append(StoryEnclosure.from_db_row(db_row=enclosure_dict)) + story_enclosures.append(StoryEnclosure.from_db_row(enclosure_dict)) chosen_enclosure = None # Look for MP3 files in MIME type for enclosure in story_enclosures: if enclosure.mime_type_is_mp3(): - log.info(f"Choosing enclosure '{enclosure}' by its MP3 MIME type '{enclosure.mime_type}'") + log.info(f"Choosing enclosure '{enclosure}' due to its MP3 MIME type '{enclosure.mime_type}'") chosen_enclosure = enclosure break @@ -100,7 +115,7 @@ def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> if not chosen_enclosure: for enclosure in story_enclosures: if enclosure.url_path_has_mp3_extension(): - log.info(f"Choosing enclosure '{enclosure}' by its URL '{enclosure.url}'") + 
log.info(f"Choosing enclosure '{enclosure}' due to its URL '{enclosure.url}'") chosen_enclosure = enclosure break @@ -109,7 +124,7 @@ def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> if not chosen_enclosure: for enclosure in story_enclosures: if enclosure.mime_type_is_audio(): - log.info(f"Choosing enclosure '{enclosure}' by its audio MIME type '{enclosure.mime_type}'") + log.info(f"Choosing enclosure '{enclosure}' due to its audio MIME type '{enclosure.mime_type}'") chosen_enclosure = enclosure break @@ -117,7 +132,7 @@ def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> if not chosen_enclosure: for enclosure in story_enclosures: if enclosure.mime_type_is_video(): - log.info(f"Choosing enclosure '{enclosure}' by its video MIME type '{enclosure.mime_type}'") + log.info(f"Choosing enclosure '{enclosure}' due to its video MIME type '{enclosure.mime_type}'") chosen_enclosure = enclosure break diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_url.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_url.py similarity index 63% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_url.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_url.py index 7d7b6716e6..5d76f9b768 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_url.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_url.py @@ -1,10 +1,10 @@ import os +# noinspection PyPackageRequirements import requests from mediawords.util.log import create_logger - -from podcast_fetch_episode.exceptions import McPodcastFileFetchFailureException, McPodcastFileStoreFailureException +from mediawords.workflow.exceptions import McProgrammingError, McPermanentError, McTransientError log = create_logger(__name__) @@ -22,16 +22,17 @@ def fetch_big_file(url: str, dest_file: str, max_size: int = 0) -> 
None: """ Fetch a huge file from an URL to a local file. - Raises on exceptions. + Raises one of the _AbstractFetchBigFileException exceptions. :param url: URL that points to a huge file. :param dest_file: Destination path to write the fetched file to. :param max_size: If >0, limit the file size to a defined number of bytes. + :raise: ProgrammingError on unexpected fatal conditions. """ if os.path.exists(dest_file): # Something's wrong with the code - raise McPodcastFileStoreFailureException(f"Destination file '{dest_file}' already exists.") + raise McProgrammingError(f"Destination file '{dest_file}' already exists.") try: @@ -49,36 +50,31 @@ def fetch_big_file(url: str, dest_file: str, max_size: int = 0) -> None: bytes_read += len(chunk) if max_size: if bytes_read > max_size: - raise McPodcastFileFetchFailureException( - f"The file is bigger than the max. size of {max_size}" - ) + raise McPermanentError(f"The file is bigger than the max. size of {max_size}") f.write(chunk) f.flush() - except McPodcastFileFetchFailureException as ex: + except McPermanentError as ex: __cleanup_dest_file(dest_file=dest_file) - # Raise fetching failures further as they're soft exceptions - raise McPodcastFileFetchFailureException(f"Unable to fetch {url}: {ex}") + raise ex except requests.exceptions.RequestException as ex: __cleanup_dest_file(dest_file=dest_file) - # Treat any "requests" exception as a soft failure - raise McPodcastFileFetchFailureException(f"'requests' exception while fetching {url}: {ex}") + raise McTransientError(f"'requests' exception while fetching {url}: {ex}") except Exception as ex: __cleanup_dest_file(dest_file=dest_file) - # Any other exception is assumed to be a temporary file write problem - raise McPodcastFileStoreFailureException(f"Unable to fetch and store {url}: {ex}") + raise McTransientError(f"Unable to fetch and store {url}: {ex}") if not os.path.isfile(dest_file): __cleanup_dest_file(dest_file=dest_file) # There should be something here so in some 
way it is us that have messed up - raise McPodcastFileStoreFailureException(f"Fetched file {dest_file} is not here after fetching it.") + raise McProgrammingError(f"Fetched file {dest_file} is not here after fetching it.") diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/gcs_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/gcs_store.py new file mode 100644 index 0000000000..a41f76209d --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/gcs_store.py @@ -0,0 +1,217 @@ +import os +from typing import Optional + +# noinspection PyPackageRequirements +from google.cloud import storage +# noinspection PyPackageRequirements +from google.cloud.exceptions import NotFound +# noinspection PyPackageRequirements +from google.cloud.storage import Blob, Bucket +# noinspection PyPackageRequirements +from google.cloud.storage.retry import DEFAULT_RETRY + +from mediawords.util.log import create_logger +from mediawords.workflow.exceptions import McProgrammingError, McConfigurationError, McPermanentError, McTransientError + +from .config import AbstractGCBucketConfig, GCAuthConfig + +log = create_logger(__name__) + +_GCS_API_RETRIES = DEFAULT_RETRY.with_delay(initial=5, maximum=60, multiplier=2).with_deadline(deadline=60 * 10) +"""Google Cloud Storage's retry policy.""" + +_GCS_UPLOAD_DOWNLOAD_NUM_RETRIES = 10 +"""Number of retries to do when uploading / downloading.""" + + +class GCSStore(object): + """Google Cloud Storage store.""" + + __slots__ = [ + '__bucket_internal', + '__gc_auth_config', + '__bucket_config', + ] + + def __init__(self, bucket_config: AbstractGCBucketConfig, gc_auth_config: Optional[GCAuthConfig] = None): + if not bucket_config: + raise McConfigurationError("Bucket configuration is unset.") + + if not gc_auth_config: + gc_auth_config = GCAuthConfig() + + self.__gc_auth_config = gc_auth_config + self.__bucket_config = bucket_config + self.__bucket_internal = None 
+ + @property + def _bucket(self) -> Bucket: + """Lazy-loaded bucket.""" + if not self.__bucket_internal: + + try: + storage_client = storage.Client.from_service_account_json(self.__gc_auth_config.json_file()) + self.__bucket_internal = storage_client.get_bucket( + bucket_or_name=self.__bucket_config.bucket_name(), + retry=_GCS_API_RETRIES, + ) + except Exception as ex: + raise McConfigurationError(f"Unable to get GCS bucket '{self.__bucket_config.bucket_name()}': {ex}") + + return self.__bucket_internal + + @classmethod + def _remote_path(cls, path_prefix: str, object_id: str): + if not object_id: + raise McProgrammingError("Object ID is unset.") + + path = os.path.join(path_prefix, object_id) + + # GCS doesn't like double slashes... + path = os.path.normpath(path) + + # ...nor is a fan of slashes at the start of path + while path.startswith('/'): + path = path[1:] + + return path + + def _blob_from_object_id(self, object_id: str) -> Blob: + if not object_id: + raise McProgrammingError("Object ID is unset.") + + remote_path = self._remote_path(path_prefix=self.__bucket_config.path_prefix(), object_id=object_id) + blob = self._bucket.blob(remote_path) + return blob + + def object_exists(self, object_id: str) -> bool: + """ + Test if object exists at remote location. + + :param object_id: Object ID that should be tested. + :return: True if object already exists under a given object ID. 
+ """ + + if not object_id: + raise McProgrammingError("Object ID is unset.") + + log.debug(f"Testing if object ID {object_id} exists...") + + blob = self._blob_from_object_id(object_id=object_id) + + log.debug(f"Testing blob for existence: {blob}") + + try: + # blob.reload() returns metadata too + blob.reload(retry=_GCS_API_RETRIES) + + except NotFound as ex: + log.debug(f"Object '{object_id}' was not found: {ex}") + exists = False + + except Exception as ex: + raise McProgrammingError(f"Unable to test whether GCS object {object_id} exists: {ex}") + + else: + exists = True + + return exists + + def upload_object(self, local_file_path: str, object_id: str) -> None: + """ + Upload a local file to a GCS object. + + Will overwrite existing objects with a warning. + + :param local_file_path: Local file that should be stored. + :param object_id: Object ID under which the object should be stored. + """ + + if not os.path.isfile(local_file_path): + raise McProgrammingError(f"Local file '{local_file_path}' does not exist.") + + if not object_id: + raise McProgrammingError("Object ID is unset.") + + log.debug(f"Uploading '{local_file_path}' as object ID {object_id}...") + + if self.object_exists(object_id=object_id): + log.warning(f"Object {object_id} already exists, will overwrite.") + + blob = self._blob_from_object_id(object_id=object_id) + + try: + blob.upload_from_filename(filename=local_file_path, content_type='application/octet-stream') + except Exception as ex: + raise McTransientError(f"Unable to upload '{local_file_path}' as object ID {object_id}: {ex}") + + def download_object(self, object_id: str, local_file_path: str) -> None: + """ + Download a GCS object to a local file. + + :param object_id: Object ID of an object that should be downloaded. + :param local_file_path: Local file that the object should be stored to. 
+ """ + + if os.path.isfile(local_file_path): + raise McProgrammingError(f"Local file '{local_file_path}' already exists.") + + if not object_id: + raise McProgrammingError("Object ID is unset.") + + log.debug(f"Downloading object ID {object_id} to '{local_file_path}'...") + + if not self.object_exists(object_id=object_id): + raise McPermanentError(f"Object ID {object_id} was not found.") + + blob = self._blob_from_object_id(object_id=object_id) + + try: + blob.download_to_filename(filename=local_file_path) + except Exception as ex: + raise McTransientError(f"Unable to download object ID {object_id} to '{local_file_path}': {ex}") + + def delete_object(self, object_id: str) -> None: + """ + Delete object from remote location. + + Doesn't raise if object doesn't exist. + + Used mostly for running tests, e.g. to find out what happens if the object to be fetched doesn't exist anymore. + + :param object_id: Object ID that should be deleted. + """ + + if not object_id: + raise McProgrammingError("Object ID is unset.") + + log.debug(f"Deleting object ID {object_id}...") + + blob = self._blob_from_object_id(object_id=object_id) + + try: + blob.delete(retry=_GCS_API_RETRIES) + + except NotFound: + log.warning(f"Object {object_id} doesn't exist.") + + except Exception as ex: + raise McProgrammingError(f"Unable to delete GCS object {object_id}: {ex}") + + def object_uri(self, object_id: str) -> str: + """ + Generate Google Cloud Storage URI for the object. + + :param object_id: Object ID to return the URI for. + :return: Full Google Cloud Storage URI of the object, e.g. "gs:////". 
+ """ + + if not object_id: + raise McProgrammingError("Object ID is unset.") + + uri = "gs://{host}/{remote_path}".format( + host=self.__bucket_config.bucket_name(), + remote_path=self._remote_path(path_prefix=self.__bucket_config.path_prefix(), object_id=object_id), + ) + + return uri diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/media_info.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/media_info.py new file mode 100644 index 0000000000..b1bbaaf344 --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/media_info.py @@ -0,0 +1,190 @@ +import dataclasses +import math +import os +from typing import Type, Optional, List, Any, Dict + +# noinspection PyPackageRequirements +import ffmpeg + +from mediawords.util.log import create_logger +from mediawords.workflow.exceptions import McProgrammingError, McPermanentError + +from .audio_codecs import ( + AbstractAudioCodec, + Linear16AudioCodec, + FLACAudioCodec, + MULAWAudioCodec, + OggOpusAudioCodec, + MP3AudioCodec, +) + +log = create_logger(__name__) + +_SUPPORTED_CODEC_CLASSES = { + Linear16AudioCodec, + FLACAudioCodec, + MULAWAudioCodec, + OggOpusAudioCodec, + MP3AudioCodec, +} +"""Supported native audio codec classes.""" + +MediaFileInfoAudioStreamDict = Dict[str, Any] + + +@dataclasses.dataclass +class MediaFileInfoAudioStream(object): + """Information about a single audio stream in a media file.""" + + ffmpeg_stream_index: int + """FFmpeg internal stream index.""" + + audio_codec_class: Optional[Type[AbstractAudioCodec]] + """Audio codec class if the stream is one of the supported types and has single (mono) channel, None otherwise.""" + + duration: int + """Duration (in seconds).""" + + audio_channel_count: int + """Audio channel count.""" + + sample_rate: int + """Audio sample rate.""" + + def to_dict(self) -> MediaFileInfoAudioStreamDict: + return { + 'ffmpeg_stream_index': self.ffmpeg_stream_index, + 
'audio_codec_class': self.audio_codec_class.__name__ if self.audio_codec_class else None, + 'duration': self.duration, + 'audio_channel_count': self.audio_channel_count, + 'sample_rate': self.sample_rate, + } + + @classmethod + def from_dict(cls, input_dict: MediaFileInfoAudioStreamDict) -> 'MediaFileInfoAudioStream': + return cls( + ffmpeg_stream_index=input_dict['ffmpeg_stream_index'], + + # FIXME a bit lame to do it this way + audio_codec_class=globals()[input_dict['audio_codec_class']] if input_dict['audio_codec_class'] else None, + + duration=input_dict['duration'], + audio_channel_count=input_dict['audio_channel_count'], + sample_rate=input_dict['sample_rate'], + ) + + +@dataclasses.dataclass +class MediaFileInfo(object): + """Information about media file.""" + + audio_streams: List[MediaFileInfoAudioStream] + """List of audio streams found in the media file.""" + + has_video_streams: bool + """True if the media file has video streams.""" + + def best_supported_audio_stream(self) -> Optional[MediaFileInfoAudioStream]: + """Return the first supported audio stream, if any.""" + for stream in self.audio_streams: + if stream.audio_codec_class: + return stream + return None + + +def media_file_info(media_file_path: str) -> MediaFileInfo: + """ + Read audio / video media file information, or raise if it can't be read. + + :param media_file_path: Full path to media file. + :return: MediaFileInfo object. 
+ """ + if not os.path.isfile(media_file_path): + # Input file should exist at this point; it it doesn't, we have probably messed up something in the code + raise McProgrammingError(f"Input file {media_file_path} does not exist.") + + try: + file_info = ffmpeg.probe(media_file_path) + if not file_info: + raise Exception("Returned metadata is empty.") + except Exception as ex: + raise McPermanentError(f"Unable to read metadata from file {media_file_path}: {ex}") + + if 'streams' not in file_info: + # FFmpeg should come up with some sort of a stream in any case + raise McProgrammingError("Returned probe doesn't have 'streams' key.") + + # Test if one of the audio streams is of one of the supported codecs + audio_streams = [] + has_video_streams = False + for stream in file_info['streams']: + if stream['codec_type'] == 'audio': + + try: + audio_channel_count = int(stream['channels']) + if audio_channel_count == 0: + raise Exception("Audio channel count is 0") + except Exception as ex: + log.warning(f"Unable to read audio channel count from stream {stream}: {ex}") + # Just skip this stream if we can't figure it out + continue + + audio_codec_class = None + + # We'll need to transcode audio files with more than one channel count anyway + if audio_channel_count == 1: + for codec in _SUPPORTED_CODEC_CLASSES: + if codec.ffmpeg_stream_is_this_codec(ffmpeg_stream=stream): + audio_codec_class = codec + break + + try: + + if 'duration' in stream: + # 'duration': '3.766621' + duration = math.floor(float(stream['duration'])) + + elif 'DURATION' in stream.get('tags', {}): + # 'DURATION': '00:00:03.824000000' + duration_parts = stream['tags']['DURATION'].split(':') + if len(duration_parts) != 3: + raise McPermanentError(f"Unable to parse 'DURATION': {duration_parts}") + + hh = int(duration_parts[0]) + mm = int(duration_parts[1]) + ss_ms = duration_parts[2].split('.') + + if len(ss_ms) == 1: + ss = int(ss_ms[0]) + ms = 0 + elif len(ss_ms) == 2: + ss = int(ss_ms[0]) + ms = 
int(ss_ms[1]) + else: + raise McPermanentError(f"Unable to parse 'DURATION': {duration_parts}") + + duration = hh * 3600 + mm * 60 + ss + (1 if ms > 0 else 0) + + else: + raise McPermanentError(f"Stream doesn't have duration: {stream}") + + audio_stream = MediaFileInfoAudioStream( + ffmpeg_stream_index=stream['index'], + audio_codec_class=audio_codec_class, + duration=duration, + audio_channel_count=audio_channel_count, + sample_rate=int(stream['sample_rate']), + ) + audio_streams.append(audio_stream) + + except Exception as ex: + # Just skip this stream if we can't figure it out + log.warning(f"Unable to read audio stream data for stream {stream}: {ex}") + + elif stream['codec_type'] == 'video': + has_video_streams = True + + return MediaFileInfo( + audio_streams=audio_streams, + has_video_streams=has_video_streams, + ) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/speech_api.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/speech_api.py new file mode 100644 index 0000000000..b7c100677b --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/speech_api.py @@ -0,0 +1,195 @@ +from typing import Optional + +# noinspection PyPackageRequirements +from google.api_core.exceptions import InvalidArgument, NotFound +# noinspection PyPackageRequirements +from google.api_core.operation import from_gapic, Operation +# noinspection PyPackageRequirements +from google.api_core.retry import Retry +# noinspection PyPackageRequirements +from google.cloud.speech_v1p1beta1 import ( + SpeechClient, RecognitionConfig, RecognitionAudio, LongRunningRecognizeResponse, LongRunningRecognizeMetadata, +) + +from mediawords.util.log import create_logger +from mediawords.workflow.exceptions import McProgrammingError + +from .config import GCAuthConfig +from .transcript import Transcript, UtteranceAlternative, Utterance +from .media_info import MediaFileInfoAudioStream + +log = create_logger(__name__) 
+ +# Speech API sometimes throws: +# +# google.api_core.exceptions.ServiceUnavailable: 503 failed to connect to all addresses +# +# so let it retry for 10 minutes or so. +_GOOGLE_API_RETRIES = Retry(initial=5, maximum=60, multiplier=2, deadline=60 * 10) +"""Google Cloud API's own retry policy.""" + + +def submit_transcribe_operation(gs_uri: str, + episode_metadata: MediaFileInfoAudioStream, + bcp47_language_code: str, + gc_auth_config: Optional[GCAuthConfig] = None) -> str: + """ + Submit a Speech API long running operation to transcribe a podcast episode. + + :param gs_uri: Google Cloud Storage URI to a transcoded episode. + :param episode_metadata: Metadata derived from the episode while transcoding it. + :param bcp47_language_code: Episode's BCP 47 language code guessed from story's title + description. + :param gc_auth_config: Google Cloud authentication configuration instance. + :return Google Speech API operation ID by which the transcription operation can be referred to. + """ + + if not gc_auth_config: + gc_auth_config = GCAuthConfig() + + try: + client = SpeechClient.from_service_account_json(gc_auth_config.json_file()) + except Exception as ex: + raise McProgrammingError(f"Unable to create Speech API client: {ex}") + + try: + # noinspection PyTypeChecker + config = RecognitionConfig( + encoding=getattr(RecognitionConfig.AudioEncoding, episode_metadata.audio_codec_class.speech_api_codec()), + sample_rate_hertz=episode_metadata.sample_rate, + # We always set the channel count to 1 and disable separate recognition per channel as our inputs are all + # mono audio files and do not have separate speakers per audio channel. 
+ audio_channel_count=1, + enable_separate_recognition_per_channel=False, + language_code=bcp47_language_code, + alternative_language_codes=[], + speech_contexts=[ + # Speech API works pretty well without custom contexts + ], + # Don't care that much about word confidence + enable_word_confidence=False, + # Punctuation doesn't work that well but we still enable it here + enable_automatic_punctuation=True, + # Not setting 'model' as 'use_enhanced' will then choose the best model for us + # Using enhanced (more expensive) model, where available + use_enhanced=True, + ) + except Exception as ex: + raise McProgrammingError(f"Unable to initialize Speech API configuration: {ex}") + + log.info(f"Submitting a Speech API operation for URI {gs_uri}...") + + try: + + # noinspection PyTypeChecker + audio = RecognitionAudio(uri=gs_uri) + + speech_operation = client.long_running_recognize(config=config, audio=audio, retry=_GOOGLE_API_RETRIES) + + except Exception as ex: + # If client's own retry mechanism doesn't work, then it's probably a programming error, e.g. outdated API client + raise McProgrammingError(f"Unable to submit a Speech API operation: {ex}") + + try: + # We get the operation name in a try-except block because accessing it is not that well documented, so Google + # might change the property names whenever they please and we wouldn't necessarily notice otherwise + operation_id = speech_operation.operation.name + if not operation_id: + raise McProgrammingError(f"Operation name is empty.") + except Exception as ex: + raise McProgrammingError(f"Unable to get operation name: {ex}") + + log.info(f"Submitted Speech API operation for URI {gs_uri}") + + return operation_id + + +def fetch_transcript(speech_operation_id: str, gc_auth_config: Optional[GCAuthConfig] = None) -> Optional[Transcript]: + """ + Try to fetch a transcript for a given speech operation ID. + + :param speech_operation_id: Speech operation ID. 
+ :param gc_auth_config: Google Cloud authentication configuration instance. + :return: Transcript, or None if the transcript hasn't been prepared yet. + """ + if not speech_operation_id: + raise McProgrammingError(f"Speech operation ID is unset.") + + if not gc_auth_config: + gc_auth_config = GCAuthConfig() + + try: + client = SpeechClient.from_service_account_json(gc_auth_config.json_file()) + except Exception as ex: + raise McProgrammingError(f"Unable to initialize Speech API operations client: {ex}") + + try: + operation = client.transport.operations_client.get_operation( + name=speech_operation_id, + retry=_GOOGLE_API_RETRIES, + ) + except InvalidArgument as ex: + raise McProgrammingError(f"Invalid operation ID '{speech_operation_id}': {ex}") + except NotFound as ex: + raise McProgrammingError(f"Operation ID '{speech_operation_id}' was not found: {ex}") + except Exception as ex: + # On any other errors, raise a hard exception + raise McProgrammingError(f"Error while fetching operation ID '{speech_operation_id}': {ex}") + + if not operation: + raise McProgrammingError(f"Operation is unset.") + + try: + gapic_operation: Operation = from_gapic( + operation=operation, + operations_client=client.transport.operations_client, + result_type=LongRunningRecognizeResponse, + metadata_type=LongRunningRecognizeMetadata, + retry=_GOOGLE_API_RETRIES, + ) + except Exception as ex: + raise McProgrammingError(f"Unable to create GAPIC operation: {ex}") + + log.debug(f"GAPIC operation: {gapic_operation}") + log.debug(f"Operation metadata: {gapic_operation.metadata}") + log.debug(f"Operation is done: {gapic_operation.done()}") + log.debug(f"Operation error: {gapic_operation.done()}") + + try: + operation_is_done = gapic_operation.done(retry=_GOOGLE_API_RETRIES) + except Exception as ex: + # 'done' attribute might be gone in a newer version of the Speech API client + raise McProgrammingError( + f"Unable to test whether operation '{speech_operation_id}' is done: {ex}" + ) + + if not 
operation_is_done: + log.info(f"Operation '{speech_operation_id}' is still not done.") + return None + + utterances = [] + + try: + for result in gapic_operation.result(retry=_GOOGLE_API_RETRIES).results: + + alternatives = [] + for alternative in result.alternatives: + alternatives.append( + UtteranceAlternative( + text=alternative.transcript.strip(), + confidence=alternative.confidence, + ) + ) + + utterances.append( + Utterance( + alternatives=alternatives, + bcp47_language_code=result.language_code, + ) + ) + + except Exception as ex: + raise McProgrammingError( + f"Unable to read transcript for operation '{speech_operation_id}' due to other error: {ex}" + ) + + return Transcript(utterances=utterances) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/transcode.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/transcode.py new file mode 100644 index 0000000000..627430e2e0 --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/transcode.py @@ -0,0 +1,92 @@ +import subprocess +import os + +from mediawords.util.log import create_logger +from mediawords.workflow.exceptions import McProgrammingError, McPermanentError + +from .media_info import media_file_info + +log = create_logger(__name__) + + +def maybe_transcode_file(input_file: str, maybe_output_file: str) -> bool: + """ + Transcode file (if needed) to something that Speech API will support. + + * If input has a video stream, it will be discarded; + * If input has more than one audio stream, others will be discarded leaving only one (preferably the one that Speech + API can support); + * If input doesn't have an audio stream in Speech API-supported codec, it will be transcoded to lossless + FLAC 16 bit in order to preserve quality; + * If the chosen audio stream has multiple channels (e.g. 
stereo or 5.1), it will be mixed into a single (mono) + channel as Speech API supports multi-channel recognition only when different voices speak into each of the + channels. + + :param input_file: Input media file to consider for transcoding. + :param maybe_output_file: If we decide to transcode, output media file to transcode to. + :return: True if file had to be transcoded into "maybe_output_file", or False if input file can be used as it is. + """ + + if not os.path.isfile(input_file): + raise McProgrammingError(f"File '{input_file}' does not exist.") + + # Independently from what has told us, identify the file type again ourselves + media_info = media_file_info(media_file_path=input_file) + + if not media_info.audio_streams: + raise McPermanentError("Downloaded file doesn't appear to have any audio streams.") + + ffmpeg_args = [] + + supported_audio_stream = media_info.best_supported_audio_stream() + if supported_audio_stream: + log.info(f"Found a supported audio stream") + + # Test if there is more than one audio stream + if len(media_info.audio_streams) > 1: + log.info(f"Found other audio streams besides the supported one, will discard those") + + ffmpeg_args.extend(['-f', supported_audio_stream.audio_codec_class.ffmpeg_container_format()]) + + # Select all audio streams + ffmpeg_args.extend(['-map', '0:a']) + + for stream in media_info.audio_streams: + # Deselect the unsupported streams + if stream != supported_audio_stream: + ffmpeg_args.extend(['-map', f'-0:a:{stream.ffmpeg_stream_index}']) + + # If a stream of a supported codec was not found, transcode it to FLAC 16 bit in order to not lose any quality + else: + log.info(f"None of the audio streams are supported by the Speech API, will transcode to FLAC") + + # Map first audio stream to input 0 + ffmpeg_args.extend(['-map', '0:a:0']) + + # Transcode to FLAC (16 bit) in order to not lose any quality + ffmpeg_args.extend(['-acodec', 'flac']) + ffmpeg_args.extend(['-f', 'flac']) + 
ffmpeg_args.extend(['-sample_fmt', 's16']) + + # Ensure that we end up with mono audio + ffmpeg_args.extend(['-ac', '1']) + + # If there's video in the file (e.g. video), remove it + if media_info.has_video_streams: + # Discard all video streams + ffmpeg_args.extend(['-map', '-0:v']) + + if not ffmpeg_args: + # No need to transcode -- caller should use the input file as-is + return False + + log.info(f"Transcoding '{input_file}' to '{maybe_output_file}'...") + + # I wasn't sure how to map outputs in "ffmpeg-python" library so here we call ffmpeg directly + ffmpeg_command = ['ffmpeg', '-nostdin', '-hide_banner', '-i', input_file] + ffmpeg_args + [maybe_output_file] + log.debug(f"FFmpeg command: {ffmpeg_command}") + subprocess.check_call(ffmpeg_command) + + log.info(f"Done transcoding '{input_file}' to '{maybe_output_file}'") + + return True diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/transcript.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/transcript.py new file mode 100644 index 0000000000..a497ee3bcd --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/transcript.py @@ -0,0 +1,75 @@ +import abc +import dataclasses +from typing import List, Dict, Any + + +class _AbstractFromDict(object, metaclass=abc.ABCMeta): + + @classmethod + @abc.abstractmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> '_AbstractFromDict': + raise NotImplementedError + + +@dataclasses.dataclass +class UtteranceAlternative(object): + """One of the alternatives of what might have been said in an utterance.""" + + text: str + """Utterance text.""" + + confidence: float + """How confident Speech API is that it got it right.""" + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> 'UtteranceAlternative': + return cls( + text=input_dict['text'], + confidence=input_dict['confidence'], + ) + + +@dataclasses.dataclass +class Utterance(object): + """A single transcribed 
utterance (often but not always a single sentence).""" + + alternatives: List[UtteranceAlternative] + """Alternatives of what might have been said in an utterance, ordered from the best to the worst guess.""" + + bcp47_language_code: str + """BCP 47 language code; might be different from what we've passed as the input.""" + + @property + def best_alternative(self) -> UtteranceAlternative: + """Return best alternative for what might have been said in an utterance.""" + return self.alternatives[0] + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> 'Utterance': + return cls( + alternatives=[UtteranceAlternative.from_dict(x) for x in input_dict['alternatives']], + bcp47_language_code=input_dict['bcp47_language_code'], + ) + + +@dataclasses.dataclass +class Transcript(object): + """A single transcript.""" + + utterances: List[Utterance] + """List of ordered utterances in a transcript.""" + + # Only Transcript is to be serialized to JSON so to_dict() is implemented only here + def to_dict(self) -> Dict[str, Any]: + return dataclasses.asdict(self) + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> 'Transcript': + return cls(utterances=[Utterance.from_dict(x) for x in input_dict['utterances']]) + + def download_text_from_transcript(self) -> str: + best_utterance_alternatives = [] + for utterance in self.utterances: + best_utterance_alternatives.append(utterance.best_alternative.text) + text = "\n\n".join(best_utterance_alternatives) + return text diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py new file mode 100644 index 0000000000..9788b0da60 --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py @@ -0,0 +1,322 @@ +import os +import tempfile +from typing import Optional + +# noinspection PyPackageRequirements +from temporal.workflow import Workflow + +from 
mediawords.db import connect_to_db_or_raise +from mediawords.dbi.downloads.store import store_content +from mediawords.job import JobBroker +from mediawords.util.parse_json import encode_json, decode_json +from mediawords.util.config.common import RabbitMQConfig +from mediawords.util.identify_language import identification_would_be_reliable, language_code_for_text +from mediawords.util.log import create_logger +from mediawords.util.parse_html import html_strip +from mediawords.util.url import get_url_host +from mediawords.workflow.exceptions import McProgrammingError, McTransientError, McPermanentError + +from .config import PodcastTranscribeEpisodeConfig +from .enclosure import viable_story_enclosure, StoryEnclosure, StoryEnclosureDict +from .fetch_url import fetch_big_file +from .gcs_store import GCSStore +from .bcp47_lang import iso_639_1_code_to_bcp_47_identifier +from .media_info import MediaFileInfoAudioStream, media_file_info, MediaFileInfoAudioStreamDict +from .speech_api import submit_transcribe_operation, fetch_transcript +from .transcode import maybe_transcode_file +from .transcript import Transcript +from .workflow_interface import PodcastTranscribeWorkflow, PodcastTranscribeActivities + +log = create_logger(__name__) + + +class PodcastTranscribeActivitiesImpl(PodcastTranscribeActivities): + """Activities implementation.""" + + async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]: + log.info(f"Identifying story language for story {stories_id}...") + + db = connect_to_db_or_raise() + + story = db.find_by_id(table='stories', object_id=stories_id) + if not story: + raise McPermanentError(f"Story {stories_id} was not found.") + + # Podcast episodes typically come with title and description set so try guessing from that + story_title = story['title'] + story_description = html_strip(story['description']) + sample_text = f"{story_title}\n{story_description}" + + bcp_47_language_code = None + if 
identification_would_be_reliable(text=sample_text): + iso_639_1_language_code = language_code_for_text(text=sample_text) + + # Convert to BCP 47 identifier + bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier( + iso_639_1_code=iso_639_1_language_code, + url_hint=story['url'], + ) + + log.info(f"Language code for story {stories_id} is {bcp_47_language_code}") + + return bcp_47_language_code + + async def determine_best_enclosure(self, stories_id: int) -> Optional[StoryEnclosureDict]: + + log.info(f"Determining best enclosure for story {stories_id}...") + + db = connect_to_db_or_raise() + + # Find the enclosure that might work the best + best_enclosure = viable_story_enclosure(db=db, stories_id=stories_id) + if not best_enclosure: + raise McPermanentError(f"There were no viable enclosures found for story {stories_id}") + + if best_enclosure.length: + if best_enclosure.length > self.config.max_enclosure_size(): + raise McPermanentError(f"Chosen enclosure {best_enclosure} is too big.") + + log.info(f"Done determining best enclosure for story {stories_id}") + log.debug(f"Best enclosure for story {stories_id}: {best_enclosure}") + + return best_enclosure.to_dict() + + async def fetch_enclosure_to_gcs(self, stories_id: int, enclosure: StoryEnclosureDict) -> None: + + log.info(f"Fetching enclosure to GCS for story {stories_id}") + log.debug(f"Best enclosure for story {stories_id}: {enclosure}") + + enclosure = StoryEnclosure.from_dict(enclosure) + + with tempfile.TemporaryDirectory(prefix='fetch_enclosure_to_gcs') as temp_dir: + raw_enclosure_path = os.path.join(temp_dir, 'raw_enclosure') + fetch_big_file(url=enclosure.url, dest_file=raw_enclosure_path, max_size=self.config.max_enclosure_size()) + + if os.stat(raw_enclosure_path).st_size == 0: + # Might happen with misconfigured webservers + raise McPermanentError(f"Fetched file {raw_enclosure_path} is empty.") + + gcs = GCSStore(bucket_config=self.config.raw_enclosures()) + 
gcs.upload_object(local_file_path=raw_enclosure_path, object_id=str(stories_id)) + + log.info(f"Done fetching enclosure to GCS for story {stories_id}") + + async def fetch_transcode_store_episode(self, stories_id: int) -> MediaFileInfoAudioStreamDict: + + log.info(f"Fetching, transcoding, storing episode for story {stories_id}...") + + with tempfile.TemporaryDirectory(prefix='fetch_transcode_store_episode') as temp_dir: + raw_enclosure_path = os.path.join(temp_dir, 'raw_enclosure') + + gcs_raw_enclosures = GCSStore(bucket_config=self.config.raw_enclosures()) + gcs_raw_enclosures.download_object( + object_id=str(stories_id), + local_file_path=raw_enclosure_path, + ) + del gcs_raw_enclosures + + if os.stat(raw_enclosure_path).st_size == 0: + # If somehow the file from GCS ended up being of zero length, then this is very much unexpected + raise McProgrammingError(f"Fetched file {raw_enclosure_path} is empty.") + + transcoded_episode_path = os.path.join(temp_dir, 'transcoded_episode') + + raw_enclosure_transcoded = maybe_transcode_file( + input_file=raw_enclosure_path, + maybe_output_file=transcoded_episode_path, + ) + if not raw_enclosure_transcoded: + transcoded_episode_path = raw_enclosure_path + + del raw_enclosure_path + + gcs_transcoded_episodes = GCSStore(bucket_config=self.config.transcoded_episodes()) + gcs_transcoded_episodes.upload_object(local_file_path=transcoded_episode_path, object_id=str(stories_id)) + + # (Re)read the properties of either the original or the transcoded file + media_info = media_file_info(media_file_path=transcoded_episode_path) + best_audio_stream = media_info.best_supported_audio_stream() + + if not best_audio_stream.audio_codec_class: + raise McProgrammingError("Best audio stream doesn't have audio class set") + + log.info(f"Done fetching, transcoding, storing episode for story {stories_id}") + log.debug(f"Best audio stream for story {stories_id}: {best_audio_stream}") + + return best_audio_stream.to_dict() + + async def 
submit_transcribe_operation(self, + stories_id: int, + episode_metadata: MediaFileInfoAudioStreamDict, + bcp47_language_code: str) -> str: + + log.info(f"Submitting transcribe operation for story {stories_id}...") + log.debug(f"Episode metadata for story {stories_id}: {episode_metadata}") + log.debug(f"Language code for story {stories_id}: {bcp47_language_code}") + + episode_metadata = MediaFileInfoAudioStream.from_dict(episode_metadata) + + if not episode_metadata.audio_codec_class: + raise McProgrammingError("Best audio stream doesn't have audio class set") + + gcs_transcoded_episodes = GCSStore(bucket_config=self.config.transcoded_episodes()) + gs_uri = gcs_transcoded_episodes.object_uri(object_id=str(stories_id)) + + speech_operation_id = submit_transcribe_operation( + gs_uri=gs_uri, + episode_metadata=episode_metadata, + bcp47_language_code=bcp47_language_code, + gc_auth_config=self.config.gc_auth(), + ) + + log.info(f"Done submitting transcribe operation for story {stories_id}") + log.debug(f"Speech operation ID for story {stories_id}: {speech_operation_id}") + + return speech_operation_id + + async def fetch_store_raw_transcript_json(self, stories_id: int, speech_operation_id: str) -> None: + + log.info(f"Fetching and storing raw transcript JSON for story {stories_id}...") + log.debug(f"Speech operation ID: {speech_operation_id}") + + transcript = fetch_transcript(speech_operation_id=speech_operation_id, gc_auth_config=self.config.gc_auth()) + if transcript is None: + raise McTransientError(f"Speech operation with ID '{speech_operation_id}' hasn't been completed yet.") + + transcript_json = encode_json(transcript.to_dict()) + + with tempfile.TemporaryDirectory(prefix='fetch_store_raw_transcript_json') as temp_dir: + transcript_json_path = os.path.join(temp_dir, 'transcript.json') + + with open(transcript_json_path, 'w') as f: + f.write(transcript_json) + + gcs = GCSStore(bucket_config=self.config.transcripts()) + 
gcs.upload_object(local_file_path=transcript_json_path, object_id=str(stories_id)) + + log.info(f"Done fetching and storing raw transcript JSON for story {stories_id}") + + async def fetch_store_transcript(self, stories_id: int) -> None: + + log.info(f"Fetching and storing transcript for story {stories_id}...") + + with tempfile.TemporaryDirectory(prefix='fetch_store_transcript') as temp_dir: + transcript_json_path = os.path.join(temp_dir, 'transcript.json') + + gcs = GCSStore(bucket_config=self.config.transcripts()) + gcs.download_object(object_id=str(stories_id), local_file_path=transcript_json_path) + + with open(transcript_json_path, 'r') as f: + transcript_json = f.read() + + transcript = Transcript.from_dict(decode_json(transcript_json)) + + db = connect_to_db_or_raise() + + story = db.find_by_id(table='stories', object_id=stories_id) + + feed = db.query(""" + SELECT * + FROM feeds + WHERE feeds_id = ( + SELECT feeds_id + FROM feeds_stories_map + WHERE stories_id = %(stories_id)s + ) + """, { + 'stories_id': stories_id, + }).hash() + + # Just like create_download_for_new_story(), it creates a new download except that it tests if such a download + # exists first + download = db.find_or_create( + table='downloads', + insert_hash={ + 'feeds_id': feed['feeds_id'], + 'stories_id': story['stories_id'], + 'url': story['url'], + 'host': get_url_host(story['url']), + 'type': 'content', + 'sequence': 1, + 'state': 'success', + 'path': 'content:pending', + 'priority': 1, + 'extracted': 'f' + }, + ) + + text = transcript.download_text_from_transcript() + + # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later + store_content(db=db, download=download, content=text) + + log.info(f"Done fetching and storing transcript for story {stories_id}") + + async def add_to_extraction_queue(self, stories_id: int) -> None: + + log.info(f"Adding an extraction job for story {stories_id}...") + + job_broker = JobBroker( + 
queue_name='MediaWords::Job::ExtractAndVector', + rabbitmq_config=RabbitMQConfig( + + # Keep RabbitMQ's timeout smaller than the action's "start_to_close_timeout" + timeout=60, + + # Disable retries as Temporal will be the one that does all the retrying + retries=None, + ), + ) + + # add_to_queue() is not idempotent but it's not a big deal to extract a single story twice + job_broker.add_to_queue(stories_id=stories_id) + + log.info(f"Done adding an extraction job for story {stories_id}") + + +class PodcastTranscribeWorkflowImpl(PodcastTranscribeWorkflow): + """Workflow implementation.""" + + def __init__(self): + self.activities: PodcastTranscribeActivities = Workflow.new_activity_stub( + activities_cls=PodcastTranscribeActivities, + # No retry_parameters here as they get set individually in @activity_method() + ) + + async def transcribe_episode(self, stories_id: int) -> None: + + bcp47_language_code = await self.activities.identify_story_bcp47_language_code(stories_id) + if bcp47_language_code is None: + # Default to English in case there wasn't enough sizable text in title / description to make a good guess + bcp47_language_code = 'en' + + enclosure = await self.activities.determine_best_enclosure(stories_id) + if not enclosure: + raise McPermanentError(f"No viable enclosure found for story {stories_id}") + + await self.activities.fetch_enclosure_to_gcs(stories_id, enclosure) + + episode_metadata_dict = await self.activities.fetch_transcode_store_episode(stories_id) + + episode_metadata = MediaFileInfoAudioStream.from_dict(episode_metadata_dict) + + max_duration = PodcastTranscribeEpisodeConfig().max_duration() + if episode_metadata.duration > max_duration: + raise McPermanentError( + f"Episode's duration ({episode_metadata.duration} s) exceeds max. 
duration ({max_duration} s)" + ) + + speech_operation_id = await self.activities.submit_transcribe_operation( + stories_id, + episode_metadata_dict, + bcp47_language_code, + ) + + # Wait for Google Speech API to finish up transcribing + await Workflow.sleep(int(episode_metadata.duration * 1.1)) + + await self.activities.fetch_store_raw_transcript_json(stories_id, speech_operation_id) + + await self.activities.fetch_store_transcript(stories_id) + + await self.activities.add_to_extraction_queue(stories_id) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow_interface.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow_interface.py new file mode 100644 index 0000000000..b44ab9694a --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow_interface.py @@ -0,0 +1,220 @@ +import dataclasses +from datetime import timedelta +from typing import Optional + +# noinspection PyPackageRequirements +from temporal.activity_method import activity_method, RetryParameters +# noinspection PyPackageRequirements +from temporal.workflow import workflow_method + +from mediawords.workflow.exceptions import McPermanentError + +from .config import PodcastTranscribeEpisodeConfig +from .enclosure import StoryEnclosureDict +from .media_info import MediaFileInfoAudioStreamDict + +TASK_QUEUE = "podcast-transcribe-episode" +"""Temporal task queue.""" + +DEFAULT_RETRY_PARAMETERS = RetryParameters( + initial_interval=timedelta(seconds=1), + backoff_coefficient=2, + maximum_interval=timedelta(hours=2), + maximum_attempts=1000, + non_retryable_error_types=[ + McPermanentError.__name__, + ], +) + + +class PodcastTranscribeActivities(object): + + @classmethod + def _create_config(cls) -> PodcastTranscribeEpisodeConfig: + """ + Create and return configuration instance to be used for running the workflow. 
+ + Might get overridden in case some configuration changes have to be made while running the tests. + """ + return PodcastTranscribeEpisodeConfig() + + def __init__(self): + super().__init__() + self.config = self._create_config() + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + retry_parameters=DEFAULT_RETRY_PARAMETERS, + ) + async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]: + """ + Guess BCP 47 language code of a story. + + https://cloud.google.com/speech-to-text/docs/languages + + :param stories_id: Story to guess the language code for. + :return: BCP 47 language code (e.g. 'en-US') or None if the language code could not be determined. + """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + retry_parameters=DEFAULT_RETRY_PARAMETERS, + ) + async def determine_best_enclosure(self, stories_id: int) -> Optional[StoryEnclosureDict]: + """ + Fetch a list of story enclosures, determine which one looks like a podcast episode the most. + + Uses or similar tag. + + :param stories_id: Story to fetch the enclosures for. + :return: Best enclosure metadata object (as dict), or None if no best enclosure could be determined. 
+ """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + # With a super-slow server, it's probably reasonable to expect that it might take a few hours to fetch a single + # episode + start_to_close_timeout=timedelta(hours=2), + retry_parameters=dataclasses.replace( + DEFAULT_RETRY_PARAMETERS, + + # Wait for a minute before trying again + initial_interval=timedelta(minutes=1), + + # Hope for the server to resurrect in a week + maximum_interval=timedelta(weeks=1), + + # Don't kill ourselves trying to hit a permanently dead server + maximum_attempts=50, + ), + ) + async def fetch_enclosure_to_gcs(self, stories_id: int, enclosure: StoryEnclosureDict) -> None: + """ + Fetch enclosure and store it to GCS as an episode. + + Doesn't do transcoding or anything because transcoding or any subsequent steps might fail, and if they do, we + want to have the raw episode fetched and safely stored somewhere. + + :param stories_id: Story to fetch the enclosure for. + :param enclosure: Enclosure to fetch (as dict). + """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + + # Let's expect super long episodes or super slow servers + start_to_close_timeout=timedelta(hours=2), + + retry_parameters=dataclasses.replace( + DEFAULT_RETRY_PARAMETERS, + + # Wait for a minute before trying again (GCS might be down) + initial_interval=timedelta(minutes=1), + + # Hope for GCS to resurrect in a day + maximum_interval=timedelta(days=1), + + # Limit attempts because transcoding itself might be broken, and we don't want to be fetching huge objects + # from GCS periodically + maximum_attempts=20, + ), + ) + async def fetch_transcode_store_episode(self, stories_id: int) -> MediaFileInfoAudioStreamDict: + """ + Fetch episode from GCS, transcode it if needed and store it to GCS again in a separate bucket. + + Now that the raw episode file is safely located in GCS, we can try transcoding it. 
+ + :param stories_id: Story ID the episode of which should be transcoded. + :return: Metadata of the best audio stream determined as part of the transcoding (as dict). + """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + + # Give a bit more time as the implementation is likely to do some non-Temporal retries on weird Speech API + # errors + start_to_close_timeout=timedelta(minutes=5), + + retry_parameters=dataclasses.replace( + DEFAULT_RETRY_PARAMETERS, + + # Given that the thing is costly, wait a whole hour before retrying anything + initial_interval=timedelta(hours=1), + + # Hope for the Speech API to resurrect in a week + maximum_interval=timedelta(weeks=1), + + # Don't retry too much as each try is potentially very costly + maximum_attempts=10, + ), + ) + async def submit_transcribe_operation(self, + stories_id: int, + episode_metadata: MediaFileInfoAudioStreamDict, + bcp47_language_code: str) -> str: + """ + Submit a long-running transcription operation to the Speech API. + + :param stories_id: Story ID of the episode which should be submitted for transcribing. + :param episode_metadata: Metadata of transcoded episode (as dict). + :param bcp47_language_code: BCP 47 language code of the story. + :return: Speech API operation ID for the transcription operation. + """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + retry_parameters=DEFAULT_RETRY_PARAMETERS, + ) + async def fetch_store_raw_transcript_json(self, stories_id: int, speech_operation_id: str) -> None: + """ + Fetch a finished transcription and store the raw JSON of it into a GCS bucket. + + Raises an exception if the transcription operation is not finished yet. + + :param stories_id: Story ID the episode of which should be submitted for transcribing. + :param speech_operation_id: Speech API operation ID. 
+ """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + retry_parameters=DEFAULT_RETRY_PARAMETERS, + ) + async def fetch_store_transcript(self, stories_id: int) -> None: + """ + Fetch a raw JSON transcript from a GCS bucket, store it to "download_texts". + + :param stories_id: Story ID the transcript of which should be stored into the database. + """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(minutes=2), + retry_parameters=DEFAULT_RETRY_PARAMETERS, + ) + async def add_to_extraction_queue(self, stories_id: int) -> None: + """ + Add a story to the extraction queue. + + :param stories_id: Story ID to be added to the extraction queue. + """ + raise NotImplementedError + + +class PodcastTranscribeWorkflow(object): + """Workflow interface.""" + + @workflow_method(task_queue=TASK_QUEUE) + async def transcribe_episode(self, stories_id: int) -> None: + raise NotImplementedError diff --git a/apps/podcast-transcribe-episode/src/requirements.txt b/apps/podcast-transcribe-episode/src/requirements.txt new file mode 100644 index 0000000000..cdd9a7b8bd --- /dev/null +++ b/apps/podcast-transcribe-episode/src/requirements.txt @@ -0,0 +1,3 @@ +ffmpeg-python==0.2.0 +google-cloud-speech==2.3.0 +google-cloud-storage==1.38.0 diff --git a/apps/podcast-fetch-episode/tests/data/media-samples b/apps/podcast-transcribe-episode/tests/data/media-samples similarity index 100% rename from apps/podcast-fetch-episode/tests/data/media-samples rename to apps/podcast-transcribe-episode/tests/data/media-samples diff --git a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/__init__.py b/apps/podcast-transcribe-episode/tests/python/__init__.py similarity index 100% rename from apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/__init__.py rename to apps/podcast-transcribe-episode/tests/python/__init__.py diff --git 
a/apps/podcast-transcribe-episode/tests/python/random_gcs_prefix.py b/apps/podcast-transcribe-episode/tests/python/random_gcs_prefix.py new file mode 100644 index 0000000000..1ffc4e59c9 --- /dev/null +++ b/apps/podcast-transcribe-episode/tests/python/random_gcs_prefix.py @@ -0,0 +1,16 @@ +import datetime + +from mediawords.util.text import random_string + + +def random_gcs_path_prefix() -> str: + """ + Generates a random path prefix to store the objects at. + + Makes it easier to debug what gets written to GCS and get rid of said objects afterwards. + """ + + date = datetime.datetime.utcnow().isoformat() + date = date.replace(':', '_') + prefix = f'tests-{date}-{random_string(length=32)}' + return prefix diff --git a/apps/podcast-fetch-episode/tests/python/test_bcp47_lang.py b/apps/podcast-transcribe-episode/tests/python/test_bcp47_lang.py similarity index 85% rename from apps/podcast-fetch-episode/tests/python/test_bcp47_lang.py rename to apps/podcast-transcribe-episode/tests/python/test_bcp47_lang.py index ce52f78140..a1f9604162 100644 --- a/apps/podcast-fetch-episode/tests/python/test_bcp47_lang.py +++ b/apps/podcast-transcribe-episode/tests/python/test_bcp47_lang.py @@ -1,5 +1,5 @@ # noinspection PyProtectedMember -from podcast_fetch_episode.bcp47_lang import _country_tld_from_url, iso_639_1_code_to_bcp_47_identifier +from podcast_transcribe_episode.bcp47_lang import _country_tld_from_url, iso_639_1_code_to_bcp_47_identifier def test_country_tld_from_url(): diff --git a/apps/podcast-fetch-episode/tests/python/test_enclosure.py b/apps/podcast-transcribe-episode/tests/python/test_enclosure.py similarity index 94% rename from apps/podcast-fetch-episode/tests/python/test_enclosure.py rename to apps/podcast-transcribe-episode/tests/python/test_enclosure.py index 76813bd367..5d3d1be33b 100644 --- a/apps/podcast-fetch-episode/tests/python/test_enclosure.py +++ b/apps/podcast-transcribe-episode/tests/python/test_enclosure.py @@ -9,7 +9,7 @@ create_test_story, ) -from 
podcast_fetch_episode.enclosure import podcast_viable_enclosure_for_story, StoryEnclosure +from podcast_transcribe_episode.enclosure import viable_story_enclosure, StoryEnclosure @dataclasses.dataclass @@ -43,7 +43,7 @@ def test_no_enclosures(self): ) ) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( db=self._DB, stories_id=no_enclosures.stories_id, ) is None, "Story with no enclosures." @@ -66,7 +66,7 @@ def test_enclosure_with_empty_url(self): }) ) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( db=self._DB, stories_id=enclosure_with_empty_url.stories_id, ) is None, "Story with an empty enclosure URL." @@ -89,7 +89,7 @@ def test_single_mp3_enclosure(self): }) ) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( db=self._DB, stories_id=single_mp3_enclosure.stories_id, ) == StoryEnclosure.from_db_row(single_mp3_enclosure.enclosures[0]), ( @@ -114,7 +114,7 @@ def test_single_mp3_without_mime_enclosure(self): }) ) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( db=self._DB, stories_id=single_mp3_without_mime_enclosure.stories_id, ) == StoryEnclosure.from_db_row(single_mp3_without_mime_enclosure.enclosures[0]), ( @@ -145,7 +145,7 @@ def test_multiple_audio_enclosures(self): }), ]) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( db=self._DB, stories_id=multiple_audio_enclosures.stories_id, ) == StoryEnclosure.from_db_row(multiple_audio_enclosures.enclosures[1]), ( @@ -176,7 +176,7 @@ def test_multiple_unsupported_audio_enclosures(self): }), ]) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( db=self._DB, stories_id=multiple_unsupported_audio_enclosures.stories_id, ) == StoryEnclosure.from_db_row(multiple_unsupported_audio_enclosures.enclosures[0]), ( @@ -207,7 +207,7 @@ def test_audio_and_video_enclosures(self): }), ]) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( 
db=self._DB, stories_id=audio_and_video_enclosures.stories_id, ) == StoryEnclosure.from_db_row(audio_and_video_enclosures.enclosures[1]), ( @@ -238,7 +238,7 @@ def test_only_video_enclosures(self): }), ]) - assert podcast_viable_enclosure_for_story( + assert viable_story_enclosure( db=self._DB, stories_id=only_video_enclosures.stories_id, ) == StoryEnclosure.from_db_row(only_video_enclosures.enclosures[0]), ( diff --git a/apps/podcast-fetch-episode/tests/python/test_fetch_url.py b/apps/podcast-transcribe-episode/tests/python/test_fetch_url.py similarity index 92% rename from apps/podcast-fetch-episode/tests/python/test_fetch_url.py rename to apps/podcast-transcribe-episode/tests/python/test_fetch_url.py index 5b546a4c7f..7106a3882c 100644 --- a/apps/podcast-fetch-episode/tests/python/test_fetch_url.py +++ b/apps/podcast-transcribe-episode/tests/python/test_fetch_url.py @@ -4,13 +4,14 @@ from typing import Union from unittest import TestCase +# noinspection PyPackageRequirements import pytest from mediawords.test.hash_server import HashServer from mediawords.util.network import random_unused_port +from mediawords.workflow.exceptions import McPermanentError -from podcast_fetch_episode.exceptions import McPodcastFileFetchFailureException -from podcast_fetch_episode.fetch_url import fetch_big_file +from podcast_transcribe_episode.fetch_url import fetch_big_file class TestFetchBigFile(TestCase): @@ -74,6 +75,6 @@ def test_max_size(self): max_size = len(self.__mock_data) - 1000 # Function should refuse to fetch more than {max_size} bytes - with pytest.raises(McPodcastFileFetchFailureException): + with pytest.raises(McPermanentError): fetch_big_file(url=self.__url, dest_file=self.__dest_file, max_size=max_size) assert not os.path.isfile(self.__dest_file), f"File '{self.__dest_file}' should exist after a failed download." 
diff --git a/apps/podcast-fetch-episode/tests/python/test_gcs_store.py b/apps/podcast-transcribe-episode/tests/python/test_gcs_store.py similarity index 52% rename from apps/podcast-fetch-episode/tests/python/test_gcs_store.py rename to apps/podcast-transcribe-episode/tests/python/test_gcs_store.py index 71aa095811..4f732da6dc 100644 --- a/apps/podcast-fetch-episode/tests/python/test_gcs_store.py +++ b/apps/podcast-transcribe-episode/tests/python/test_gcs_store.py @@ -1,23 +1,31 @@ +import filecmp import os import tempfile from unittest import TestCase +# noinspection PyPackageRequirements import pytest -from podcast_fetch_episode.config import PodcastFetchEpisodeConfig -from podcast_fetch_episode.exceptions import McPodcastMisconfiguredGCSException +from mediawords.workflow.exceptions import McProgrammingError, McPermanentError -from podcast_fetch_episode.gcs_store import GCSStore +from podcast_transcribe_episode.config import RawEnclosuresGCBucketConfig +from podcast_transcribe_episode.gcs_store import GCSStore -from .config_random_gcs_prefix import RandomPathPrefixConfig +from .random_gcs_prefix import random_gcs_path_prefix + + +class _RandomPrefixBucketConfig(RawEnclosuresGCBucketConfig): + """Bucket with random path prefix.""" + + def __init__(self): + super().__init__(path_prefix=random_gcs_path_prefix()) class TestGCSStore(TestCase): def test_remote_path(self): - # Empty object ID - with pytest.raises(McPodcastMisconfiguredGCSException): + with pytest.raises(McProgrammingError): GCSStore._remote_path(path_prefix='', object_id='') assert GCSStore._remote_path(path_prefix='', object_id='a') == 'a' @@ -35,52 +43,34 @@ def test_remote_path(self): assert GCSStore._remote_path(path_prefix='//', object_id='//a///b//../b/c') == 'a/b/c' - def test_object_uri(self): - gcs = GCSStore() - - # Empty object ID - with pytest.raises(McPodcastMisconfiguredGCSException): - gcs.object_uri(object_id='') - - class NoPathPrefixConfig(PodcastFetchEpisodeConfig): - - @staticmethod 
- def gc_storage_path_prefix() -> str: - return '' - - config = NoPathPrefixConfig() - gcs = GCSStore(config=config) - assert gcs.object_uri(object_id='a') == f'gs://{config.gc_storage_bucket_name()}/a' - - class MultiPathPrefixConfig(PodcastFetchEpisodeConfig): - - @staticmethod - def gc_storage_path_prefix() -> str: - return '//foo/bar//' - - config = MultiPathPrefixConfig() - gcs = GCSStore(config=config) - assert gcs.object_uri(object_id='a') == f'gs://{config.gc_storage_bucket_name()}/foo/bar/a' - def test_store_exists_delete(self): - config = RandomPathPrefixConfig() - gcs = GCSStore(config=config) + config = _RandomPrefixBucketConfig() + gcs = GCSStore(bucket_config=config) object_id = 'test' assert gcs.object_exists(object_id=object_id) is False mock_data = os.urandom(1024 * 10) - temp_file = os.path.join(tempfile.mkdtemp('test'), 'test') - with open(temp_file, mode='wb') as f: + src_file = os.path.join(tempfile.mkdtemp('test'), 'src') + with open(src_file, mode='wb') as f: f.write(mock_data) - gcs.store_object(local_file_path=temp_file, object_id=object_id) + gcs.upload_object(local_file_path=src_file, object_id=object_id) assert gcs.object_exists(object_id=object_id) is True # Try storing twice - gcs.store_object(local_file_path=temp_file, object_id=object_id) + gcs.upload_object(local_file_path=src_file, object_id=object_id) assert gcs.object_exists(object_id=object_id) is True + dst_file = os.path.join(tempfile.mkdtemp('test'), 'dst') + gcs.download_object(object_id=object_id, local_file_path=dst_file) + assert os.path.isfile(dst_file) + assert filecmp.cmp(src_file, dst_file, shallow=False) + + # Try downloading nonexistent file + with pytest.raises(McPermanentError): + gcs.download_object(object_id='999999', local_file_path=os.path.join(tempfile.mkdtemp('test'), 'foo')) + gcs.delete_object(object_id=object_id) assert gcs.object_exists(object_id=object_id) is False diff --git a/apps/podcast-fetch-episode/tests/python/test_media_file.py 
b/apps/podcast-transcribe-episode/tests/python/test_media_file.py similarity index 71% rename from apps/podcast-fetch-episode/tests/python/test_media_file.py rename to apps/podcast-transcribe-episode/tests/python/test_media_file.py index 8ca91429fe..17aca0d29a 100644 --- a/apps/podcast-fetch-episode/tests/python/test_media_file.py +++ b/apps/podcast-transcribe-episode/tests/python/test_media_file.py @@ -1,17 +1,16 @@ import hashlib import inspect import os +import tempfile +# noinspection PyPackageRequirements import pytest -from podcast_fetch_episode.audio_codecs import AbstractAudioCodec -from podcast_fetch_episode.exceptions import McPodcastFileIsInvalidException -from podcast_fetch_episode.media_file import ( - MediaFileInfo, - media_file_info, - TranscodeTempDirAndFile, - transcode_media_file_if_needed, -) +from mediawords.workflow.exceptions import McPermanentError + +from podcast_transcribe_episode.audio_codecs import AbstractAudioCodec +from podcast_transcribe_episode.media_info import media_file_info, MediaFileInfo +from podcast_transcribe_episode.transcode import maybe_transcode_file MEDIA_SAMPLES_PATH = '/opt/mediacloud/tests/data/media-samples/samples/' assert os.path.isdir(MEDIA_SAMPLES_PATH), f"Directory with media samples '{MEDIA_SAMPLES_PATH}' should exist." @@ -37,8 +36,7 @@ def test_media_file_info(): if '-invalid' in filename: - # - with pytest.raises(McPodcastFileIsInvalidException): + with pytest.raises(McPermanentError): media_file_info(media_file_path=input_file_path) else: @@ -79,35 +77,42 @@ def _file_sha1_hash(file_path: str) -> str: return sha1.hexdigest() -def test_transcode_media_file_if_needed(): - """Test transcode_media_if_needed().""" - +def test_maybe_transcode_file(): for filename in SAMPLE_FILENAMES: input_file_path = os.path.join(MEDIA_SAMPLES_PATH, filename) assert os.path.isfile(input_file_path), f"Input file '{filename}' exists." 
before_sha1_hash = _file_sha1_hash(input_file_path) - input_media_file = TranscodeTempDirAndFile(temp_dir=MEDIA_SAMPLES_PATH, filename=filename) - if '-noaudio' in filename: # Media file with no audio - with pytest.raises(McPodcastFileIsInvalidException): - transcode_media_file_if_needed(input_media_file=input_media_file) + with pytest.raises(McPermanentError): + maybe_transcode_file( + input_file=input_file_path, + maybe_output_file=os.path.join(tempfile.mkdtemp('test'), 'test'), + ) elif '-invalid' in filename: # Invalid media file - with pytest.raises(McPodcastFileIsInvalidException): - transcode_media_file_if_needed(input_media_file=input_media_file) + with pytest.raises(McPermanentError): + maybe_transcode_file( + input_file=input_file_path, + maybe_output_file=os.path.join(tempfile.mkdtemp('test'), 'test'), + ) else: - output_media_file = transcode_media_file_if_needed(input_media_file=input_media_file) + maybe_output_file = os.path.join(tempfile.mkdtemp('test'), 'test') - assert output_media_file, f"Output media file was set for filename '{filename}'." + media_file_transcoded = maybe_transcode_file( + input_file=input_file_path, + maybe_output_file=maybe_output_file, + ) - output_file_info = media_file_info(media_file_path=output_media_file.temp_full_path) + output_file_info = media_file_info( + media_file_path=maybe_output_file if media_file_transcoded else input_file_path, + ) assert not output_file_info.has_video_streams, f"There should be no video streams in '{filename}'." assert len(output_file_info.audio_streams) == 1, f"There should be only one audio stream in '{filename}'." @@ -122,13 +127,11 @@ def test_transcode_media_file_if_needed(): assert audio_stream.audio_channel_count == 1, f"Output file should be only mono for filename '{filename}'." if '-mp3-mono' in filename: - assert ( - output_media_file.temp_full_path == input_media_file.temp_full_path - ), "Mono MP3 file shouldn't have been transcoded." 
+ assert media_file_transcoded is False, "Mono MP3 file shouldn't have been transcoded." + assert not os.path.isfile(maybe_output_file), "Output file should not exist." else: - assert ( - output_media_file.temp_full_path != input_media_file.temp_full_path - ), f"File '{filename}' should have been transcoded." + assert media_file_transcoded is True, f"File '{filename}' should have been transcoded." + assert os.path.isfile(maybe_output_file), "Output file should exist." after_sha1_hash = _file_sha1_hash(input_file_path) diff --git a/apps/podcast-transcribe-episode/tests/python/test_workflow.py b/apps/podcast-transcribe-episode/tests/python/test_workflow.py new file mode 100644 index 0000000000..765375530a --- /dev/null +++ b/apps/podcast-transcribe-episode/tests/python/test_workflow.py @@ -0,0 +1,183 @@ +import os +from datetime import timedelta +from typing import Union + +# noinspection PyPackageRequirements +import pytest +# noinspection PyPackageRequirements +from temporal.workerfactory import WorkerFactory +# noinspection PyPackageRequirements +from temporal.workflow import WorkflowOptions + +from mediawords.db import connect_to_db +from mediawords.dbi.downloads.store import fetch_content +from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story +from mediawords.test.hash_server import HashServer +from mediawords.util.log import create_logger +from mediawords.util.network import random_unused_port +from mediawords.workflow.client import workflow_client +from mediawords.workflow.worker import stop_worker_faster + +from podcast_transcribe_episode.config import ( + PodcastTranscribeEpisodeConfig, + AbstractGCBucketConfig, + RawEnclosuresGCBucketConfig, + TranscodedEpisodesGCBucketConfig, + TranscriptsGCBucketConfig, +) +from podcast_transcribe_episode.gcs_store import GCSStore +from podcast_transcribe_episode.workflow import PodcastTranscribeActivitiesImpl, PodcastTranscribeWorkflowImpl +from 
podcast_transcribe_episode.workflow_interface import ( + TASK_QUEUE, + PodcastTranscribeActivities, + PodcastTranscribeWorkflow, +) + +from .random_gcs_prefix import random_gcs_path_prefix + +log = create_logger(__name__) + +TEST_MP3_PATH = '/opt/mediacloud/tests/data/media-samples/samples/kim_kardashian-mp3-mono.mp3' +assert os.path.isfile(TEST_MP3_PATH), f"Test MP3 file '{TEST_MP3_PATH}' should exist." + + +class _RandomPrefixesPodcastTranscribeEpisodeConfig(PodcastTranscribeEpisodeConfig): + """Custom configuration which uses random GCS prefixes.""" + + __slots__ = [ + '__raw_enclosures_config', + '__transcoded_episodes_config', + '__transcripts_config', + ] + + def __init__(self): + super().__init__() + + # Create bucket config classes once so that if we call the getters again, the random prefixes don't get + # regenerated + self.__raw_enclosures_config = RawEnclosuresGCBucketConfig(path_prefix=random_gcs_path_prefix()) + self.__transcoded_episodes_config = TranscodedEpisodesGCBucketConfig(path_prefix=random_gcs_path_prefix()) + self.__transcripts_config = TranscriptsGCBucketConfig(path_prefix=random_gcs_path_prefix()) + + def raw_enclosures(self) -> AbstractGCBucketConfig: + return self.__raw_enclosures_config + + def transcoded_episodes(self) -> AbstractGCBucketConfig: + return self.__transcoded_episodes_config + + def transcripts(self) -> AbstractGCBucketConfig: + return self.__transcripts_config + + +# Custom activities subclass with random bucket prefixes +class _RandomPrefixesPodcastTranscribeActivities(PodcastTranscribeActivitiesImpl): + + @classmethod + def _create_config(cls) -> PodcastTranscribeEpisodeConfig: + return _RandomPrefixesPodcastTranscribeEpisodeConfig() + + +@pytest.mark.asyncio +async def test_workflow(): + db = connect_to_db() + + test_medium = create_test_medium(db=db, label='test') + test_feed = create_test_feed(db=db, label='test', medium=test_medium) + + # 'label' is important as it will be stored in both stories.title and 
stories.description, which in turn will be + # used to guess the probable language of the podcast episode + test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed) + + stories_id = test_story['stories_id'] + + with open(TEST_MP3_PATH, mode='rb') as f: + test_mp3_data = f.read() + + # noinspection PyUnusedLocal + def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]: + response = "".encode('utf-8') + response += "HTTP/1.0 200 OK\r\n".encode('utf-8') + response += "Content-Type: audio/mpeg\r\n".encode('utf-8') + response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8') + response += "\r\n".encode('utf-8') + response += test_mp3_data + return response + + port = random_unused_port() + pages = { + '/test.mp3': { + 'callback': __mp3_callback, + } + } + + hs = HashServer(port=port, pages=pages) + hs.start() + + # Not localhost as this might get fetched from a remote worker + mp3_url = hs.page_url('/test.mp3') + + db.insert(table='story_enclosures', insert_hash={ + 'stories_id': stories_id, + 'url': mp3_url, + 'mime_type': 'audio/mpeg', + 'length': len(test_mp3_data), + }) + + client = workflow_client() + + # Start worker + factory = WorkerFactory(client=client, namespace=client.namespace) + worker = factory.new_worker(task_queue=TASK_QUEUE) + + # Use an activities implementation with random GCS prefixes set + activities = _RandomPrefixesPodcastTranscribeActivities() + + worker.register_activities_implementation( + activities_instance=activities, + activities_cls_name=PodcastTranscribeActivities.__name__, + ) + worker.register_workflow_implementation_type(impl_cls=PodcastTranscribeWorkflowImpl) + factory.start() + + # Initialize workflow instance + workflow: PodcastTranscribeWorkflow = client.new_workflow_stub( + cls=PodcastTranscribeWorkflow, + workflow_options=WorkflowOptions( + workflow_id=str(stories_id), + + # By default, if individual activities of the workflow fail, they will get restarted pretty much 
+ # indefinitely, and so this test might run for days (or rather just timeout on the CI). So we cap the + # workflow so that if it doesn't manage to complete in X minutes, we consider it as failed. + workflow_run_timeout=timedelta(minutes=5), + + ), + ) + + # Wait for the workflow to complete + await workflow.transcribe_episode(stories_id) + + downloads = db.select(table='downloads', what_to_select='*').hashes() + assert len(downloads) == 1 + first_download = downloads[0] + assert first_download['stories_id'] == stories_id + assert first_download['type'] == 'content' + assert first_download['state'] == 'success' + + download_content = fetch_content(db=db, download=first_download) + + # It's what gets said in the sample MP3 file + assert 'Kim Kardashian' in download_content + + # Initiate the worker shutdown in the background while we do the GCS cleanup so that the stop_workers_faster() + # doesn't have to wait that long + await worker.stop(background=True) + + log.info("Cleaning up GCS...") + GCSStore(bucket_config=activities.config.raw_enclosures()).delete_object(object_id=str(stories_id)) + GCSStore(bucket_config=activities.config.transcoded_episodes()).delete_object(object_id=str(stories_id)) + GCSStore(bucket_config=activities.config.transcripts()).delete_object(object_id=str(stories_id)) + log.info("Cleaned up GCS") + + log.info("Stopping workers...") + await stop_worker_faster(worker) + log.info("Stopped workers") diff --git a/apps/postgresql-base/Dockerfile b/apps/postgresql-base/Dockerfile index 327d9ecede..272124e3d6 100644 --- a/apps/postgresql-base/Dockerfile +++ b/apps/postgresql-base/Dockerfile @@ -1,14 +1,84 @@ # -# PostgreSQL base +# PostgreSQL base server # -FROM gcr.io/mcback/base:latest +FROM gcr.io/mcback/postgresql-repo-base:latest -# Add Add PostgreSQL GPG key -RUN curl -L https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - +# Install packages +RUN \ + # + # Install PostgreSQL + apt-get -y --no-install-recommends install \ + 
postgresql-13 \ + postgresql-client-13 \ + postgresql-contrib-13 \ + postgresql-plperl-13 \ + && \ + true -# Add PostgreSQL APT repository -RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ focal-pgdg main" > /etc/apt/sources.list.d/pgdg.list +# Make some run directories +RUN \ + mkdir -p /var/run/postgresql/13-main.pg_stat_tmp && \ + chown -R postgres:postgres /var/run/postgresql/13-main.pg_stat_tmp && \ + true -# Fetch new repositories -RUN apt-get -y update +# Write our own configuration +RUN rm -rf /etc/postgresql/13/main/ +COPY conf/ /etc/postgresql/13/main/ + +# This is where "update_memory_config.sh" script will write its memory settings +# which it will auto-determine from available RAM on every run. +RUN \ + touch /var/run/postgresql/postgresql-memory.conf && \ + chown postgres:postgres /var/run/postgresql/postgresql-memory.conf && \ + true + +# Copy helper scripts +RUN mkdir -p /opt/postgresql-base/ +COPY bin/* /opt/postgresql-base/bin/ + +USER postgres + +RUN \ + # + # Remove APT-initialized data directory because it doesn't have the right + # locale, doesn't use checksums etc. + rm -rf /var/lib/postgresql/13/main/ && \ + # + # Update memory configuration in case we decide to start PostgreSQL at + # build time + # Update memory configuration + /opt/postgresql-base/bin/update_memory_config.sh && \ + # + # Run initdb + mkdir -p /var/lib/postgresql/13/main/ && \ + /usr/lib/postgresql/13/bin/initdb \ + --pgdata=/var/lib/postgresql/13/main/ \ + --data-checksums \ + --encoding=UTF-8 \ + --lc-collate='en_US.UTF-8' \ + --lc-ctype='en_US.UTF-8' \ + && \ + true + +# VOLUME doesn't get set here as children of this image might amend the initial +# data directory somehow (e.g. pre-initialize it with some schema). Once you do +# that in the sub-image, don't forget to define VOLUME afterwards! + +# SIGTERM (Docker's default) will initiate PostgreSQL's "Smart Shutdown" mode +# which will then wait for the current transactions to finish. 
If there are +# active long-running queries, Docker will wait for "stop_grace_period", run +# out of patience and SIGKILL the process, forcing PostgreSQL to recover the +# database on restart. +# So, instead we stop the database with SIGINT which triggers "Fast Shutdown": +# active connections get terminated, and PostgreSQL shuts down considerably +# faster and safer. +STOPSIGNAL SIGINT + +# Server +EXPOSE 5432 + +# *Not* adding /opt/postgresql-base/ to $PATH so that users get to pick which +# specific version of "postgresql.sh" to run + +CMD ["/opt/postgresql-base/bin/postgresql.sh"] diff --git a/apps/postgresql-base/bin/postgresql.sh b/apps/postgresql-base/bin/postgresql.sh new file mode 100755 index 0000000000..032c9e0a38 --- /dev/null +++ b/apps/postgresql-base/bin/postgresql.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -u +set -e + +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/13/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/13/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/13/main/postgresql.conf" + +# Update memory configuration +/opt/postgresql-base/bin/update_memory_config.sh + +# Start PostgreSQL +exec "${MC_POSTGRESQL_BIN_DIR}/postgres" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -c "config_file=${MC_POSTGRESQL_CONF_PATH}" diff --git a/apps/postgresql-server/bin/update_memory_config.sh b/apps/postgresql-base/bin/update_memory_config.sh similarity index 83% rename from apps/postgresql-server/bin/update_memory_config.sh rename to apps/postgresql-base/bin/update_memory_config.sh index e60bbbc0dc..a0456d51c0 100755 --- a/apps/postgresql-server/bin/update_memory_config.sh +++ b/apps/postgresql-base/bin/update_memory_config.sh @@ -3,8 +3,6 @@ set -u set -e -MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" -MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" MC_POSTGRESQL_MEMORY_CONF_PATH="/var/run/postgresql/postgresql-memory.conf" # Adjust configuration based on amount of RAM diff --git a/apps/postgresql-server/conf/environment 
b/apps/postgresql-base/conf/environment similarity index 100% rename from apps/postgresql-server/conf/environment rename to apps/postgresql-base/conf/environment diff --git a/apps/postgresql-server/conf/pg_ctl.conf b/apps/postgresql-base/conf/pg_ctl.conf similarity index 100% rename from apps/postgresql-server/conf/pg_ctl.conf rename to apps/postgresql-base/conf/pg_ctl.conf diff --git a/apps/postgresql-server/conf/pg_hba.conf b/apps/postgresql-base/conf/pg_hba.conf similarity index 89% rename from apps/postgresql-server/conf/pg_hba.conf rename to apps/postgresql-base/conf/pg_hba.conf index d734d1ffe9..1338cb83eb 100644 --- a/apps/postgresql-server/conf/pg_hba.conf +++ b/apps/postgresql-base/conf/pg_hba.conf @@ -6,4 +6,4 @@ host all all ::1/128 md5 local replication all peer host replication all 127.0.0.1/32 md5 host replication all ::1/128 md5 -host all mediacloud samenet md5 +host all all samenet md5 diff --git a/apps/postgresql-server/conf/pg_ident.conf b/apps/postgresql-base/conf/pg_ident.conf similarity index 100% rename from apps/postgresql-server/conf/pg_ident.conf rename to apps/postgresql-base/conf/pg_ident.conf diff --git a/apps/postgresql-server/conf/postgresql.conf b/apps/postgresql-base/conf/postgresql.conf similarity index 85% rename from apps/postgresql-server/conf/postgresql.conf rename to apps/postgresql-base/conf/postgresql.conf index 8170f7bc85..cbd4c22669 100644 --- a/apps/postgresql-server/conf/postgresql.conf +++ b/apps/postgresql-base/conf/postgresql.conf @@ -2,10 +2,10 @@ # Media Cloud PostgreSQL static configuration # -data_directory = '/var/lib/postgresql/11/main' -hba_file = '/etc/postgresql/11/main/pg_hba.conf' -ident_file = '/etc/postgresql/11/main/pg_ident.conf' -external_pid_file = '/var/run/postgresql/11-main.pid' +data_directory = '/var/lib/postgresql/13/main' +hba_file = '/etc/postgresql/13/main/pg_hba.conf' +ident_file = '/etc/postgresql/13/main/pg_ident.conf' +external_pid_file = '/var/run/postgresql/13-main.pid' port = 5432 
max_connections = 610 @@ -38,13 +38,13 @@ hot_standby_feedback = on random_page_cost = 1.0 -cluster_name = '11/main' +cluster_name = '13/main' log_line_prefix = '%t [%p-%l] %q%u@%d ' log_timezone = 'localtime' log_lock_waits = on -stats_temp_directory = '/var/run/postgresql/11-main.pg_stat_tmp' +stats_temp_directory = '/var/run/postgresql/13-main.pg_stat_tmp' datestyle = 'iso, mdy' timezone = 'localtime' diff --git a/apps/postgresql-server/conf/start.conf b/apps/postgresql-base/conf/start.conf similarity index 100% rename from apps/postgresql-server/conf/start.conf rename to apps/postgresql-base/conf/start.conf diff --git a/apps/postgresql-pgbouncer/Dockerfile b/apps/postgresql-pgbouncer/Dockerfile index 37d2dbc4c0..a2496d679f 100644 --- a/apps/postgresql-pgbouncer/Dockerfile +++ b/apps/postgresql-pgbouncer/Dockerfile @@ -2,7 +2,7 @@ # PgBouncer # -FROM gcr.io/mcback/postgresql-base:latest +FROM gcr.io/mcback/postgresql-repo-base:latest # Install PgBouncer RUN \ diff --git a/apps/postgresql-pgbouncer/conf/pgbouncer.ini b/apps/postgresql-pgbouncer/conf/pgbouncer.ini index cd6760882b..eb3f28662c 100644 --- a/apps/postgresql-pgbouncer/conf/pgbouncer.ini +++ b/apps/postgresql-pgbouncer/conf/pgbouncer.ini @@ -16,13 +16,26 @@ auth_file = /etc/pgbouncer/userlist.txt pool_mode = session server_reset_query = DISCARD ALL -max_client_conn = 600 -default_pool_size = 600 + +# Maximum number of client connections allowed +max_client_conn = 5000 + +# How many server connections to allow per user/database pair +default_pool_size = 450 + +# Do not allow more than this many server connections per database (regardless +# of user) +max_db_connections = 500 + log_connections = 0 log_disconnections = 0 stats_period = 600 server_login_retry = 1 +# Don't let transactions idle around for more than 10 minutes to prevent buggy +# code from leading to transaction wraparound issues +idle_transaction_timeout = 600 + # PyCharm doesn't work without this one: # 
https://github.com/Athou/commafeed/issues/559 ignore_startup_parameters = extra_float_digits diff --git a/apps/podcast-poll-due-operations/.dockerignore b/apps/postgresql-repo-base/.dockerignore similarity index 100% rename from apps/podcast-poll-due-operations/.dockerignore rename to apps/postgresql-repo-base/.dockerignore diff --git a/apps/postgresql-repo-base/Dockerfile b/apps/postgresql-repo-base/Dockerfile new file mode 100644 index 0000000000..43c9660011 --- /dev/null +++ b/apps/postgresql-repo-base/Dockerfile @@ -0,0 +1,19 @@ +# +# PostgreSQL repository base +# + +FROM gcr.io/mcback/base:latest + +RUN \ + # + # Add Add PostgreSQL GPG key + curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ + # + # Add PostgreSQL APT repository + echo "deb http://apt.postgresql.org/pub/repos/apt/ focal-pgdg main" \ + > /etc/apt/sources.list.d/pgdg.list && \ + # + # Fetch new repositories + apt-get -y update && \ + # + true diff --git a/apps/postgresql-server/Dockerfile b/apps/postgresql-server/Dockerfile index 25a8308803..1d8dc5ab18 100644 --- a/apps/postgresql-server/Dockerfile +++ b/apps/postgresql-server/Dockerfile @@ -1,19 +1,14 @@ # -# PostgreSQL server +# Main backend PostgreSQL server # FROM gcr.io/mcback/postgresql-base:latest -# Install packages +USER root RUN \ - apt-get -y update && \ - # - # Install PostgreSQL - apt-get -y --no-install-recommends install \ - postgresql-11 \ - postgresql-client-11 \ - postgresql-contrib-11 \ - postgresql-plperl-11 \ + mkdir -p \ + /opt/postgresql-server/bin/ \ + /opt/postgresql-server/schema/ \ && \ apt-get -y --no-install-recommends install python3 python3-pip python3-setuptools && \ # @@ -36,22 +31,9 @@ RUN \ # true -# Make some run directories -RUN \ - mkdir -p /var/run/postgresql/11-main.pg_stat_tmp && \ - chown -R postgres:postgres /var/run/postgresql/11-main.pg_stat_tmp && \ - true - -# Write our own configuration -RUN rm -rf /etc/postgresql/11/main/ -COPY conf/ /etc/postgresql/11/main/ - -# 
This is where "update_memory_config.sh" script will write its memory settings -# which it will auto-determine from available RAM on every run. -RUN \ - touch /var/run/postgresql/postgresql-memory.conf && \ - chown postgres:postgres /var/run/postgresql/postgresql-memory.conf && \ - true +# Copy helper scripts, schema, migrations +COPY bin/* /opt/postgresql-server/bin/ +COPY schema/ /opt/postgresql-server/schema/ # Copy helper scripts, schema, migrations, pgmigrate callbacks/config RUN mkdir -p /opt/mediacloud/ @@ -61,15 +43,14 @@ COPY migrations/ /opt/mediacloud/migrations/ COPY migrations.yml /opt/mediacloud/migrations.yml RUN cd /opt/mediacloud -USER postgres - # Initialize data volume, create users + database # If a new empty volume gets mounted to /var/lib/postgresql/ upon # container start, Docker will copy the files from the container to the volume +USER postgres RUN /opt/mediacloud/bin/initialize_db.sh ENV \ - PATH="/opt/mediacloud/bin:${PATH}" \ + PATH="/opt/postgresql-server/bin:${PATH}" \ # # Make sure that we can connect via "psql" without sudoing into "postgres" user PGHOST=localhost \ @@ -78,26 +59,8 @@ ENV \ PGPASSWORD=mediacloud \ PGDATABASE=mediacloud -# Remove the init script so that someone doesn't accidentally run it in production -USER root -RUN rm /opt/mediacloud/bin/initialize_db.sh - -USER postgres - # PostgreSQL data VOLUME /var/lib/postgresql/ -# SIGTERM (Docker's default) will initiate PostgreSQL's "Smart Shutdown" mode -# which will then wait for the current transactions to finish. If there are -# active long-running queries, Docker will wait for "stop_grace_period", run -# out of patience and SIGKILL the process, forcing PostgreSQL to recover the -# database on restart. -# So, instead we stop the database with SIGINT which triggers "Fast Shutdown": -# active connections get terminated, and PostgreSQL shuts down considerably -# faster and safer. 
-STOPSIGNAL SIGINT - -# Server -EXPOSE 5432 - -CMD ["/opt/mediacloud/bin/postgresql_server.sh"] +# Use our own wrapper script which runs schema upgrades first +CMD ["/opt/postgresql-server/bin/postgresql.sh"] diff --git a/apps/postgresql-server/bin/apply_migrations.sh b/apps/postgresql-server/bin/apply_migrations.sh index 25b5e15dca..371c80f101 100755 --- a/apps/postgresql-server/bin/apply_migrations.sh +++ b/apps/postgresql-server/bin/apply_migrations.sh @@ -3,9 +3,9 @@ set -u set -e -MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" -MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" -MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/13/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/13/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/13/main/postgresql.conf" MIGRATIONS_DIR="/opt/mediacloud/migrations" diff --git a/apps/postgresql-server/bin/initialize_db.sh b/apps/postgresql-server/bin/initialize_db.sh index 12ce297523..08967dcd50 100755 --- a/apps/postgresql-server/bin/initialize_db.sh +++ b/apps/postgresql-server/bin/initialize_db.sh @@ -3,25 +3,12 @@ set -u set -e -MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" -MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" -MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/13/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/13/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/13/main/postgresql.conf" # Update memory configuration -/opt/mediacloud/bin/update_memory_config.sh - -# Remove APT-initialized data directory because it doesn't have the right -# locale, doesn't use checksums etc. 
-rm -rf /var/lib/postgresql/11/main/ - -# Run initdb -mkdir -p "${MC_POSTGRESQL_DATA_DIR}" -"${MC_POSTGRESQL_BIN_DIR}/initdb" \ - --pgdata="${MC_POSTGRESQL_DATA_DIR}" \ - --data-checksums \ - --encoding=UTF-8 \ - --lc-collate='en_US.UTF-8' \ - --lc-ctype='en_US.UTF-8' +/opt/postgresql-base/bin/update_memory_config.sh "${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ -o "-c config_file=${MC_POSTGRESQL_CONF_PATH}" \ @@ -48,11 +35,11 @@ CREATE DATABASE mediacloud WITH EOF psql -v ON_ERROR_STOP=1 -c "${CREATE_DB_SQL}" -# run migrations with pgmigrate package +# Run migrations with pgmigrate package cd /opt/mediacloud && pgmigrate -t latest migrate -# # dump schema file for reference in development -psql mediacloud -c '\! pg_dump mediacloud > /tmp/mediawords.sql' +# Dump schema file for reference in development +psql -v ON_ERROR_STOP=1 mediacloud -c '\! pg_dump mediacloud > /tmp/mediawords.sql' # Stop PostgreSQL "${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ diff --git a/apps/postgresql-server/bin/postgresql_server.sh b/apps/postgresql-server/bin/postgresql.sh similarity index 58% rename from apps/postgresql-server/bin/postgresql_server.sh rename to apps/postgresql-server/bin/postgresql.sh index 50661ff1dd..cf7e7c5c57 100755 --- a/apps/postgresql-server/bin/postgresql_server.sh +++ b/apps/postgresql-server/bin/postgresql.sh @@ -3,12 +3,8 @@ set -u set -e -MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" -MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" -MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" - # Update memory configuration -/opt/mediacloud/bin/update_memory_config.sh +/opt/postgresql-base/bin/update_memory_config.sh # Run schema migrations if needed if [ -e /var/lib/postgresql/first_run ]; then @@ -19,11 +15,9 @@ elif [ ! -z ${MC_POSTGRESQL_SKIP_MIGRATIONS+x} ]; then echo "Skipping schema migrations because 'MC_POSTGRESQL_SKIP_MIGRATIONS' is set." else echo "Applying schema migrations..." 
- /opt/mediacloud/bin/apply_migrations.sh + /opt/postgresql-server/bin/apply_migrations.sh echo "Done applying schema migrations." fi # Start PostgreSQL -exec "${MC_POSTGRESQL_BIN_DIR}/postgres" \ - -D "${MC_POSTGRESQL_DATA_DIR}" \ - -c "config_file=${MC_POSTGRESQL_CONF_PATH}" +exec /opt/postgresql-base/bin/postgresql.sh diff --git a/apps/postgresql-server/bin/pps b/apps/postgresql-server/bin/pps index f007f3dfb8..ff24e59a45 100755 --- a/apps/postgresql-server/bin/pps +++ b/apps/postgresql-server/bin/pps @@ -6,7 +6,26 @@ else COLS=`tput cols` fi -echo "select psa.pid, min(application_name) as client, substr(query_start::text, 0, 20) as date, granted as l, regexp_replace(query, E'[\\n\\r ]+', ' ', 'g' ) q from pg_stat_activity psa left join pg_locks pl on ( psa.pid = pl.pid and pl.granted = 'f' ) where state not like 'idle%' group by psa.pid, usename, state, query_start, granted, q order by query_start desc" | psql mediacloud | cut -c 1-$COLS - +cat < t.host ORDER BY host LIMIT 1) - FROM t - WHERE t.host IS NOT NULL - ) - SELECT host FROM t WHERE host IS NOT NULL - loop - insert into pending_downloads - select dp.downloads_id - from downloads_pending dp - left join qd on ( dp.downloads_id = qd.downloads_id ) - where - host = pending_host.host and - qd.downloads_id is null - order by priority, downloads_id desc nulls last - limit 1; - end loop; - - return query select pd.downloads_id from pending_downloads pd; - end; - -$$ language plpgsql; - -COMMENT ON FUNCTION get_downloads_for_queue () IS 'efficiently query downloads_pending -for the latest downloads_id per host. postgres is not able to do this through its -normal query planning (it just does an index scan of the whole index). 
this turns -a query that takes ~22 seconds for a 100 million row table into one that takes ~0.25 seconds'; - -- -- Extracted plain text from every download -- @@ -4157,155 +4115,6 @@ CREATE UNIQUE INDEX story_enclosures_stories_id_url ON story_enclosures (stories_id, url); --- --- Audio file codec; keep in sync with "_SUPPORTED_NATIVE_AUDIO_CODECS" constant --- (https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1) --- -CREATE TYPE podcast_episodes_audio_codec AS ENUM ( - 'LINEAR16', - 'FLAC', - 'MULAW', - 'OGG_OPUS', - 'MP3' -); - -COMMENT ON TYPE podcast_episodes_audio_codec IS 'Audio file codec; keep in sync with "_SUPPORTED_NATIVE_AUDIO_CODECS" -constant (https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1)'; - --- --- Podcast story episodes (derived from enclosures) --- -CREATE TABLE podcast_episodes ( - podcast_episodes_id BIGSERIAL PRIMARY KEY, - stories_id INT NOT NULL REFERENCES stories (stories_id) ON DELETE CASCADE, - - -- Enclosure that's considered to point to a podcast episode - story_enclosures_id BIGINT NOT NULL - REFERENCES story_enclosures (story_enclosures_id) - ON DELETE CASCADE, - - -- Google Cloud Storage URI where the audio file is located at - gcs_uri TEXT NOT NULL - CONSTRAINT gcs_uri_has_gs_prefix - CHECK(gcs_uri LIKE 'gs://%'), - - -- Duration (in seconds) - duration INT NOT NULL - CONSTRAINT duration_is_positive - CHECK(duration > 0), - - -- Audio codec as determined by transcoder - codec podcast_episodes_audio_codec NOT NULL, - - -- Audio sample rate (Hz) as determined by transcoder - sample_rate INT NOT NULL - CONSTRAINT sample_rate_looks_reasonable - CHECK(sample_rate > 1000), - - -- BCP 47 language identifier - -- (https://cloud.google.com/speech-to-text/docs/languages) - bcp47_language_code CITEXT NOT NULL - CONSTRAINT bcp47_language_code_looks_reasonable - CHECK( - bcp47_language_code LIKE '%-%' - OR bcp47_language_code = 'zh' - ), - - -- Speech API operation 
ID to be used for retrieving transcription; if NULL, - -- transcription job hasn't been submitted yet - speech_operation_id TEXT NULL - -); - -COMMENT ON TABLE podcast_episodes IS 'Podcast story episodes (derived from enclosures)'; -COMMENT ON COLUMN podcast_episodes.story_enclosures_id IS 'Enclosure that is considered -to point to a podcast episode'; -COMMENT ON COLUMN podcast_episodes.gcs_uri IS 'Google Cloud Storage URI where object is located'; -COMMENT ON COLUMN podcast_episodes.duration IS 'seconds'; -COMMENT ON COLUMN podcast_episodes.sample_rate IS 'Audio sample rate (Hz) as determined by transcoder'; -COMMENT ON COLUMN podcast_episodes.bcp47_language_code IS 'BCP 47 language identifier -(https://cloud.google.com/speech-to-text/docs/languages)'; -COMMENT ON COLUMN podcast_episodes.speech_operation_id IS 'Speech API operation ID to be used for -retrieving transcription; if NULL, transcription job has not been submitted yet'; - --- Only one episode per story -CREATE UNIQUE INDEX podcast_episodes_stories_id - ON podcast_episodes (stories_id); - -CREATE UNIQUE INDEX podcast_episodes_story_enclosures_id - ON podcast_episodes (story_enclosures_id); - -CREATE UNIQUE INDEX podcast_episodes_stories_id_story_enclosures_id - ON podcast_episodes (stories_id, story_enclosures_id); - - --- Result of an attempt to fetch the transcript -CREATE TYPE podcast_episode_transcript_fetch_result AS ENUM ( - - -- Operation was not yet finished yet at the time of fetching - 'in_progress', - - -- Operation was finished and transcription has succeeded - 'success', - - -- Operation was finished but the transcription has failed - 'error' - -); - - --- --- Attempts to fetch podcast episode transcript --- (we might need to try fetching the operation's results multiple times) --- -CREATE TABLE podcast_episode_transcript_fetches ( - podcast_episode_transcript_fetches_id BIGSERIAL PRIMARY KEY, - - -- Podcast that is being transcribed - podcast_episodes_id BIGINT NOT NULL - REFERENCES 
podcast_episodes (podcast_episodes_id) - ON DELETE CASCADE, - - -- Timestamp for when a fetch job should be added to the job broker's queue the soonest - add_to_queue_at TIMESTAMP WITH TIME ZONE NOT NULL, - - -- Timestamp for when a fetch job was added to the job broker's queue; - -- if NULL, a fetch job was never added to the queue - added_to_queue_at TIMESTAMP WITH TIME ZONE NULL, - - -- Timestamp when the operation's results were attempted to be fetched by the worker; - -- if NULL, the results weren't attempted to be fetched yet - fetched_at TIMESTAMP WITH TIME ZONE NULL, - - -- Result of the fetch attempt; - -- if NULL, the operation fetch didn't happen yet - result podcast_episode_transcript_fetch_result NULL, - - -- If result = 'error', error message that happened with the fetch attempt - error_message TEXT NULL - -); - - --- Function that returns true if results were attempted at being fetched -CREATE FUNCTION podcast_episode_transcript_was_added_to_queue(p_added_to_queue_at TIMESTAMP WITH TIME ZONE) -RETURNS BOOL AS $$ - - SELECT CASE WHEN p_added_to_queue_at::timestamp IS NULL THEN false ELSE true END; - -$$ LANGUAGE SQL IMMUTABLE; - - -CREATE INDEX podcast_episode_transcript_fetches_podcast_episodes_id - ON podcast_episode_transcript_fetches (podcast_episodes_id); - -CREATE UNIQUE INDEX podcast_episode_transcript_fetches_due - ON podcast_episode_transcript_fetches ( - add_to_queue_at, - podcast_episode_transcript_was_added_to_queue(added_to_queue_at) - ); - - -- -- Celery job results -- (configured as self.__app.conf.database_table_names; schema is dictated by Celery + SQLAlchemy) diff --git a/apps/postgresql-server/schema/migrations/mediawords-4759-4760.sql b/apps/postgresql-server/schema/migrations/mediawords-4759-4760.sql new file mode 100644 index 0000000000..9d8fe5e135 --- /dev/null +++ b/apps/postgresql-server/schema/migrations/mediawords-4759-4760.sql @@ -0,0 +1,44 @@ +-- +-- This is a Media Cloud PostgreSQL schema difference file (a "diff") 
between schema +-- versions 4759 and 4760. +-- +-- If you are running Media Cloud with a database that was set up with a schema version +-- 4759, and you would like to upgrade both the Media Cloud and the +-- database to be at version 4760, import this SQL file: +-- +-- psql mediacloud < mediawords-4759-4760.sql +-- +-- You might need to import some additional schema diff files to reach the desired version. +-- +-- +-- 1 of 2. Import the output of 'apgdiff': +-- + + +DROP FUNCTION IF EXISTS get_downloads_for_queue(); + + +-- +-- 2 of 2. Reset the database version. +-- + +CREATE OR REPLACE FUNCTION set_database_schema_version() RETURNS boolean AS $$ +DECLARE + + -- Database schema version number (same as a SVN revision number) + -- Increase it by 1 if you make major database schema changes. + MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := 4760; + +BEGIN + + -- Update / set database schema version + DELETE FROM database_variables WHERE name = 'database-schema-version'; + INSERT INTO database_variables (name, value) VALUES ('database-schema-version', MEDIACLOUD_DATABASE_SCHEMA_VERSION::int); + + return true; + +END; +$$ +LANGUAGE 'plpgsql'; + +SELECT set_database_schema_version(); diff --git a/apps/postgresql-server/schema/migrations/mediawords-4760-4761.sql b/apps/postgresql-server/schema/migrations/mediawords-4760-4761.sql new file mode 100644 index 0000000000..9ae44c62a6 --- /dev/null +++ b/apps/postgresql-server/schema/migrations/mediawords-4760-4761.sql @@ -0,0 +1,48 @@ +-- +-- This is a Media Cloud PostgreSQL schema difference file (a "diff") between schema +-- versions 4760 and 4761. +-- +-- If you are running Media Cloud with a database that was set up with a schema version +-- 4760, and you would like to upgrade both the Media Cloud and the +-- database to be at version 4761, import this SQL file: +-- +-- psql mediacloud < mediawords-4760-4761.sql +-- +-- You might need to import some additional schema diff files to reach the desired version. 
+-- +-- +-- 1 of 2. Import the output of 'apgdiff': +-- + + +DROP TABLE podcast_episode_transcript_fetches; +DROP TABLE podcast_episodes; +DROP TYPE podcast_episodes_audio_codec; +DROP TYPE podcast_episode_transcript_fetch_result; +DROP FUNCTION podcast_episode_transcript_was_added_to_queue(TIMESTAMP WITH TIME ZONE); + + +-- +-- 2 of 2. Reset the database version. +-- + +CREATE OR REPLACE FUNCTION set_database_schema_version() RETURNS boolean AS $$ +DECLARE + + -- Database schema version number (same as a SVN revision number) + -- Increase it by 1 if you make major database schema changes. + MEDIACLOUD_DATABASE_SCHEMA_VERSION CONSTANT INT := 4761; + +BEGIN + + -- Update / set database schema version + DELETE FROM database_variables WHERE name = 'database-schema-version'; + INSERT INTO database_variables (name, value) VALUES ('database-schema-version', MEDIACLOUD_DATABASE_SCHEMA_VERSION::int); + + return true; + +END; +$$ +LANGUAGE 'plpgsql'; + +SELECT set_database_schema_version(); diff --git a/apps/podcast-submit-operation/.dockerignore b/apps/postgresql-upgrade/.dockerignore similarity index 100% rename from apps/podcast-submit-operation/.dockerignore rename to apps/postgresql-upgrade/.dockerignore diff --git a/apps/postgresql-upgrade/.idea/.gitignore b/apps/postgresql-upgrade/.idea/.gitignore new file mode 100644 index 0000000000..73f69e0958 --- /dev/null +++ b/apps/postgresql-upgrade/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/apps/podcast-fetch-transcript/.idea/inspectionProfiles/profiles_settings.xml b/apps/postgresql-upgrade/.idea/inspectionProfiles/profiles_settings.xml similarity index 100% rename from apps/podcast-fetch-transcript/.idea/inspectionProfiles/profiles_settings.xml rename to apps/postgresql-upgrade/.idea/inspectionProfiles/profiles_settings.xml diff --git 
a/apps/postgresql-upgrade/.idea/misc.xml b/apps/postgresql-upgrade/.idea/misc.xml new file mode 100644 index 0000000000..96297493a0 --- /dev/null +++ b/apps/postgresql-upgrade/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/modules.xml b/apps/postgresql-upgrade/.idea/modules.xml similarity index 53% rename from apps/podcast-fetch-episode/.idea/modules.xml rename to apps/postgresql-upgrade/.idea/modules.xml index 1f8ef01409..36c43c68df 100644 --- a/apps/podcast-fetch-episode/.idea/modules.xml +++ b/apps/postgresql-upgrade/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/apps/postgresql-upgrade/.idea/postgresql-upgrade.iml b/apps/postgresql-upgrade/.idea/postgresql-upgrade.iml new file mode 100644 index 0000000000..f0558f493d --- /dev/null +++ b/apps/postgresql-upgrade/.idea/postgresql-upgrade.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/vcs.xml b/apps/postgresql-upgrade/.idea/vcs.xml similarity index 100% rename from apps/podcast-poll-due-operations/.idea/vcs.xml rename to apps/postgresql-upgrade/.idea/vcs.xml diff --git a/apps/postgresql-upgrade/Dockerfile b/apps/postgresql-upgrade/Dockerfile new file mode 100644 index 0000000000..35cc37361f --- /dev/null +++ b/apps/postgresql-upgrade/Dockerfile @@ -0,0 +1,57 @@ +# +# PostgreSQL upgrade scripts +# + +FROM gcr.io/mcback/postgresql-base:latest + +USER root + +# Install Python 3 for running the upgrade script +RUN apt-get -y --no-install-recommends install python3 + +# Install packages +RUN \ + # + # Install PostgreSQL 13 (oldest version) + apt-get -y --no-install-recommends install \ + postgresql-13 \ + postgresql-client-13 \ + postgresql-contrib-13 \ + postgresql-plperl-13 \ + && \ + # + # Install PostgreSQL 14 (newest version) + # apt-get -y --no-install-recommends install \ + # postgresql-12 \ + # postgresql-client-12 \ + # postgresql-contrib-12 \ + # 
postgresql-plperl-12 \ + # && \ + # + true + +RUN \ + # + # Make some run directories + mkdir -p /var/run/postgres/ && \ + chown -R postgres:postgres /var/run/postgres/ && \ + # + # Remove what might have gotten created in the parent image as we won't use it + mkdir -p /var/lib/postgresql/ && \ + chown -R postgres:postgres /var/lib/postgresql/ && \ + rm -rf /var/lib/postgresql/* && \ + # + # Remove extra configurations leaving only the one from parent "postgresql-base" + rm -rf /etc/postgresql/13/ && \ + # rm -rf /etc/postgresql/14/ && \ + # + true + +COPY bin/postgresql_upgrade.py /usr/bin/ + +# This is where the volume is supposed to be mounted +VOLUME /var/lib/postgresql/ + +USER postgres + +CMD ["postgresql_upgrade.py"] diff --git a/apps/postgresql-upgrade/bin/postgresql_upgrade.py b/apps/postgresql-upgrade/bin/postgresql_upgrade.py new file mode 100755 index 0000000000..4f6b0a2c47 --- /dev/null +++ b/apps/postgresql-upgrade/bin/postgresql_upgrade.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 + +""" +PostgreSQL upgrade script. 
+ +Usage: + +time docker run -it \ + --shm-size=64g \ + -v ~/Downloads/postgres_11_vol/:/var/lib/postgresql/ \ + gcr.io/mcback/postgresql-upgrade \ + postgresql_upgrade.py --source_version=11 --target_version=12 \ + > postgresql_upgrade.log +""" + +import argparse +import dataclasses +import getpass +import glob +import logging +import multiprocessing +import os +import pathlib +import shutil +import signal +import subprocess +import time + +logging.basicConfig(level=logging.DEBUG) + + +class PostgresUpgradeError(Exception): + pass + + +POSTGRES_DATA_DIR = "/var/lib/postgresql" +POSTGRES_USER = 'postgres' + + +def _dir_exists_and_accessible(directory: str) -> bool: + return os.path.isdir(directory) and os.access(directory, os.X_OK) + + +def _ram_size_mb() -> int: + """Return RAM size (in megabytes) that is allocated to the container.""" + ram_size = int(subprocess.check_output(['/container_memory_limit.sh']).decode('utf-8')) + assert ram_size, "RAM size can't be zero." + return ram_size + + +class _PostgresVersion(object): + """ + Data object of a single PostgreSQL version to upgrade from / to. + """ + __slots__ = [ + 'version', + 'data_dir', + 'main_dir', + 'bin_dir', + 'initdb', + 'pg_upgrade', + 'vacuumdb', + 'postgres', + 'tmp_conf_dir', + 'port', + ] + + @classmethod + def _current_postgresql_config_path(cls) -> str: + """ + Returns path to currently present PostgreSQL configuration directory. + + :return: Path to currently present PostgreSQL configuration directory, e.g. /etc/postgresql/11/main/. 
+ """ + conf_list = os.listdir('/etc/postgresql/') + if len(conf_list) != 1: + raise PostgresUpgradeError(f"More / less than one PostgreSQL configuration set has been found: {conf_list}") + current_version = conf_list[0] + if not current_version.isdecimal(): + raise PostgresUpgradeError(f"Invalid PostgreSQL version: {current_version}") + current_version = int(current_version) + + current_postgresql_config_path = os.path.join('/etc/postgresql/', str(current_version), 'main') + if not os.path.isfile(os.path.join(current_postgresql_config_path, 'postgresql.conf')): + raise PostgresUpgradeError(f"postgresql.conf does not exist in {current_postgresql_config_path}.") + + return current_postgresql_config_path + + def __init__(self, + version: int, + target_version: bool, + starting_version: bool, + port: int, + extra_postgres_config: str): + """ + Constructor. + + Checks whether various binaries / paths / directories are available. + + :param version: PostgreSQL version number, e.g. 11. + :param target_version: If True, this data object represents a version that is being upgraded *to*. + :param starting_version: If True, this data object represents a source version, i.e. the initial version that is + being upgraded from. + :param port: PostgreSQL temporary port number, e.g. 50432. + :param extra_postgres_config: Extra lines to add to temporary postgresql.conf. + """ + assert isinstance(version, int), "Version number must be integer." + self.version = version + assert isinstance(port, int), "Port must be an integer." + self.port = port + + self.data_dir = os.path.join(POSTGRES_DATA_DIR, str(version)) + if target_version: + if os.path.exists(self.data_dir): + raise PostgresUpgradeError(( + f"New data directory {self.data_dir} already exists; if the previous attempt to upgrade failed, " + "run something like this:\n\n" + f" rm -rf {self.data_dir}\n" + "\n\n" + "on a container, or adjust the path on the host, or revert to old ZFS snapshot." 
+ )) + else: + if starting_version: + if not _dir_exists_and_accessible(self.data_dir): + raise PostgresUpgradeError(( + f"Old data directory {self.data_dir} does not exist or is inaccessible; forgot to mount it?" + )) + + self.main_dir = os.path.join(self.data_dir, "main") + if not target_version: + if starting_version: + if not _dir_exists_and_accessible(self.main_dir): + raise PostgresUpgradeError(f"Old main directory {self.main_dir} does not exist or is inaccessible.") + + pg_version_path = os.path.join(self.main_dir, 'PG_VERSION') + if not os.path.isfile(pg_version_path): + raise PostgresUpgradeError(f"{pg_version_path} does not exist or is inaccessible.") + + postmaster_pid_path = os.path.join(self.main_dir, 'postmaster.pid') + if os.path.exists(postmaster_pid_path): + raise PostgresUpgradeError(f"{postmaster_pid_path} exists; is the database running?") + + # Create run directory + pathlib.Path(f"/var/run/postgresql/{version}-main.pg_stat_tmp/").mkdir(parents=True, exist_ok=True) + + self.bin_dir = f"/usr/lib/postgresql/{version}/bin/" + + if not _dir_exists_and_accessible(self.bin_dir): + raise PostgresUpgradeError(f"Binaries directory {self.bin_dir} does not exist or is inaccessible.") + if not _dir_exists_and_accessible(self.bin_dir): + raise PostgresUpgradeError(f"Binaries directory {self.bin_dir} does not exist or is inaccessible.") + + self.postgres = os.path.join(self.bin_dir, 'postgres') + if not os.access(self.postgres, os.X_OK): + raise PostgresUpgradeError(f"'postgres' at {self.postgres} does not exist.") + + if target_version: + + self.initdb = os.path.join(self.bin_dir, 'initdb') + if not os.access(self.initdb, os.X_OK): + raise PostgresUpgradeError(f"'initdb' at {self.initdb} does not exist.") + + self.pg_upgrade = os.path.join(self.bin_dir, 'pg_upgrade') + if not os.access(self.pg_upgrade, os.X_OK): + raise PostgresUpgradeError(f"'pg_upgrade' at {self.pg_upgrade} does not exist.") + + self.vacuumdb = os.path.join(self.bin_dir, 'vacuumdb') + if 
not os.access(self.vacuumdb, os.X_OK): + raise PostgresUpgradeError(f"'vacuumdb' at {self.vacuumdb} does not exist.") + + logging.info(f"Creating temporary configuration for version {version}...") + self.tmp_conf_dir = f"/var/tmp/postgresql/conf/{version}" + if os.path.exists(self.tmp_conf_dir): + shutil.rmtree(self.tmp_conf_dir) + current_postgresql_config_path = self._current_postgresql_config_path() + shutil.copytree(current_postgresql_config_path, self.tmp_conf_dir) + + with open(os.path.join(self.tmp_conf_dir, 'postgresql.conf'), 'a') as postgresql_conf: + postgresql_conf.write(f""" + + port = {port} + data_directory = '/var/lib/postgresql/{version}/main' + hba_file = '{self.tmp_conf_dir}/pg_hba.conf' + ident_file = '{self.tmp_conf_dir}/pg_ident.conf' + external_pid_file = '/var/run/postgresql/{version}-main.pid' + cluster_name = '{version}/main' + stats_temp_directory = '/var/run/postgresql/{version}-main.pg_stat_tmp' + + {extra_postgres_config} + + """) + + +@dataclasses.dataclass +class _PostgresVersionPair(object): + """ + Version pair to upgrade between. + + Must be different by exactly one version number, e.g. 11 and 12. + """ + old_version: _PostgresVersion + new_version: _PostgresVersion + + +class _PostgreSQLServer(object): + """PostgreSQL server helper.""" + + __slots__ = [ + '__port', + '__bin_dir', + '__data_dir', + '__conf_dir', + + '__proc', + ] + + def __init__(self, port: int, bin_dir: str, data_dir: str, conf_dir: str): + assert isinstance(port, int), "Port must be an integer." + assert os.path.isdir(bin_dir), f"{bin_dir} does not exist." + assert os.access(os.path.join(bin_dir, 'postgres'), os.X_OK), f"'postgres' does not exist in {bin_dir}." + assert os.access(os.path.join(bin_dir, 'pg_isready'), os.X_OK), f"'pg_isready' does not exist in {bin_dir}." + assert os.path.isdir(data_dir), f"{data_dir} does not exist." + assert os.path.isdir(conf_dir), f"{conf_dir} does not exist." 
+ assert os.path.isfile( + os.path.join(conf_dir, 'postgresql.conf') + ), f"postgresql.conf in {conf_dir} does not exist." + + self.__bin_dir = bin_dir + self.__port = port + self.__data_dir = data_dir + self.__conf_dir = conf_dir + + self.__proc = None + + def start(self) -> None: + assert not self.__proc, "PostgreSQL is already started." + + logging.info("Starting PostgreSQL...") + self.__proc = subprocess.Popen([ + os.path.join(self.__bin_dir, 'postgres'), + '-D', self.__data_dir, + '-c', f'config_file={self.__conf_dir}/postgresql.conf', + ]) + + # Waiting for port is not enough as PostgreSQL might be recovering + while True: + try: + subprocess.check_call([os.path.join(self.__bin_dir, 'pg_isready'), '--port', str(self.__port)]) + except subprocess.CalledProcessError as ex: + logging.debug(f"pg_isready failed: {ex}") + logging.info("Waiting for PostgreSQL to come up...") + time.sleep(1) + else: + break + + logging.info("PostgreSQL is up!") + + def stop(self) -> None: + assert self.__proc, "PostgreSQL has not been started." + + logging.info("Waiting for PostgreSQL to shut down...") + self.__proc.send_signal(signal.SIGTERM) + self.__proc.wait() + + logging.info("PostgreSQL has been shut down") + + self.__proc = None + + +def postgres_upgrade(source_version: int, target_version: int) -> None: + """ + Upgrade PostgreSQL from source version up to target version. + + :param source_version: Source dataset version, e.g. 11. + :param target_version: Target dataset version, e.g. 13. 
+ """ + logging.debug(f"Source version: {source_version}; target version: {target_version}") + + # Unset environment variables from parent image so that pg_upgrade can make its + # own decisions about which credentials to use + del os.environ['PGHOST'] + del os.environ['PGPORT'] + del os.environ['PGUSER'] + del os.environ['PGPASSWORD'] + del os.environ['PGDATABASE'] + + if not _dir_exists_and_accessible(POSTGRES_DATA_DIR): + raise PostgresUpgradeError(f"{POSTGRES_DATA_DIR} does not exist or is inaccessible.") + + if getpass.getuser() != POSTGRES_USER: + raise PostgresUpgradeError(f"This script is to be run as '{POSTGRES_USER}' user.") + + if target_version <= source_version: + raise PostgresUpgradeError( + f"Target version {target_version} is not newer than source version {source_version}." + ) + + shm_size = int(shutil.disk_usage("/dev/shm")[0] / 1024 / 1024) + min_shm_size = int(_ram_size_mb() / 3) - 1024 + if shm_size < min_shm_size: + raise PostgresUpgradeError( + f"Container's /dev/shm should be at least {min_shm_size} MB; try passing --shm-size property." 
+ ) + + logging.info("Updating memory configuration...") + subprocess.check_call(['/opt/mediacloud/bin/update_memory_config.sh']) + + # Remove cruft that might have been left over from last attempt to do the upgrade + patterns = [ + 'pg_*.log', + 'pg_*.custom', + 'pg_upgrade_dump_globals.sql', + ] + for pattern in patterns: + for file in glob.glob(os.path.join(POSTGRES_DATA_DIR, pattern)): + logging.debug(f"Deleting {file}...") + os.unlink(pattern) + + new_maintenance_work_mem = int(_ram_size_mb() / 10) + logging.info(f"New maintenance work memory limit: {new_maintenance_work_mem} MB") + maintenance_work_mem_statement = f'maintenance_work_mem = {new_maintenance_work_mem}MB' + + # Work out upgrade pairs + # (initialize the pairs first so that _PostgresVersion() gets a chance to test environment first) + upgrade_pairs = [] + current_port = 50432 + for version in range(source_version, target_version): + upgrade_pairs.append( + _PostgresVersionPair( + old_version=_PostgresVersion( + version=version, + target_version=False, + starting_version=(version == source_version), + port=current_port, + extra_postgres_config='', + ), + new_version=_PostgresVersion( + version=version + 1, + target_version=True, + starting_version=False, + port=current_port + 1, + extra_postgres_config=maintenance_work_mem_statement, + ) + )) + current_port = current_port + 2 + + initial_version = upgrade_pairs[0].old_version + logging.info("Starting PostgreSQL before upgrade in case the last shutdown was unclean...") + proc = _PostgreSQLServer( + port=initial_version.port, + bin_dir=initial_version.bin_dir, + data_dir=initial_version.main_dir, + conf_dir=initial_version.tmp_conf_dir, + ) + proc.start() + proc.stop() + + for pair in upgrade_pairs: + + logging.info(f"Upgrading from {pair.old_version.version} to {pair.new_version.version}...") + + logging.info("Running initdb...") + pathlib.Path(pair.new_version.main_dir).mkdir(parents=True, exist_ok=True) + subprocess.check_call([ + 
pair.new_version.initdb, + '--pgdata', pair.new_version.main_dir, + + # At the time of writing we don't use checksums so we can't enable them here; once (if) they get enabled, + # this needs to be uncommented + # '--data-checksums', + + '--encoding', 'UTF-8', + '--lc-collate', 'en_US.UTF-8', + '--lc-ctype', 'en_US.UTF-8', + ]) + + upgrade_command = [ + pair.new_version.pg_upgrade, + '--jobs', str(multiprocessing.cpu_count()), + '--old-bindir', pair.old_version.bin_dir, + '--new-bindir', pair.new_version.bin_dir, + '--old-datadir', pair.old_version.main_dir, + '--new-datadir', pair.new_version.main_dir, + '--old-port', str(pair.old_version.port), + '--new-port', str(pair.new_version.port), + '--old-options', f" -c config_file={pair.old_version.tmp_conf_dir}/postgresql.conf", + '--new-options', f" -c config_file={pair.new_version.tmp_conf_dir}/postgresql.conf", + '--link', + '--verbose', + ] + + logging.info("Testing if clusters are compatible...") + subprocess.check_call(upgrade_command + ['--check'], cwd=POSTGRES_DATA_DIR) + + logging.info("Upgrading...") + subprocess.check_call(upgrade_command, cwd=POSTGRES_DATA_DIR) + + logging.info("Cleaning up old data directory...") + shutil.rmtree(pair.old_version.data_dir) + + logging.info("Cleaning up scripts...") + for script in [ + 'analyze_new_cluster.sh', + 'delete_old_cluster.sh', + 'pg_upgrade_internal.log', + 'pg_upgrade_server.log', + 'pg_upgrade_utility.log', + ]: + script_path = os.path.join(POSTGRES_DATA_DIR, script) + if os.path.isfile(script_path): + os.unlink(script_path) + + logging.info(f"Done upgrading from {pair.old_version.version} to {pair.new_version.version}") + + current_version = upgrade_pairs[-1].new_version + + proc = _PostgreSQLServer( + port=current_version.port, + bin_dir=current_version.bin_dir, + data_dir=current_version.main_dir, + conf_dir=current_version.tmp_conf_dir, + ) + proc.start() + + logging.info("Running VACUUM ANALYZE...") + logging.info("(monitor locks while running that because 
PostgreSQL might decide to do autovacuum!)") + + # FIXME temporarily disable autovacuum in the temp. config + + subprocess.check_call([ + current_version.vacuumdb, + '--port', str(current_version.port), + '--all', + '--verbose', + # Do --analyze-only instead of --analyze-in-stages because we're ready to wait for the full statistics + '--analyze-only', + '--jobs', str(multiprocessing.cpu_count()), + ]) + + proc.stop() + + logging.info("Done!") + + +def main(): + parser = argparse.ArgumentParser(description="Upgrade PostgreSQL dataset.") + parser.add_argument("-s", "--source_version", type=int, required=True, + help="Version to upgrade from") + parser.add_argument("-t", "--target_version", type=int, required=True, + help="Version to upgrade to") + args = parser.parse_args() + + postgres_upgrade(source_version=args.source_version, target_version=args.target_version) + + +if __name__ == '__main__': + main() diff --git a/apps/purge-object-caches/.idea/mediawords.sql b/apps/purge-object-caches/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/purge-object-caches/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/purge-object-caches/.idea/misc.xml b/apps/purge-object-caches/.idea/misc.xml index 4c12eeeb9d..0240bc7d67 100644 --- a/apps/purge-object-caches/.idea/misc.xml +++ b/apps/purge-object-caches/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/purge-object-caches/.idea/purge-object-caches.iml b/apps/purge-object-caches/.idea/purge-object-caches.iml index ec5b1a0497..54087d86da 100644 --- a/apps/purge-object-caches/.idea/purge-object-caches.iml +++ b/apps/purge-object-caches/.idea/purge-object-caches.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/purge-object-caches/.idea/sqlDataSources.xml b/apps/purge-object-caches/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..d9d9e21161 --- /dev/null +++ 
b/apps/purge-object-caches/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/purge-object-caches/docker-compose.tests.yml b/apps/purge-object-caches/docker-compose.tests.yml index a19f62821f..1420a62a1c 100644 --- a/apps/purge-object-caches/docker-compose.tests.yml +++ b/apps/purge-object-caches/docker-compose.tests.yml @@ -43,5 +43,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/rescrape-media/.idea/mediawords.sql b/apps/rescrape-media/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/rescrape-media/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/rescrape-media/.idea/misc.xml b/apps/rescrape-media/.idea/misc.xml index 9f7e834cda..d0b1e15d09 100644 --- a/apps/rescrape-media/.idea/misc.xml +++ b/apps/rescrape-media/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/rescrape-media/.idea/rescrape-media.iml b/apps/rescrape-media/.idea/rescrape-media.iml index a23aa11380..bba1087a73 100644 --- a/apps/rescrape-media/.idea/rescrape-media.iml +++ b/apps/rescrape-media/.idea/rescrape-media.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/rescrape-media/.idea/sqlDataSources.xml b/apps/rescrape-media/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..80ebb2caa7 --- /dev/null +++ b/apps/rescrape-media/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/rescrape-media/docker-compose.tests.yml b/apps/rescrape-media/docker-compose.tests.yml index 00af15d826..42076ee535 100644 --- a/apps/rescrape-media/docker-compose.tests.yml +++ b/apps/rescrape-media/docker-compose.tests.yml @@ -50,8 +50,8 @@ services: source: 
./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/sitemap-fetch-media-pages/.idea/mediawords.sql b/apps/sitemap-fetch-media-pages/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/sitemap-fetch-media-pages/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/sitemap-fetch-media-pages/.idea/sqlDataSources.xml b/apps/sitemap-fetch-media-pages/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..d0fee50a0b --- /dev/null +++ b/apps/sitemap-fetch-media-pages/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/solr-base/Dockerfile b/apps/solr-base/Dockerfile index 9f58041061..0ff5015f9c 100644 --- a/apps/solr-base/Dockerfile +++ b/apps/solr-base/Dockerfile @@ -11,7 +11,7 @@ ENV MEDIACLOUD_SOLR_VERSION="6.5.0" # (distribution needed for running both Solr itself and ZooKeeper) RUN \ mkdir -p /opt/solr/ && \ - /dl_to_stdout.sh "https://archive.apache.org/dist/lucene/solr/${MEDIACLOUD_SOLR_VERSION}/solr-${MEDIACLOUD_SOLR_VERSION}.tgz" | \ + /dl_to_stdout.sh "https://mediacloud-archive-apache-org.s3.amazonaws.com/solr-${MEDIACLOUD_SOLR_VERSION}.tgz" | \ tar -zx -C /opt/solr/ --strip 1 && \ true diff --git a/apps/solr-zookeeper/Dockerfile b/apps/solr-zookeeper/Dockerfile index e278b78d05..ac70852292 100644 --- a/apps/solr-zookeeper/Dockerfile +++ b/apps/solr-zookeeper/Dockerfile @@ -12,7 +12,7 @@ ENV MEDIACLOUD_ZOOKEEPER_VERSION="3.4.10" # Download and extract ZooKeeper RUN \ mkdir -p /opt/zookeeper/ && \ - /dl_to_stdout.sh 
"https://archive.apache.org/dist/zookeeper/zookeeper-${MEDIACLOUD_ZOOKEEPER_VERSION}/zookeeper-${MEDIACLOUD_ZOOKEEPER_VERSION}.tar.gz" | \ + /dl_to_stdout.sh "https://mediacloud-archive-apache-org.s3.amazonaws.com/zookeeper-${MEDIACLOUD_ZOOKEEPER_VERSION}.tar.gz" | \ tar -zx -C /opt/zookeeper/ --strip 1 && \ rm -rf /opt/zookeeper/conf/ && \ true diff --git a/apps/temporal-elasticsearch/.dockerignore b/apps/temporal-elasticsearch/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-elasticsearch/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-elasticsearch/Dockerfile b/apps/temporal-elasticsearch/Dockerfile new file mode 100644 index 0000000000..cb7cd58ca9 --- /dev/null +++ b/apps/temporal-elasticsearch/Dockerfile @@ -0,0 +1,35 @@ +# +# Elasticsearch for Temporal +# + +FROM gcr.io/mcback/elasticsearch-base:latest + +USER root + +COPY config/* /opt/elasticsearch/config/ + +# Create keystore and move it to data volume +RUN \ + # + # Merge base and Temporal configs into one + cat \ + /opt/elasticsearch/config/elasticsearch-base.yml \ + /opt/elasticsearch/config/temporal-elasticsearch.yml \ + > /opt/elasticsearch/config/elasticsearch.yml && \ + # + true + +USER elasticsearch + +# Preload with Temporal index template +# (https://github.com/temporalio/temporal/blob/v1.9.2/schema/elasticsearch/v7/visibility/index_template.json) +COPY index_template.json setup_index_template.sh / +RUN /setup_index_template.sh +USER root +RUN rm /index_template.json /setup_index_template.sh +USER 
elasticsearch + +# Elasticsearch data +VOLUME /var/lib/elasticsearch + +CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] diff --git a/apps/temporal-elasticsearch/config/.dockerignore b/apps/temporal-elasticsearch/config/.dockerignore new file mode 100644 index 0000000000..b3c0a37b66 --- /dev/null +++ b/apps/temporal-elasticsearch/config/.dockerignore @@ -0,0 +1 @@ +elasticsearch.keystore diff --git a/apps/temporal-elasticsearch/config/.gitignore b/apps/temporal-elasticsearch/config/.gitignore new file mode 100644 index 0000000000..3eb03f777e --- /dev/null +++ b/apps/temporal-elasticsearch/config/.gitignore @@ -0,0 +1,3 @@ +# Might get created by a Docker container +elasticsearch.keystore + diff --git a/apps/temporal-elasticsearch/config/temporal-elasticsearch.yml b/apps/temporal-elasticsearch/config/temporal-elasticsearch.yml new file mode 100644 index 0000000000..e96f46b92d --- /dev/null +++ b/apps/temporal-elasticsearch/config/temporal-elasticsearch.yml @@ -0,0 +1,2 @@ +cluster.name: temporal-elasticsearch +node.name: temporal-elasticsearch diff --git a/apps/temporal-elasticsearch/index_template.json b/apps/temporal-elasticsearch/index_template.json new file mode 100644 index 0000000000..73d18e7d9c --- /dev/null +++ b/apps/temporal-elasticsearch/index_template.json @@ -0,0 +1,81 @@ +{ + "order": 0, + "index_patterns": [ + "temporal-visibility-*" + ], + "settings": { + "index": { + "number_of_shards": "5", + "number_of_replicas": "0", + "search.idle.after": "365d" + } + }, + "mappings": { + "dynamic": "false", + "properties": { + "NamespaceId": { + "type": "keyword" + }, + "WorkflowId": { + "type": "keyword" + }, + "RunId": { + "type": "keyword" + }, + "WorkflowType": { + "type": "keyword" + }, + "StartTime": { + "type": "long" + }, + "ExecutionTime": { + "type": "long" + }, + "CloseTime": { + "type": "long" + }, + "ExecutionStatus": { + "type": "long" + }, + "TaskQueue": { + "type": "keyword" + }, + + "Attr": { + "properties": { + "TemporalChangeVersion": { + 
"type": "keyword" + }, + "CustomStringField": { + "type": "text" + }, + "CustomKeywordField": { + "type": "keyword" + }, + "CustomIntField": { + "type": "long" + }, + "CustomDoubleField": { + "type": "double" + }, + "CustomBoolField": { + "type": "boolean" + }, + "CustomDatetimeField": { + "type": "date" + }, + "CustomNamespace": { + "type": "keyword" + }, + "Operator": { + "type": "keyword" + }, + "BinaryChecksums": { + "type": "keyword" + } + } + } + } + }, + "aliases": {} +} diff --git a/apps/temporal-elasticsearch/setup_index_template.sh b/apps/temporal-elasticsearch/setup_index_template.sh new file mode 100755 index 0000000000..ef42765eec --- /dev/null +++ b/apps/temporal-elasticsearch/setup_index_template.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -u +set -e + + +echo "Starting Elasticsearch for index setup..." +/opt/elasticsearch/bin/elasticsearch & + +for i in {1..120}; do + echo "Waiting for Elasticsearch to start..." + if curl --silent --show-error --fail "http://127.0.0.1:9200/_cluster/health"; then + break + else + sleep 1 + fi +done + + +echo "Creating Temporal index template..." +curl -XPUT "http://127.0.0.1:9200/_template/temporal-visibility-template" \ + --fail \ + --silent \ + --show-error \ + -H "Content-Type: application/json" \ + -d @index_template.json +echo "Done creating Temporal index template." + + +echo "Stopping Elasticsearch..." +killall java +while pgrep java > /dev/null; do + sleep 0.5 +done diff --git a/apps/temporal-grafana/.dockerignore b/apps/temporal-grafana/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-grafana/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. 
+# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-grafana/Dockerfile b/apps/temporal-grafana/Dockerfile new file mode 100644 index 0000000000..9eb066861a --- /dev/null +++ b/apps/temporal-grafana/Dockerfile @@ -0,0 +1,68 @@ +# +# Grafana for Temporal stats +# + +FROM gcr.io/mcback/base:latest + +# Install dependencies +RUN \ + apt-get -y --no-install-recommends install \ + libfontconfig1 \ + && \ + true + +# Install Grafana +RUN \ + mkdir -p /opt/grafana/ && \ + /dl_to_stdout.sh "https://dl.grafana.com/oss/release/grafana-7.5.5.linux-amd64.tar.gz" | \ + tar -zx -C /opt/grafana/ --strip 1 && \ + true + +RUN \ + # + # Remove sample provisioning + rm -rf /opt/grafana/conf/provisioning/ && \ + # + # Add 
unprivileged user the service will run as + useradd -ms /bin/bash temporal && \ + mkdir -p \ + /var/lib/grafana/ \ + /var/lib/grafana/logs/ \ + /var/lib/grafana/plugins/ \ + && \ + chown temporal:temporal /var/lib/grafana/ && \ + # + # Create directory for provisioning dashboards + mkdir -p /opt/grafana/dashboards/ && \ + # + true + +COPY provisioning/ /opt/grafana/conf/provisioning/ +COPY dashboards/dashboards/* /opt/grafana/dashboards/ + +# Test if submodules were checked out +RUN \ + if [ ! -f "/opt/grafana/dashboards/temporal.json" ]; then \ + echo && \ + echo "Git submodules haven't been checked out, please run:" && \ + echo && \ + echo " git submodule update --init --recursive" && \ + echo && \ + echo "and then rebuild this image." && \ + echo && \ + exit 1; \ + fi + +WORKDIR /opt/grafana/ + +ENV PATH="/opt/grafana/bin:${PATH}" + +EXPOSE 3000 + +VOLUME /var/lib/grafana/ + +USER temporal + +COPY grafana.ini /opt/grafana/conf/ + +CMD ["grafana-server", "-config", "/opt/grafana/conf/grafana.ini"] diff --git a/apps/temporal-grafana/dashboards b/apps/temporal-grafana/dashboards new file mode 160000 index 0000000000..6094dd666f --- /dev/null +++ b/apps/temporal-grafana/dashboards @@ -0,0 +1 @@ +Subproject commit 6094dd666f386e76a3c03e0049f02521210b6883 diff --git a/apps/temporal-grafana/grafana.ini b/apps/temporal-grafana/grafana.ini new file mode 100644 index 0000000000..9b9d4ca5c8 --- /dev/null +++ b/apps/temporal-grafana/grafana.ini @@ -0,0 +1,122 @@ +# possible values : production, development +app_mode = production + +# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty +instance_name = temporal-grafana + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) +data = /var/lib/grafana + +# Directory where grafana can store logs +logs = /var/lib/grafana/logs + +# Directory where grafana 
will automatically scan and look for plugins +plugins = /var/lib/grafana/plugins + +#################################### Server #################################### +[server] + +# The http port to use +http_port = 3000 + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. +reporting_enabled = false + +# Set to false to disable all checks to https://grafana.net +# for new versions (grafana itself and plugins), check is used +# in some UI views to notify that grafana or plugin update exists +# This option does not cause any auto updates, nor send any information +# only a GET request to http://grafana.com to get latest versions +check_for_updates = false + +#################################### Security #################################### +[security] +# disable creation of admin user on first start of grafana +disable_initial_admin_creation = false + +# default admin user, created on startup +admin_user = mediacloud + +# default admin password, can be changed before first start of grafana, or in profile settings +admin_password = mediacloud + +# used for signing +# (Media Cloud's Grafana is hosted behind a firewall so this can be anything really) +secret_key = wkKjdjnUL9j27QW4L2w5 + +# disable gravatar profile images +disable_gravatar = true + +# disable protection against brute force login attempts +disable_brute_force_login_protection = true + +#################################### Snapshots ########################### +[snapshots] +# snapshot sharing options +external_enabled = false + +#################################### Dashboards History ################## +[dashboards] + +# Path to the default home dashboard. 
If this value is empty, then Grafana uses StaticRootPath + "dashboards/home.json" +default_home_dashboard_path = dashboards/temporal.json + +#################################### Users ############################### +[users] +# disable user signup / registration +allow_sign_up = false + +# Allow non admin users to create organizations +allow_org_create = false + +# Background text for the user field on the login page +login_hint = mediacloud +password_hint = mediacloud + +# Default UI theme ("dark" or "light") +default_theme = light + +[auth] + +# Set to true to disable the signout link in the side menu. useful if you use auth.proxy, defaults to false +disable_signout_menu = true + +#################################### Anonymous Auth ###################### +[auth.anonymous] +# enable anonymous access +enabled = false + +#################################### Logging ########################## +[log] +# Either "console", "file", "syslog". Default is console and file +# Use space to separate multiple modes, e.g. "console file" +mode = console + +format = text + +#################################### Alerting ############################ +[alerting] +# Disable alerting engine & UI features +enabled = false + +#################################### Annotations ######################### +[annotations] +# Configures the batch size for the annotation clean-up job. This setting is used for dashboard, API, and alert annotations. +;cleanupjob_batchsize = 100 + +#################################### Explore ############################# +[explore] +# Enable the Explore section +enabled = true + +[date_formats] + +# Default timezone for user preferences. Options are 'browser' for the browser local timezone or a timezone name from IANA Time Zone database, e.g. 'UTC' or 'Europe/Amsterdam' etc. 
+default_timezone = 'America/New_York' diff --git a/apps/temporal-grafana/provisioning/dashboards/temporal.yml b/apps/temporal-grafana/provisioning/dashboards/temporal.yml new file mode 100644 index 0000000000..5fbf68b99b --- /dev/null +++ b/apps/temporal-grafana/provisioning/dashboards/temporal.yml @@ -0,0 +1,10 @@ +apiVersion: 1 +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: true + editable: false + options: + path: /opt/grafana/dashboards/ diff --git a/apps/temporal-grafana/provisioning/datasources/temporal.yml b/apps/temporal-grafana/provisioning/datasources/temporal.yml new file mode 100644 index 0000000000..9722c5904c --- /dev/null +++ b/apps/temporal-grafana/provisioning/datasources/temporal.yml @@ -0,0 +1,7 @@ +apiVersion: 1 +datasources: + - name: TemporalMetrics + type: prometheus + url: http://temporal-prometheus:9090 + access: proxy + isDefault: true diff --git a/apps/podcast-poll-due-operations/tests/python/__init__.py b/apps/temporal-grafana/provisioning/notifiers/.empty_dir similarity index 100% rename from apps/podcast-poll-due-operations/tests/python/__init__.py rename to apps/temporal-grafana/provisioning/notifiers/.empty_dir diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/__init__.py b/apps/temporal-grafana/provisioning/plugins/.empty_dir similarity index 100% rename from apps/podcast-submit-operation/src/python/podcast_submit_operation/__init__.py rename to apps/temporal-grafana/provisioning/plugins/.empty_dir diff --git a/apps/temporal-postgresql/.dockerignore b/apps/temporal-postgresql/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-postgresql/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. 
+# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-postgresql/Dockerfile b/apps/temporal-postgresql/Dockerfile new file mode 100644 index 0000000000..3e4cad27dd --- /dev/null +++ b/apps/temporal-postgresql/Dockerfile @@ -0,0 +1,98 @@ +# +# PostgreSQL server for Temporal's workflow storage +# + +FROM gcr.io/mcback/postgresql-base:latest + +USER root + +RUN \ + mkdir -p \ + /opt/temporal-postgresql/bin/ \ + /opt/temporal-postgresql/schema/ \ + && \ + # + # Install temporal-sql-tool + # Keep version that's being used in sync with temporal-server + mkdir -p /var/tmp/temporal/ && \ + /dl_to_stdout.sh "https://github.com/temporalio/temporal/releases/download/v1.9.2/temporal_1.9.2_linux_amd64.tar.gz" | \ + tar -zx 
-C /var/tmp/temporal/ && \ + mv \ + # Needed for creating the default namespace + /var/tmp/temporal/tctl \ + # Needed for temporarily starting the server at build time to create + # the default namespace + /var/tmp/temporal/temporal-server \ + # Needed for initializing default schema + /var/tmp/temporal/temporal-sql-tool \ + # + /usr/bin/ && \ + rm -rf /var/tmp/temporal/ && \ + true + +# Check out schema +RUN \ + apt-get -y --no-install-recommends install git && \ + mkdir -p /var/tmp/temporal/ && \ + cd /var/tmp/temporal/ && \ + git init && \ + git remote add origin https://github.com/temporalio/temporal.git && \ + # HEAD of "v1.9.2" tag: + git fetch --depth 1 origin d3acf160e51deb60ac798746fc06fc5c46c46269 && \ + git checkout FETCH_HEAD && \ + mv schema/postgresql/* /opt/temporal-postgresql/schema/ && \ + cd / && \ + rm -rf /var/tmp/temporal/ && \ + apt-get -y remove git && \ + apt-get -y autoremove && \ + apt-get -y clean && \ + true + +# Install envsubst for generating configuration +RUN apt-get -y --no-install-recommends install gettext-base + +RUN mkdir -p /opt/temporal-server/config/ +COPY temporal-config/* /opt/temporal-server/config/ + +# Allow a final mediacloud.yml to get generated +RUN chown postgres:postgres /opt/temporal-server/config/ + +# Copy helper scripts +COPY bin/* /opt/temporal-postgresql/bin/ + +USER postgres + +# Initialize data volume, create users, a database, and initialize it with +# schema +# If a new empty volume gets mounted to /var/lib/postgresql/ upon +# container start, Docker will copy the files from the container to the volume +RUN /opt/temporal-postgresql/bin/initialize_schema.sh + +# Remove the init script, Temporal server and configuration so that someone +# doesn't accidentally run it in production +USER root +RUN \ + rm -rf \ + /opt/temporal-postgresql/bin/initialize_schema.sh \ + /usr/bin/tctl \ + /usr/bin/temporal-server \ + /opt/temporal-server/ \ + && \ + true +USER postgres + +ENV \ + 
PATH="/opt/temporal-postgresql/bin:${PATH}" \ + # + # Make sure that we can connect via "psql" without sudoing into "postgres" user + PGHOST=localhost \ + PGPORT=5432 \ + PGUSER=temporal \ + PGPASSWORD=temporal \ + PGDATABASE=temporal + +# PostgreSQL data +VOLUME /var/lib/postgresql/ + +# Use our own wrapper script which runs schema upgrades first +CMD ["/opt/temporal-postgresql/bin/postgresql.sh"] diff --git a/apps/temporal-postgresql/bin/apply_migrations.sh b/apps/temporal-postgresql/bin/apply_migrations.sh new file mode 100755 index 0000000000..27c5fa233f --- /dev/null +++ b/apps/temporal-postgresql/bin/apply_migrations.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +set -u +set -e + +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/13/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/13/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/13/main/postgresql.conf" + +# Apply migrations when running on a different port so that clients don't end +# up connecting in the middle of migrating +TEMP_PORT=12345 + +# In case the database is in recovery, wait for up to 1 hour for it to complete +PGCTL_START_TIMEOUT=3600 + +# Start PostgreSQL on a temporary port +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -o "-c config_file=${MC_POSTGRESQL_CONF_PATH} -p ${TEMP_PORT}" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -t "${PGCTL_START_TIMEOUT}" \ + -w \ + start + +VENDOR_SCHEMA_DIR="/opt/temporal-postgresql/schema/v96" +TSQL="temporal-sql-tool \ + --plugin postgres \ + --ep 127.0.0.1 \ + -p ${TEMP_PORT} \ + -u temporal \ + --pw temporal" + +MAIN_SCHEMA_DIR="${VENDOR_SCHEMA_DIR}/temporal/versioned" +$TSQL --db temporal update-schema -d "${MAIN_SCHEMA_DIR}" + +VISIBILITY_SCHEMA_DIR="${VENDOR_SCHEMA_DIR}/visibility/versioned" +$TSQL --db temporal_visibility update-schema -d "${VISIBILITY_SCHEMA_DIR}" + +# Stop PostgreSQL +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -m fast \ + -w \ + stop diff --git a/apps/temporal-postgresql/bin/initialize_schema.sh
b/apps/temporal-postgresql/bin/initialize_schema.sh new file mode 100755 index 0000000000..df22aba2b3 --- /dev/null +++ b/apps/temporal-postgresql/bin/initialize_schema.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# FIXME reuse code between "initialize_schema.sh" and "apply_migrations.sh" +# + +set -u +set -e + +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/13/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/13/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/13/main/postgresql.conf" + +# Update memory configuration +/opt/postgresql-base/bin/update_memory_config.sh + +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -o "-c config_file=${MC_POSTGRESQL_CONF_PATH}" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -w \ + -t 1200 \ + start + +psql -v ON_ERROR_STOP=1 -c "CREATE USER temporal WITH PASSWORD 'temporal' SUPERUSER;" + +VENDOR_SCHEMA_DIR="/opt/temporal-postgresql/schema/v96" +TSQL="temporal-sql-tool \ + --plugin postgres \ + --ep 127.0.0.1 \ + -p 5432 \ + -u temporal \ + --pw temporal \ +" + +MAIN_SCHEMA_DIR="${VENDOR_SCHEMA_DIR}/temporal/versioned" +$TSQL create --db temporal +$TSQL --db temporal setup-schema -v 0.0 +$TSQL --db temporal update-schema -d "${MAIN_SCHEMA_DIR}" + +VISIBILITY_SCHEMA_DIR="${VENDOR_SCHEMA_DIR}/visibility/versioned" +$TSQL create --db temporal_visibility +$TSQL --db temporal_visibility setup-schema -v 0.0 +$TSQL --db temporal_visibility update-schema -d "${VISIBILITY_SCHEMA_DIR}" + +# Both listen on localhost and expect to find PostgreSQL locally too +export MC_TEMPORAL_POSTGRESQL_HOST="127.0.0.1" +export MC_TEMPORAL_HOST_IP="127.0.0.1" + +# Generate final config +envsubst \ + < /opt/temporal-server/config/mediacloud_template.yaml \ + > /opt/temporal-server/config/mediacloud.yaml + +# Start the server in the background +temporal-server --root /opt/temporal-server --env mediacloud start & + +# Create the default namespace whenever the server becomes ready +until tctl --ns default namespace describe < /dev/null; do + echo "Default namespace not found. Creating..." 
+ sleep 0.2 + + # FIXME retention period rather short + tctl \ + --ns default \ + namespace register \ + --rd 1 \ + --desc "Default namespace for Temporal Server" \ + || echo "Creating default namespace failed." + +done + +# Even after creating the default namespace, it doesn't become immediately ready +# so wait for a bit +echo "Waiting for the default namespace to propagate..." +sleep 30 + +killall -9 temporal-server + +# Stop PostgreSQL +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -m fast \ + -w \ + -t 1200 \ + stop + +# Create a file that will denote that we're running off a fresh data volume and +# it's the first time ever that we've started the server +cat > /var/lib/postgresql/first_run << EOF +If this file exists, it means that a fresh data volume was just mounted to the +container, and the container is about to run for the first time ever, so +there's no point in attempting to check the schema version and apply +migrations. + +After the first time this container gets run, this file will get deleted and +every subsequent run of the same container will then attempt to apply +migrations in order to upgrade the schema before continuing with anything else. +EOF +chown postgres:postgres /var/lib/postgresql/first_run diff --git a/apps/temporal-postgresql/bin/postgresql.sh b/apps/temporal-postgresql/bin/postgresql.sh new file mode 100755 index 0000000000..4b7af3a946 --- /dev/null +++ b/apps/temporal-postgresql/bin/postgresql.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -u +set -e + +# Update memory configuration +/opt/postgresql-base/bin/update_memory_config.sh + +# Run schema migrations if needed +if [ -e /var/lib/postgresql/first_run ]; then + echo "Skipping schema migrations on first run..." + rm /var/lib/postgresql/first_run +elif [ ! -z ${MC_TEMPORAL_SKIP_MIGRATIONS+x} ]; then + echo "Skipping schema migrations because 'MC_TEMPORAL_SKIP_MIGRATIONS' is set." +else + echo "Applying schema migrations..." 
+ /opt/temporal-postgresql/bin/apply_migrations.sh + echo "Done applying schema migrations." +fi + +# Start PostgreSQL +exec /opt/postgresql-base/bin/postgresql.sh diff --git a/apps/temporal-postgresql/temporal-config b/apps/temporal-postgresql/temporal-config new file mode 160000 index 0000000000..429e50e8f7 --- /dev/null +++ b/apps/temporal-postgresql/temporal-config @@ -0,0 +1 @@ +Subproject commit 429e50e8f728a1ce52a406ee0e114da2b2201ba7 diff --git a/apps/temporal-prometheus/.dockerignore b/apps/temporal-prometheus/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-prometheus/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-prometheus/Dockerfile b/apps/temporal-prometheus/Dockerfile new file mode 100644 index 0000000000..a935e541fc --- /dev/null +++ b/apps/temporal-prometheus/Dockerfile @@ -0,0 +1,32 @@ +# +# Prometheus for Temporal stats +# + +FROM gcr.io/mcback/base:latest + +RUN \ + mkdir -p /opt/prometheus/ && \ + /dl_to_stdout.sh "https://github.com/prometheus/prometheus/releases/download/v2.26.0/prometheus-2.26.0.linux-amd64.tar.gz" | \ + tar -zx -C /opt/prometheus/ --strip 1 && \ + true + +COPY prometheus.yml /opt/prometheus/ + +# Add unprivileged user the service will run as +RUN \ + useradd -ms /bin/bash temporal && \ + mkdir -p /opt/prometheus/data/ && \ + chown temporal:temporal /opt/prometheus/data/ && \ + true + +WORKDIR /opt/prometheus/ + +ENV PATH="/opt/prometheus:${PATH}" + +EXPOSE 9090 + +USER temporal + +VOLUME /opt/prometheus/data/ + +CMD ["prometheus"] diff --git a/apps/temporal-prometheus/prometheus.yml 
b/apps/temporal-prometheus/prometheus.yml new file mode 100644 index 0000000000..0a62dfbacb --- /dev/null +++ b/apps/temporal-prometheus/prometheus.yml @@ -0,0 +1,22 @@ +global: + scrape_interval: 5s + scrape_timeout: 5s + +scrape_configs: + + - job_name: 'prometheus' + static_configs: + - targets: + - 'localhost:9090' + + - job_name: 'services' + static_configs: + - targets: + # frontend + - 'temporal-server:9091' + # matching + - 'temporal-server:9092' + # history + - 'temporal-server:9093' + # worker + - 'temporal-server:9094' diff --git a/apps/temporal-server/.dockerignore b/apps/temporal-server/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-server/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-server/Dockerfile b/apps/temporal-server/Dockerfile new file mode 100644 index 0000000000..3fb7674dca --- /dev/null +++ b/apps/temporal-server/Dockerfile @@ -0,0 +1,76 @@ +# +# Temporal server +# + +FROM gcr.io/mcback/base:latest + +# Install dependencies +RUN \ + apt-get -y --no-install-recommends install \ + libprotobuf17 \ + && \ + true + +# Install Temporal server +RUN \ + # Keep version that's being used in sync with temporal-postgresql + mkdir -p /var/tmp/temporal/ && \ + /dl_to_stdout.sh "https://github.com/temporalio/temporal/releases/download/v1.9.2/temporal_1.9.2_linux_amd64.tar.gz" | \ + tar -zx -C /var/tmp/temporal/ && \ + mv /var/tmp/temporal/temporal-server /var/tmp/temporal/tctl /usr/bin/ && \ + cd / && \ + rm -rf /var/tmp/temporal/ && \ + true + +RUN \ + # + # Install envsubst for generating configuration + apt-get -y --no-install-recommends install \ + gettext-base \ + && \ + # + # Install utilities useful for tctl 
+ apt-get -y --no-install-recommends install \ + jq \ + && \ + # + # Add unprivileged user the service will run as + useradd -ms /bin/bash temporal && \ + # + # Directory for wrapper scripts + mkdir -p /opt/temporal-server/bin/ && \ + # + # Directory for configuration (has to be writable to generate final + # configuration files from templates) + mkdir -p /opt/temporal-server/config/ && \ + chown temporal:temporal /opt/temporal-server/config/ && \ + # + # Directories for workflow archival + mkdir -p \ + /var/lib/temporal/archival/temporal/ \ + /var/lib/temporal/archival/visibility/ \ + && \ + chown -R temporal:temporal /var/lib/temporal/ && \ + # + true + +COPY bin/* /opt/temporal-server/bin/ +COPY config/* /opt/temporal-server/config/ + +ENV PATH="/opt/temporal-server/bin:${PATH}" \ + # https://docs.temporal.io/docs/tctl/#environment-variables + TEMPORAL_CLI_ADDRESS="temporal-server:7233" \ + TEMPORAL_CLI_NAMESPACE="default" + +# Archives +VOLUME /var/lib/temporal/ + +EXPOSE \ + # Port descriptions: https://docs.temporal.io/docs/server-architecture/ + 6933 6934 6935 6939 7233 7234 7235 7239 \ + # Prometheus endpoints + 9091 9092 9093 9094 + +USER temporal + +CMD ["temporal.sh"] diff --git a/apps/temporal-server/bin/temporal.sh b/apps/temporal-server/bin/temporal.sh new file mode 100755 index 0000000000..31cf01edaf --- /dev/null +++ b/apps/temporal-server/bin/temporal.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +set -u +set -e + +export MC_TEMPORAL_POSTGRESQL_HOST="temporal-postgresql" + +# IP address of this container, used for binding configuration +export MC_TEMPORAL_HOST_IP=$(hostname -i) + +# Generate final config +envsubst \ + < /opt/temporal-server/config/mediacloud_template.yaml \ + > /opt/temporal-server/config/mediacloud.yaml + +# FIXME give up and crash after a while + +while true; do + echo "Waiting for PostgreSQL to start..." + if nc -z -w 10 temporal-postgresql 5432; then + break + else + sleep 1 + fi +done + +while true; do + echo "Waiting for Elasticsearch to start..."
+ if curl --silent --show-error --fail "http://temporal-elasticsearch:9200/_cluster/health"; then + break + else + sleep 1 + fi +done + +# FIXME perhaps run all four services ("frontend", "history", "matching", "worker") +# as separate containers? +exec temporal-server \ + --root /opt/temporal-server \ + --env mediacloud \ + start diff --git a/apps/temporal-server/config b/apps/temporal-server/config new file mode 160000 index 0000000000..429e50e8f7 --- /dev/null +++ b/apps/temporal-server/config @@ -0,0 +1 @@ +Subproject commit 429e50e8f728a1ce52a406ee0e114da2b2201ba7 diff --git a/apps/temporal-server/docker-compose.tests.yml b/apps/temporal-server/docker-compose.tests.yml new file mode 100644 index 0000000000..89ab2f7968 --- /dev/null +++ b/apps/temporal-server/docker-compose.tests.yml @@ -0,0 +1,194 @@ +version: "3.7" + +services: + + # Service to use for testing the Temporal service + # + # Usage: + # + # host$ ./dev/run.py temporal-server bash + # container$ python3 + # + # ...and then submit a Temporal workflow somehow. 
+ # + temporal-server: + image: gcr.io/mcback/common:latest + init: true + stop_signal: SIGKILL + depends_on: + - temporal-server-actual + - temporal-webapp + + # Actual Temporal server, operating under "temporal-server" alias + temporal-server-actual: + image: gcr.io/mcback/temporal-server:latest + init: true + stop_signal: SIGKILL + depends_on: + - temporal-postgresql + - temporal-elasticsearch + - temporal-prometheus + networks: + default: + aliases: + - temporal-server + expose: + - 6933 + - 6934 + - 6935 + - 6939 + - 7233 + - 7234 + - 7235 + - 7239 + - 9091 + - 9092 + - 9093 + - 9094 + ports: + # Expose to host for debugging + - "6933:6933" + - "6934:6934" + - "6935:6935" + - "6939:6939" + - "7233:7233" + - "7234:7234" + - "7235:7235" + - "7239:7239" + - "9091:9091" + - "9092:9092" + - "9093:9093" + - "9094:9094" + volumes: + - type: bind + source: ./bin/ + target: /opt/temporal-server/bin/ + - type: bind + source: ./config/dynamicconfig.yaml + target: /opt/temporal-server/config/dynamicconfig.yaml + - type: bind + source: ./config/mediacloud_template.yaml + target: /opt/temporal-server/config/mediacloud_template.yaml + + temporal-postgresql: + image: gcr.io/mcback/temporal-postgresql:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - 5432 + ports: + # Expose to host for debugging + - "5432:5432" + volumes: + - type: bind + source: ./../temporal-postgresql/bin/ + target: /opt/temporal-postgresql/bin/ + - type: bind + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ + + temporal-elasticsearch: + image: gcr.io/mcback/temporal-elasticsearch:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - "9200" + - "9300" + ports: + # Expose to host for debugging + - "9200:9200" + - "9300:9300" + volumes: + - type: bind + source: ./../elasticsearch-base/bin/elasticsearch.sh + target: /opt/elasticsearch/bin/elasticsearch.sh + # Not mounting config as it gets concatenated into a single file + # 
Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "2" + memory: "2G" + + temporal-prometheus: + image: gcr.io/mcback/temporal-prometheus:latest + init: true + stop_signal: SIGKILL + depends_on: + - temporal-grafana + networks: + - default + expose: + - "9090" + ports: + # Expose to host for debugging + - "9090:9090" + volumes: + - type: bind + source: ./../temporal-prometheus/prometheus.yml + target: /opt/prometheus/prometheus.yml + # Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "2" + memory: "2G" + + temporal-grafana: + image: gcr.io/mcback/temporal-grafana:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - "3000" + ports: + # Expose to host for debugging + - "3000:3000" + volumes: + - type: bind + source: ./../temporal-grafana/grafana.ini + target: /opt/grafana/conf/grafana.ini + - type: bind + source: ./../temporal-grafana/provisioning/ + target: /opt/grafana/conf/provisioning/ + - type: bind + source: ./../temporal-grafana/dashboards/dashboards/ + target: /opt/grafana/dashboards/ + # Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "2" + memory: "2G" + + temporal-webapp: + image: gcr.io/mcback/temporal-webapp:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - "8088" + ports: + # Expose to host for debugging + - "8088:8088" + # Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "1" + memory: "2G" + +networks: + default: + attachable: true + ipam: + driver: default + config: + # Use same subnet as in production + - subnet: "10.1.0.0/16" diff --git a/apps/temporal-webapp/.dockerignore b/apps/temporal-webapp/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-webapp/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build".
+# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-webapp/Dockerfile b/apps/temporal-webapp/Dockerfile new file mode 100644 index 0000000000..3f475a85b6 --- /dev/null +++ b/apps/temporal-webapp/Dockerfile @@ -0,0 +1,82 @@ +# +# Temporal webapp +# + +FROM gcr.io/mcback/base:latest + +RUN \ + # + # Add NodeSource APT repository + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - && \ + echo "deb https://deb.nodesource.com/node_14.x focal main" \ + > /etc/apt/sources.list.d/nodesource.list && \ + apt-get -y update && \ + # + # 
Install Node.js + apt-get -y --no-install-recommends install nodejs && \ + # + true + +# FIXME Vue.js still gets built in development mode +ENV NODE_ENV=production \ + NPM_CONFIG_PRODUCTION=true \ + TEMPORAL_GRPC_ENDPOINT=temporal-server:7233 \ + TEMPORAL_PERMIT_WRITE_API=true + +RUN \ + # + # Install build dependencies + apt-get -y --no-install-recommends install git && \ + # + # Create target directory + mkdir -p /opt/temporal-webapp/ && \ + # + # Download Temporal webapp + # * We use Git instead of building a released package because we need + # the submodules for the build too; + # * We check out a specific commit hash instead of a version tag to prevent + # dependency confusion + # (https://medium.com/@alex.birsan/dependency-confusion-4a5d60fec610); + # * We do some extra trickery to do a shallow copy of just a single commit + # hash to save space + time (https://stackoverflow.com/a/43136160/200603); + # * Submodule is referred to as an SSH URI, so we rewrite it to an HTTPS + # URL in .gitmodules before updating submodules.
+ # + cd /opt/temporal-webapp/ && \ + git init && \ + git remote add origin https://github.com/temporalio/web.git && \ + # HEAD of "v1.9.0" tag: + git fetch --depth 1 origin 6ed16d0dc07b4baf43e091028d98fa1fe7a29c06 && \ + git checkout FETCH_HEAD && \ + # SSH checkout doesn't work with the build container's public key not + # registered with GitHub + sed -i 's/git@github.com:/https:\/\/github.com\//g' .gitmodules && \ + git submodule init && \ + git submodule sync && \ + git submodule update --init --recursive --depth 1 && \ + # + # Build the webapp + npm install --production && \ + npm run build-production && \ + # + # Remove build dependencies + apt-get -y remove git && \ + apt-get -y autoremove && \ + apt-get -y clean && \ + # + # Remove Git history as we won't need it + rm -rf .git/ && \ + # + # Add unprivileged user the service will run as + useradd -ms /bin/bash temporal && \ + # + true + +WORKDIR /opt/temporal-webapp/ + +# Webapp port +EXPOSE 8088 + +USER temporal + +CMD ["node", "server.js"] diff --git a/apps/tools/.idea/mediawords.sql b/apps/tools/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/tools/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/tools/.idea/sqlDataSources.xml b/apps/tools/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..006e38938a --- /dev/null +++ b/apps/tools/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/tools/docker-compose.tests.yml b/apps/tools/docker-compose.tests.yml index 109d12142b..5c3191140b 100644 --- a/apps/tools/docker-compose.tests.yml +++ b/apps/tools/docker-compose.tests.yml @@ -45,8 +45,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ 
solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/topics-base/.idea/mediawords.sql b/apps/topics-base/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/topics-base/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/topics-base/.idea/sqlDataSources.xml b/apps/topics-base/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..944fa920f0 --- /dev/null +++ b/apps/topics-base/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/topics-base/docker-compose.tests.yml b/apps/topics-base/docker-compose.tests.yml index 63dcc58b7d..929cfda493 100644 --- a/apps/topics-base/docker-compose.tests.yml +++ b/apps/topics-base/docker-compose.tests.yml @@ -93,8 +93,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/topics-extract-story-links/.idea/mediawords.sql b/apps/topics-extract-story-links/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/topics-extract-story-links/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/topics-extract-story-links/.idea/misc.xml b/apps/topics-extract-story-links/.idea/misc.xml index feac02deed..2d65e1c063 100644 --- a/apps/topics-extract-story-links/.idea/misc.xml +++ b/apps/topics-extract-story-links/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/topics-extract-story-links/.idea/sqlDataSources.xml b/apps/topics-extract-story-links/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..bdf32c2882 --- /dev/null +++ 
b/apps/topics-extract-story-links/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/topics-extract-story-links/.idea/topics-extract-story-links.iml b/apps/topics-extract-story-links/.idea/topics-extract-story-links.iml index f750ca6520..09997bb235 100644 --- a/apps/topics-extract-story-links/.idea/topics-extract-story-links.iml +++ b/apps/topics-extract-story-links/.idea/topics-extract-story-links.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/topics-extract-story-links/docker-compose.tests.yml b/apps/topics-extract-story-links/docker-compose.tests.yml index 80d21f6a60..380509c5f1 100644 --- a/apps/topics-extract-story-links/docker-compose.tests.yml +++ b/apps/topics-extract-story-links/docker-compose.tests.yml @@ -75,5 +75,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/topics-fetch-link/.idea/mediawords.sql b/apps/topics-fetch-link/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/topics-fetch-link/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/topics-fetch-link/.idea/misc.xml b/apps/topics-fetch-link/.idea/misc.xml index d17c2acb4c..1fcaee9c6f 100644 --- a/apps/topics-fetch-link/.idea/misc.xml +++ b/apps/topics-fetch-link/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/topics-fetch-link/.idea/sqlDataSources.xml b/apps/topics-fetch-link/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..7cc1658bf3 --- /dev/null +++ b/apps/topics-fetch-link/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/topics-fetch-link/.idea/topics-fetch-link.iml b/apps/topics-fetch-link/.idea/topics-fetch-link.iml 
index 0d2b068e16..681b64f8cc 100644 --- a/apps/topics-fetch-link/.idea/topics-fetch-link.iml +++ b/apps/topics-fetch-link/.idea/topics-fetch-link.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/topics-fetch-link/docker-compose.tests.yml b/apps/topics-fetch-link/docker-compose.tests.yml index e5f84171bf..c4b500e35e 100644 --- a/apps/topics-fetch-link/docker-compose.tests.yml +++ b/apps/topics-fetch-link/docker-compose.tests.yml @@ -93,8 +93,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/topics-fetch-twitter-urls/.idea/mediawords.sql b/apps/topics-fetch-twitter-urls/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/topics-fetch-twitter-urls/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/topics-fetch-twitter-urls/.idea/misc.xml b/apps/topics-fetch-twitter-urls/.idea/misc.xml index 2a84bb3ad3..1541a3b4a0 100644 --- a/apps/topics-fetch-twitter-urls/.idea/misc.xml +++ b/apps/topics-fetch-twitter-urls/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/topics-fetch-twitter-urls/.idea/sqlDataSources.xml b/apps/topics-fetch-twitter-urls/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..823ea3ad86 --- /dev/null +++ b/apps/topics-fetch-twitter-urls/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/topics-fetch-twitter-urls/.idea/topics-fetch-twitter-urls.iml b/apps/topics-fetch-twitter-urls/.idea/topics-fetch-twitter-urls.iml index 30a0de4fb1..3a6d60fde3 100644 --- a/apps/topics-fetch-twitter-urls/.idea/topics-fetch-twitter-urls.iml +++ 
b/apps/topics-fetch-twitter-urls/.idea/topics-fetch-twitter-urls.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/topics-fetch-twitter-urls/docker-compose.tests.yml b/apps/topics-fetch-twitter-urls/docker-compose.tests.yml index 63fe1cd6d4..74bd3fd650 100644 --- a/apps/topics-fetch-twitter-urls/docker-compose.tests.yml +++ b/apps/topics-fetch-twitter-urls/docker-compose.tests.yml @@ -97,8 +97,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/topics-map/.idea/mediawords.sql b/apps/topics-map/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/topics-map/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/topics-map/.idea/sqlDataSources.xml b/apps/topics-map/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..b8c792255c --- /dev/null +++ b/apps/topics-map/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/topics-map/Dockerfile b/apps/topics-map/Dockerfile index 208b99b8fd..3b61e8bd37 100644 --- a/apps/topics-map/Dockerfile +++ b/apps/topics-map/Dockerfile @@ -5,7 +5,10 @@ FROM gcr.io/mcback/common:latest # Install Java -RUN apt-get -y --no-install-recommends install openjdk-8-jre-headless +RUN \ + apt-get -y update && \ + apt-get -y --no-install-recommends install openjdk-8-jre-headless && \ + true # Install fa2l Java libs RUN \ diff --git a/apps/topics-map/docker-compose.tests.yml b/apps/topics-map/docker-compose.tests.yml index 82438fcb22..d7f3d0ac2c 100644 --- a/apps/topics-map/docker-compose.tests.yml +++ b/apps/topics-map/docker-compose.tests.yml @@ -55,6 +55,5 @@ services: source: ./../postgresql-server/schema/ target: 
/opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ - + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/apps/topics-mine/docker-compose.tests.yml b/apps/topics-mine/docker-compose.tests.yml index ed2e90a371..20562aa1ee 100644 --- a/apps/topics-mine/docker-compose.tests.yml +++ b/apps/topics-mine/docker-compose.tests.yml @@ -112,8 +112,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ rabbitmq-server: image: gcr.io/mcback/rabbitmq-server:latest diff --git a/apps/topics-mine/src/python/topics_mine/posts/__init__.py b/apps/topics-mine/src/python/topics_mine/posts/__init__.py index 4465abaf93..1b62e05a95 100644 --- a/apps/topics-mine/src/python/topics_mine/posts/__init__.py +++ b/apps/topics-mine/src/python/topics_mine/posts/__init__.py @@ -27,7 +27,7 @@ def fetch_posts_from_api( sample: Optional[int] = None, page_size: Optional[int] = None, ) -> list: - raise NotImplemented("Abstract method") + raise NotImplementedError("Abstract method") def validate_mock_post(self, got_post: dict, expected_post: dict) -> None: """Validate that got_post matches expected_post. 
diff --git a/apps/topics-snapshot/.idea/mediawords.sql b/apps/topics-snapshot/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/topics-snapshot/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/topics-snapshot/.idea/misc.xml b/apps/topics-snapshot/.idea/misc.xml index 3a0005b7b1..ef39d9adbd 100644 --- a/apps/topics-snapshot/.idea/misc.xml +++ b/apps/topics-snapshot/.idea/misc.xml @@ -6,5 +6,5 @@ - + \ No newline at end of file diff --git a/apps/topics-snapshot/.idea/sqlDataSources.xml b/apps/topics-snapshot/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..bea78ab59b --- /dev/null +++ b/apps/topics-snapshot/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/topics-snapshot/.idea/topics-snapshot.iml b/apps/topics-snapshot/.idea/topics-snapshot.iml index b92bf463b9..64cb979160 100644 --- a/apps/topics-snapshot/.idea/topics-snapshot.iml +++ b/apps/topics-snapshot/.idea/topics-snapshot.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/topics-snapshot/docker-compose.tests.yml b/apps/topics-snapshot/docker-compose.tests.yml index d0dbe3db4c..bc0a845028 100644 --- a/apps/topics-snapshot/docker-compose.tests.yml +++ b/apps/topics-snapshot/docker-compose.tests.yml @@ -97,8 +97,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/webapp-api/.idea/mediawords.sql b/apps/webapp-api/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/webapp-api/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git 
a/apps/webapp-api/.idea/sqlDataSources.xml b/apps/webapp-api/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..08b89ea572 --- /dev/null +++ b/apps/webapp-api/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/webapp-api/docker-compose.tests.yml b/apps/webapp-api/docker-compose.tests.yml index 5f2963bc66..af67c24c42 100644 --- a/apps/webapp-api/docker-compose.tests.yml +++ b/apps/webapp-api/docker-compose.tests.yml @@ -69,8 +69,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm index 2e1512d9c4..15a6c2df1c 100644 --- a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm +++ b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm @@ -14,6 +14,7 @@ use namespace::autoclean; use MediaWords::DBI::Stories; use MediaWords::DBI::Stories::WordMatrix; +use MediaWords::DBI::Stories::WordMatrixOldStopwords; # FIXME remove once stopword comparison is over use MediaWords::Solr; use MediaWords::Solr::TagCounts; use MediaWords::Util::ParseHTML; @@ -165,16 +166,23 @@ SQL } # add a word_count field to each story that includes a word count for that story -sub _attach_word_counts_to_stories($$) +# FIXME remove extra "$" once stopword comparison is over +sub _attach_word_counts_to_stories($$$) { - my ( $db, $stories ) = @_; + # FIXME remove extra parameter once stopword comparison is over + my ( $db, $stories, $old_stopwords ) = @_; my $stories_ids = [ map { $_->{ stories_id } } @{ $stories } ]; my $stories_lookup = {}; map { $stories_lookup->{ $_->{ stories_id } } = $_ } @{ $stories }; - my ( 
$word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, $stories_ids ); + my ( $word_matrix, $word_list ); + if ( $old_stopwords ) { + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrixOldStopwords::get_story_word_matrix( $db, $stories_ids ); + } else { + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, $stories_ids ); + } while ( my ( $stories_id, $word_counts ) = each( %{ $word_matrix } ) ) { @@ -322,7 +330,9 @@ SQL $stories = MediaWords::DBI::Stories::attach_story_data_to_stories( $stories, $feed_data, 'feeds' ); } - $stories = _attach_word_counts_to_stories( $db, $stories ) if ( int( $self->{ show_wc } // 0 ) ); + if ( int( $self->{ show_wc } // 0 ) ) { + $stories = _attach_word_counts_to_stories( $db, $stories, $self->{ old_stopwords } ); + } return $stories; } @@ -381,6 +391,8 @@ sub _fetch_list($$$$$$) $self->{ show_text } = int( $c->req->params->{ text } // 0 ); $self->{ show_ap_stories_id } = int( $c->req->params->{ ap_stories_id } // 0 ); $self->{ show_wc } = int( $c->req->params->{ wc } // 0 ); + # FIXME remove once stopword comparison is over + $self->{ old_stopwords } = int( $c->req->params->{ old_stopwords } // 0 ); $self->{ show_feeds } = int( $c->req->params->{ show_feeds } // 0 ); $rows //= 20; @@ -544,7 +556,13 @@ sub word_matrix_GET my $stories_ids = MediaWords::Solr::search_solr_for_stories_ids( $db, { q => $q, fq => $fq, rows => $rows, sort => 'random_1 asc' } ); - my ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, $stories_ids ); + my ( $word_matrix, $word_list ); + if ( $c->req->params->{ old_stopwords } ) { + # FIXME remove once stopword comparison is over + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrixOldStopwords::get_story_word_matrix( $db, $stories_ids ); + } else { + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, 
$stories_ids ); + } $self->status_ok( $c, entity => { word_matrix => $word_matrix, word_list => $word_list } ); diff --git a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm index c3c39f024f..d1c560185d 100644 --- a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm +++ b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm @@ -9,6 +9,9 @@ use List::Util qw(first max maxstr min minstr reduce shuffle sum); use Moose; use namespace::autoclean; use MediaWords::Solr; +use MediaWords::Solr::WordCounts; +use MediaWords::Solr::WordCountsOldStopwords; # FIXME remove once stopword comparison is over + =head1 NAME @@ -47,7 +50,13 @@ sub list_GET : PathPrefix( '/api' ) $c->req->params->{ sample_size } = $sample_size; - my $wc = MediaWords::Solr::WordCounts->new( { db => $c->dbis, cgi_params => $c->req->params } ); + my $wc; + if ( $c->req->params->{ old_stopwords } ) { + # FIXME remove once stopword comparison is over + $wc = MediaWords::Solr::WordCountsOldStopwords->new( { db => $c->dbis, cgi_params => $c->req->params } ); + } else { + $wc = MediaWords::Solr::WordCounts->new( { db => $c->dbis, cgi_params => $c->req->params } ); + } my $words = $wc->get_words; diff --git a/apps/webapp-api/src/perl/MediaWords/DBI/Stories/WordMatrixOldStopwords.pm b/apps/webapp-api/src/perl/MediaWords/DBI/Stories/WordMatrixOldStopwords.pm new file mode 100644 index 0000000000..b664d8116d --- /dev/null +++ b/apps/webapp-api/src/perl/MediaWords/DBI/Stories/WordMatrixOldStopwords.pm @@ -0,0 +1,152 @@ +# FIXME remove once stopword comparison is over +package MediaWords::DBI::Stories::WordMatrixOldStopwords; + +use strict; +use warnings; + +use Modern::Perl "2015"; +use MediaWords::CommonLibs; + +use List::MoreUtils qw(natatime); + +use MediaWords::Solr::WordCountsOldStopwords; + +# get a postgres cursor that will return the concatenated story_sentences for each of the given stories_ids. 
use +# $sentence_separator to join the sentences for each story. Returns an arrayref of row hashes (stories_id, language, story_text), or an empty arrayref when no stories_ids are given. +sub _get_story_word_matrix_rows($$$) +{ + my ( $db, $stories_ids, $sentence_separator ) = @_; + + return [] unless ( @{ $stories_ids } ); + + my $stories_ids_list = join( ',', map { int( $_ ) } @{ $stories_ids } ); # int() keeps the interpolated ID list numeric-only + + my $ids_table = $db->get_temporary_ids_table( $stories_ids ); # NOTE(review): $ids_table is not referenced by the query below -- confirm this call is still needed + my $rows = $db->query( <<SQL, $sentence_separator )->hashes; +select stories_id, language, string_agg( sentence, \$1 ) story_text + from story_sentences + where stories_id in ( $stories_ids_list ) + group by stories_id, language + order by stories_id, language +SQL + + return $rows; +} + +# Given a list of stories_ids, generate a matrix consisting of the vector of word stem counts for each stories_id on each +# line. Return a hash of story word counts and a list of word stems. +# +# The list of story word counts is in the following format: +# { +# { stories_id => +# { word_index => count, +# word_index => count +# } +# }, +# ... +# ] +# +# The id of each word is the index of the given word in the word list. The word list is a list of lists, with each +# member list consisting of the stem followed by the most commonly used term. +# +# For example, for stories_ids 1 and 2, both of which contain 4 mentions of 'foo' and 10 of 'bars', the word count +# hash and word list look like: +# +# [ { 1 => { 0 => 4, 1 => 10 } }, { 2 => { 0 => 4, 1 => 10 } } ] +# +# [ [ 'foo', 'foo' ], [ 'bar', 'bars' ] ] +# +# The story_sentences for each story will be used for word counting. If $max_words is specified, only the most common +# $max_words will be used for each story. +# +# The function uses MediaWords::Util::IdentifyLanguage to identify the stemming and stopwording language for each story. +# If the language of a given story is not supported, stemming and stopwording become null operations. For the list of +# languages supported, see @MediaWords::Languages::Language::_supported_languages.
+sub get_story_word_matrix($$;$) +{ + my ( $db, $stories_ids, $max_words ) = @_; + + my $word_index_lookup = {}; + my $word_index_sequence = 0; + my $word_term_counts = {}; + + my $use_transaction = !$db->in_transaction(); + $db->begin if ( $use_transaction ); + + my $sentence_separator = 'SPLITSPLIT'; + my $story_text_cursor = # NOTE(review): dangling assignment -- chains into the $word_matrix declaration below; $story_text_cursor is never used + + my $word_matrix = {}; + my $iter = natatime( 100, @{ $stories_ids } ); + while ( my @chunk_stories_ids = $iter->() ) + { + my $stories = _get_story_word_matrix_rows( $db, \@chunk_stories_ids, $sentence_separator ); + + for my $story ( @{ $stories } ) + { + my $wc = MediaWords::Solr::WordCountsOldStopwords->new(); + + # Remove stopwords from the stems + $wc->include_stopwords( 0 ); + + my $sentences_and_story_languages = []; + for my $sentence ( split( $sentence_separator, $story->{ story_text } ) ) + { + push( + @{ $sentences_and_story_languages }, + { + 'story_language' => $story->{ language }, + 'sentence' => $sentence, + } + ); + } + + my $stem_counts = $wc->count_stems( $sentences_and_story_languages ); + + my $stem_count_list = []; + while ( my ( $stem, $data ) = each( %{ $stem_counts } ) ) + { + push( @{ $stem_count_list }, [ $stem, $data->{ count }, $data->{ terms } ] ); + } + + if ( $max_words ) + { + $stem_count_list = [ sort { $b->[ 1 ] <=> $a->[ 1 ] } @{ $stem_count_list } ]; + splice( @{ $stem_count_list }, $max_words ) if ( scalar( @{ $stem_count_list } ) > $max_words ); # list is sorted by descending count: keep the first $max_words stems, drop the tail + } + + $word_matrix->{ $story->{ stories_id } } //= {}; + my $stem_vector = $word_matrix->{ $story->{ stories_id } }; + for my $stem_count ( @{ $stem_count_list } ) + { + my ( $stem, $count, $terms ) = @{ $stem_count }; + + $word_index_lookup->{ $stem } //= $word_index_sequence++; + my $index = $word_index_lookup->{ $stem }; + + $stem_vector->{ $index } += $count; + + map { $word_term_counts->{ $stem }->{ $_ } += $terms->{ $_ } } keys( %{ $terms } ); + } + } + } + + $db->commit if ( $use_transaction ); + + my $word_list = []; + for my $stem ( keys( %{ $word_index_lookup } ) ) + { + my $term_pairs = []; +
while ( my ( $term, $count ) = each( %{ $word_term_counts->{ $stem } } ) ) + { + push( @{ $term_pairs }, [ $term, $count ] ); + } + + $term_pairs = [ sort { $b->[ 1 ] <=> $a->[ 1 ] } @{ $term_pairs } ]; + $word_list->[ $word_index_lookup->{ $stem } ] = [ $stem, $term_pairs->[ 0 ]->[ 0 ] ]; + } + + return ( $word_matrix, $word_list ); +} + +1; diff --git a/apps/webapp-api/src/perl/MediaWords/Solr/WordCountsOldStopwords.pm b/apps/webapp-api/src/perl/MediaWords/Solr/WordCountsOldStopwords.pm new file mode 100644 index 0000000000..8af5a061e5 --- /dev/null +++ b/apps/webapp-api/src/perl/MediaWords/Solr/WordCountsOldStopwords.pm @@ -0,0 +1,447 @@ +# FIXME remove once stopword comparison is over +package MediaWords::Solr::WordCountsOldStopwords; + +use Moose; + +=head1 NAME + +MediaWords::Solr::WordCounts - handle word counting from solr + +=head1 DESCRIPTION + +Uses sampling to generate quick word counts from solr queries. + +=cut + +use strict; +use warnings; +use utf8; + +use Modern::Perl "2015"; +use MediaWords::CommonLibs; + +use CHI; +use Data::Dumper; +use Encode; +use List::Util; +use Readonly; +use URI::Escape; + +use MediaWords::Languages::Language; +use MediaWords::Solr; +use MediaWords::Solr::Query::MatchingSentences; +use MediaWords::Util::ParseJSON; +use MediaWords::Util::Text; + +# Max. length of the sentence to tokenize +Readonly my $MAX_SENTENCE_LENGTH => 1024; + +# Max.
number of times to count a word in a single sentence +Readonly my $MAX_REPEATS_PER_SENTENCE => 3; + +# mediawords.wc_cache_version from config +my $_wc_cache_version; + +# Moose instance fields + +has 'q' => ( is => 'rw', isa => 'Str' ); +has 'fq' => ( is => 'rw', isa => 'ArrayRef' ); +has 'num_words' => ( is => 'rw', isa => 'Int', default => 500 ); +has 'sample_size' => ( is => 'rw', isa => 'Int', default => 1000 ); +has 'random_seed' => ( is => 'rw', isa => 'Int', default => 1 ); +has 'ngram_size' => ( is => 'rw', isa => 'Int', default => 1 ); +has 'include_stopwords' => ( is => 'rw', isa => 'Bool' ); +has 'include_stats' => ( is => 'rw', isa => 'Bool' ); +has 'cached_combined_stopwords' => ( is => 'rw', isa => 'HashRef' ); +has 'db' => ( is => 'rw' ); + +# list of all attribute names that should be exposed as cgi params +sub __get_cgi_param_attributes() +{ + return [ qw(q fq num_words sample_size random_seed include_stopwords include_stats ngram_size) ]; +} + +# return hash of attributes for use as cgi params +sub _get_cgi_param_hash($) +{ + my ( $self ) = @_; + + my $keys = __get_cgi_param_attributes(); + + my $meta = $self->meta; + + my $hash = {}; + map { $hash->{ $_ } = $meta->get_attribute( $_ )->get_value( $self ) } @{ $keys }; + + return $hash; +} + +# add support for constructor in this form: +# WordsCounts->new( cgi_params => $cgi_params ) +# where $cgi_params is a hash of cgi params directly from a web request +around BUILDARGS => sub { + my $orig = shift; + my $class = shift; + + my $args; + if ( ref( $_[ 0 ] ) ) + { + $args = $_[ 0 ]; + } + elsif ( defined( $_[ 0 ] ) ) + { + $args = { @_ }; + } + else + { + $args = {}; + } + + my $vals; + if ( $args->{ cgi_params } ) + { + my $cgi_params = $args->{ cgi_params }; + + $vals = {}; + my $keys = __get_cgi_param_attributes(); + for my $key ( @{ $keys } ) + { + if ( exists( $cgi_params->{ $key } ) ) + { + $vals->{ $key } = $cgi_params->{ $key }; + } + } + + if ( $args->{ db } ) + { + $vals->{ db } = 
$args->{ db }; + } + } + else + { + $vals = $args; + } + + if ( $vals->{ fq } && !ref( $vals->{ fq } ) ) + { + $vals->{ fq } = [ $vals->{ fq } ]; + } + + $vals->{ fq } ||= []; + + return $class->$orig( $vals ); +}; + +# Cache merged hashes of stopwords for speed +sub _combine_stopwords($$) +{ + my ( $self, $languages ) = @_; + + unless ( ref( $languages ) eq ref( [] ) ) + { + die "Languages is not an arrayref."; + } + unless ( scalar( @{ $languages } ) > 0 ) + { + die "Languages should have at least one language set."; + } + + my $language_lookup = {}; + my $deduped_languages = []; + for my $language ( @{ $languages } ) + { + unless ( $language_lookup->{ $language->language_code() } ) + { + push( @{ $deduped_languages }, $language ); + $language_lookup->{ $language->language_code() } = 1; + } + } + + $languages = $deduped_languages; + + my $language_codes = []; + foreach my $language ( @{ $languages } ) + { + push( @{ $language_codes }, $language->language_code() ); + } + $language_codes = [ sort( @{ $language_codes } ) ]; + + my $cache_key = join( '-', @{ $language_codes } ); + + unless ( $self->cached_combined_stopwords() ) + { + $self->cached_combined_stopwords( {} ); + } + + unless ( defined $self->cached_combined_stopwords->{ $cache_key } ) + { + my $combined_stopwords = {}; + foreach my $language ( @{ $languages } ) + { + my $stopwords = $language->stop_words_old_map(); + $combined_stopwords = { ( %{ $combined_stopwords }, %{ $stopwords } ) }; + } + + $self->cached_combined_stopwords->{ $cache_key } = $combined_stopwords; + } + + return $self->cached_combined_stopwords->{ $cache_key }; +} + +# expects story_sentence hashes, with a story_language field. +# +# parse the text and return a count of stems and terms in the sentence in the +# following format: +# +# { $stem => { count => $stem_count, terms => { $term => $term_count, ... 
} } } +# +# if ngram_size is > 1, use the unstemmed phrases of ngram_size as the stems +sub count_stems($$) +{ + my ( $self, $story_sentences ) = @_; + + # Set any duplicate sentences blank + my $dup_sentences = {}; + + # Tokenize each sentence and add count to $words for each token + my $stem_counts = {}; + for my $story_sentence ( @{ $story_sentences } ) + { + next unless ( defined( $story_sentence ) ); + + my $sentence = $story_sentence->{ 'sentence' }; + next unless ( defined( $sentence ) ); + + next if ( $dup_sentences->{ $sentence } ); + $dup_sentences->{ $sentence } = 1; + + # Very long sentences tend to be noise -- html text and the like. + $sentence = substr( $sentence, 0, $MAX_SENTENCE_LENGTH ) if ( length( $sentence ) > $MAX_SENTENCE_LENGTH ); + + # Remove urls so they don't get tokenized into noise + if ( $sentence =~ m~https?://[^\s]+~i ) + { + $sentence =~ s~https?://[^\s]+~~gi; + } + + my $story_language = $story_sentence->{ 'story_language' } || 'en'; + my $sentence_language = $story_sentence->{ language } || 'en'; + + # Language objects are cached in ::Languages::Language, no need to have a separate cache + my $lang_en = MediaWords::Languages::Language::default_language(); + my $lang_story = MediaWords::Languages::Language::language_for_code( $story_language ) || $lang_en; + my $lang_sentence = MediaWords::Languages::Language::language_for_code( $sentence_language ) || $lang_en; + + # Tokenize into words + my $sentence_words = $lang_sentence->split_sentence_to_words( $sentence ); + + # Remove stopwords; + # (don't stem stopwords first as they will usually be stemmed too much) + my $combined_stopwords = {}; + unless ( $self->include_stopwords ) + { + # Use both sentence's language and English stopwords + $combined_stopwords = $self->_combine_stopwords( [ $lang_en, $lang_story, $lang_sentence ] ); + } + + sub _word_is_valid_token($$) + { + my ( $word, $stopwords ) = @_; + + # Remove numbers + if ( $word =~ /^\d+?$/ ) + { + return 0; + } + + # Remove 
stopwords + if ( $stopwords->{ $word } ) + { + return 0; + } + + return 1; + } + + $sentence_words = [ grep { _word_is_valid_token( $_, $combined_stopwords ) } @{ $sentence_words } ]; + + # Stem using sentence language's algorithm + my $sentence_word_stems = + ( $self->ngram_size > 1 ) ? $sentence_words : $lang_sentence->stem_words( $sentence_words ); + + my $n = $self->ngram_size; + my $num_ngrams = scalar( @{ $sentence_words } ) - $n + 1; + + my $sentence_stem_counts = {}; + + for ( my $i = 0 ; $i < $num_ngrams ; ++$i ) + { + my $term = join( ' ', @{ $sentence_words }[ $i .. ( $i + $n - 1 ) ] ); + my $stem = join( ' ', @{ $sentence_word_stems }[ $i .. ( $i + $n - 1 ) ] ); + + $sentence_stem_counts->{ $stem } //= {}; + ++$sentence_stem_counts->{ $stem }->{ count }; + + next if ( $sentence_stem_counts->{ $stem }->{ count } > $MAX_REPEATS_PER_SENTENCE ); + + $stem_counts->{ $stem } //= {}; + ++$stem_counts->{ $stem }->{ count }; + + $stem_counts->{ $stem }->{ terms } //= {}; + ++$stem_counts->{ $stem }->{ terms }->{ $term }; + } + } + + return $stem_counts; +} + +# connect to solr server directly and count the words resulting from the query +sub _get_words_from_solr_server($) +{ + my ( $self ) = @_; + + my $db = $self->db; + + unless ( $self->q() || ( $self->fq && @{ $self->fq } ) ) + { + return []; + } + + my $solr_params = { + q => $self->q(), + fq => $self->fq, + rows => $self->sample_size, + sort => 'random_' . $self->random_seed . ' asc' + }; + + DEBUG( "executing solr query ..." ); + DEBUG Dumper( $solr_params ); + + my $story_sentences = MediaWords::Solr::Query::MatchingSentences::query_matching_sentences( $self->db, $solr_params, $self->sample_size ); + + DEBUG( "counting sentences..." 
); + my $words = $self->count_stems( $story_sentences ); + DEBUG( "done counting sentences" ); + + my @word_list; + while ( my ( $stem, $count ) = each( %{ $words } ) ) + { + push( @word_list, { stem => $stem, count => $count->{ count } } ); + } + + @word_list = sort { + $b->{ count } <=> $a->{ count } or # + $b->{ stem } cmp $a->{ stem } # + } @word_list; + + my $counts = []; + for my $w ( @word_list ) + { + my $terms = $words->{ $w->{ stem } }->{ terms }; + my ( $max_term, $max_term_count ); + while ( my ( $term, $term_count ) = each( %{ $terms } ) ) + { + if ( !$max_term || ( $term_count > $max_term_count ) ) + { + $max_term = $term; + $max_term_count = $term_count; + } + } + + if ( !MediaWords::Util::Text::is_valid_utf8( $w->{ stem } ) || !MediaWords::Util::Text::is_valid_utf8( $max_term ) ) + { + WARN "invalid utf8: $w->{ stem } / $max_term"; + next; + } + + push( @{ $counts }, { stem => $w->{ stem }, count => $w->{ count }, term => $max_term } ); + } + + splice( @{ $counts }, $self->num_words ); + + if ( $self->include_stats ) + { + return { + stats => { + num_words_returned => scalar( @{ $counts } ), + num_sentences_returned => scalar( @{ $story_sentences } ), + num_words_param => $self->num_words, + sample_size_param => $self->sample_size, + random_seed => $self->random_seed + }, + words => $counts + }; + } + else + { + return $counts; + } +} + +# return CHI cache for word counts +sub _get_cache +{ + return CHI->new( + driver => 'File', + expires_in => '1 day', + expires_variance => '0.1', + root_dir => "/var/cache/word_counts", + depth => 4 + ); +} + +# return key that uniquely identifies the query +sub _get_cache_key +{ + my ( $self ) = @_; + + $_wc_cache_version = '1'; + + my $meta = $self->meta; + + my $keys = $self->__get_cgi_param_attributes(); + + my $hash_key = "$_wc_cache_version:" . 
Dumper( map { $meta->get_attribute( $_ )->get_value( $self ) } @{ $keys } ); + + return $hash_key; +} + +# get a cached value for the given word count +sub _get_cached_words +{ + my ( $self ) = @_; + + return $self->_get_cache->get( $self->_get_cache_key ); +} + +# set a cached value for the given word count +sub _set_cached_words +{ + my ( $self, $value ) = @_; + + return $self->_get_cache->set( $self->_get_cache_key, $value ); +} + +# get sorted list of most common words in sentences matching a Solr query, +# exclude stop words. Assumes english stemming and stopwording for now. +sub get_words +{ + my ( $self ) = @_; + + my $words; + + $words = $self->_get_cached_words; + + if ( $words ) + { + return $words; + } + + $words ||= $self->_get_words_from_solr_server(); + + $self->_set_cached_words( $words ); + + return $words; +} + +1; diff --git a/apps/webapp-httpd/docker-compose.tests.yml b/apps/webapp-httpd/docker-compose.tests.yml index 4569acad28..a422bcb5b7 100644 --- a/apps/webapp-httpd/docker-compose.tests.yml +++ b/apps/webapp-httpd/docker-compose.tests.yml @@ -98,8 +98,8 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ solr-shard-01: image: gcr.io/mcback/solr-shard:latest diff --git a/apps/word2vec-generate-snapshot-model/.idea/mediawords.sql b/apps/word2vec-generate-snapshot-model/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/word2vec-generate-snapshot-model/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/word2vec-generate-snapshot-model/.idea/sqlDataSources.xml b/apps/word2vec-generate-snapshot-model/.idea/sqlDataSources.xml new file mode 100644 index 0000000000..7a90f0188b --- /dev/null +++ 
b/apps/word2vec-generate-snapshot-model/.idea/sqlDataSources.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml b/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml index 294d99757a..fdc9cf0a10 100644 --- a/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml +++ b/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml @@ -49,5 +49,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ - target: /etc/postgresql/11/main/ + source: ./../postgresql-base/conf/ + target: /etc/postgresql/13/main/ diff --git a/doc/docker_cheat_sheet.markdown b/doc/docker_cheat_sheet.markdown index 2338c1ea67..cb2c21b191 100644 --- a/doc/docker_cheat_sheet.markdown +++ b/doc/docker_cheat_sheet.markdown @@ -91,7 +91,7 @@ To access PostgreSQL directly, you can either run `psql` in a `postgresql-server ```bash $ docker exec -it 29a psql - psql (11.3 (Ubuntu 11.3-1.pgdg20.04+1)) + psql (13.3 (Ubuntu 11.3-1.pgdg20.04+1)) Type "help" for help. mediacloud=# @@ -147,7 +147,7 @@ To access PostgreSQL directly, you can either run `psql` in a `postgresql-server Password for user mediacloud: Timing is on. Expanded display is on. - psql (11.3) + psql (13.3) Type "help" for help. mediacloud=# diff --git a/doc/podcasts_gc_auth.markdown b/doc/podcasts_gc_auth.markdown index 88e98fbcc7..a43366039c 100644 --- a/doc/podcasts_gc_auth.markdown +++ b/doc/podcasts_gc_auth.markdown @@ -60,13 +60,20 @@ In order to transcribe podcast episodes using Google Cloud's Speech API, you'll gcloud services enable speech.googleapis.com ``` -9. Create a Cloud Storage bucket to store episode audio files (if one doesn't exist already): +9. 
Create three Cloud Storage buckets: ```shell - gsutil mb gs://mc-podcast-episodes-audio-files-test + # Raw (non-transcoded) enclosures fetched from podcast websites + gsutil mb gs://mc-podcast-raw-enclosures-test + + # Transcoded episodes ready for submission to the Speech API + gsutil mb gs://mc-podcast-transcoded-episodes-test + + # Transcript JSON files + gsutil mb gs://mc-podcast-transcripts-test ``` -10. Create a service account that the podcast transcribing apps would use: +10. Create a service account that the podcast transcribing workflow will use: ```shell gcloud iam service-accounts create mc-transcribe-podcasts-test \ @@ -74,12 +81,18 @@ In order to transcribe podcast episodes using Google Cloud's Speech API, you'll --description="(test) Upload episodes to GCS, submit them to Speech API, fetch transcripts" ``` -11. Allow the service account to read / write objects from bucket (here `mc-upload-episode-audio-files` is the service account name, and `mc-podcast-transcription-test` is the Google Cloud project ID): +11. Allow the service account to read / write objects from buckets (here `mc-upload-episode-audio-files` is the service account name, and `mc-podcast-transcription-test` is the Google Cloud project ID): ```shell gsutil acl ch \ - -u mc-transcribe-podcasts-test@mc-podcast-transcription-test.iam.gserviceaccount.com:O \ - gs://mc-podcast-episodes-audio-files-test + -u mc-transcribe-podcasts-test@meag-podcast-transcription-tst.iam.gserviceaccount.com:O \ + gs://mc-podcast-raw-enclosures-test + gsutil acl ch \ + -u mc-transcribe-podcasts-test@meag-podcast-transcription-tst.iam.gserviceaccount.com:O \ + gs://mc-podcast-transcoded-episodes-test + gsutil acl ch \ + -u mc-transcribe-podcasts-test@meag-podcast-transcription-tst.iam.gserviceaccount.com:O \ + gs://mc-podcast-transcripts-test ``` 12. 
Generate authentication JSON credentials: @@ -87,7 +100,7 @@ In order to transcribe podcast episodes using Google Cloud's Speech API, you'll ```shell gcloud iam service-accounts keys create \ mc-transcribe-podcasts-test.json \ - --iam-account mc-transcribe-podcasts-test@mc-podcast-transcription-test.iam.gserviceaccount.com + --iam-account mc-transcribe-podcasts-test@meag-podcast-transcription-tst.iam.gserviceaccount.com ``` 13. Encode contents of `mc-transcribe-podcasts-test.json` to Base64: @@ -96,4 +109,4 @@ In order to transcribe podcast episodes using Google Cloud's Speech API, you'll base64 mc-transcribe-podcasts-test.json ``` -13. Copy the resulting Base64-encoded string to `MC_PODCAST_GC_AUTH_JSON_BASE64` environment variable that's set for apps using Google Cloud services for podcast transcription. +14. Copy the resulting Base64-encoded string to the `MC_PODCAST_AUTH_JSON_BASE64` environment variable that's set for apps using Google Cloud services for podcast transcription. diff --git a/doc/postgresql_upgrade.markdown b/doc/postgresql_upgrade.markdown new file mode 100644 index 0000000000..24ea81db47 --- /dev/null +++ b/doc/postgresql_upgrade.markdown @@ -0,0 +1,163 @@ +# PostgreSQL upgrade + +To upgrade PostgreSQL between two (e.g. 12 -> 13) or more (e.g. 11 -> 12 -> 13) versions, do the following: + + +## Preparation (up to a week before) + +1. Sync the initial PostgreSQL dataset to a backup server: + + ```bash + production$ sudo zfs snapshot space/mediacloud/vol_postgresql_data@11_initial + + production$ sudo zfs send space/mediacloud/vol_postgresql_data@11_initial | \ + mbuffer -s 128k -m 10M | \ + pv | \ + ssh backup sudo zfs receive -F space/mediacloud/vol_postgresql_data + ``` + +2. Update `apps/postgresql-upgrade/Dockerfile` for it to install the version that you're upgrading *from* and the in-between versions if needed, and then build + push the image.
+ + You should end up with an image that includes all PostgreSQL versions that are needed for upgrading, e.g. if you're upgrading from 11 to 13, `postgresql-upgrade` should include PostgreSQL versions 11, 12 and 13: + + ```dockerfile + # Parent image already installs PostgreSQL 13 + FROM gcr.io/mcback/postgresql-server:latest + + # <...> + + RUN \ + # + # Install PostgreSQL 11 (oldest version) + apt-get -y --no-install-recommends install \ + postgresql-11 \ + postgresql-client-11 \ + postgresql-contrib-11 \ + postgresql-plperl-11 \ + && \ + # + # Install PostgreSQL 12 (intermediate version) + apt-get -y --no-install-recommends install \ + postgresql-12 \ + postgresql-client-12 \ + postgresql-contrib-12 \ + postgresql-plperl-12 \ + && \ + # + true + ``` + +3. Run a test upgrade on a backup server to find out if it works and how long it will take: + + ```bash + backup$ time docker run -it \ + --shm-size=64g \ + -v /space/mediacloud/vol_postgresql_data:/var/lib/postgresql/ \ + gcr.io/mcback/postgresql-upgrade \ + postgresql_upgrade.py \ + --source_version=11 \ + --target_version=13 \ + &> test_postgresql_upgrade.log + + backup$ sudo zfs rollback space/mediacloud/vol_postgresql_data@11_initial + ``` + + If it doesn't work, fix the issues on the production server and `zfs send -i old_snapshot new_snapshot` the changes. Rinse and repeat until it works. + + Take note of how long the upgrade script takes to run. + + +## Pre-upgrade (a day before) + +4. A day or so before the upgrade, create a new dataset snapshot and sync it to the backup server. + + This is done to reduce the time it will require to sync the final snapshot after the database is down for the upgrade.
+ + ```bash + production$ sudo zfs snapshot space/mediacloud/vol_postgresql_data@11_intermediate + + production$ sudo zfs send -i \ + space/mediacloud/vol_postgresql_data@11_initial \ + space/mediacloud/vol_postgresql_data@11_intermediate \ + | \ + mbuffer -s 128k -m 10M | \ + pv | \ + ssh backup sudo zfs receive -F space/mediacloud/vol_postgresql_data + ``` + + +## Upgrade + +5. Stop all services: + + ```bash + docker service rm mediacloud + ``` + + Make sure `postgresql-server` has stopped. If it hasn't, wait for it to stop. + +6. Make a final PostgreSQL dataset snapshot and sync it to the backup server: + + ```bash + production$ sudo zfs snapshot space/mediacloud/vol_postgresql_data@11_final + + production$ sudo zfs send -i \ + space/mediacloud/vol_postgresql_data@11_intermediate \ + space/mediacloud/vol_postgresql_data@11_final \ + | \ + mbuffer -s 128k -m 10M | \ + pv | \ + ssh backup sudo zfs receive -F space/mediacloud/vol_postgresql_data + ``` + +7. Run the upgrade script: + + ```bash + production$ time docker run -it \ + --shm-size=64g \ + -v /space/mediacloud/vol_postgresql_data:/var/lib/postgresql/ \ + gcr.io/mcback/postgresql-upgrade \ + postgresql_upgrade.py \ + --source_version=11 \ + --target_version=13 \ + &> postgresql_upgrade.log + ``` + +8. Create a post-upgrade snapshot: + + ```bash + production$ sudo zfs snapshot space/mediacloud/vol_postgresql_data@13_initial + ``` + +9. Restart all services: + + ```bash + docker stack deploy -c docker-compose.mediacloud.yml mediacloud + ``` + + +## Cleanup + +10. Copy post-upgrade snapshot to the backup server: + + ```bash + production$ sudo zfs send -i \ + space/mediacloud/vol_postgresql_data@11_final \ + space/mediacloud/vol_postgresql_data@13_initial \ + | \ + mbuffer -s 128k -m 10M | \ + pv | \ + ssh backup sudo zfs receive -F space/mediacloud/vol_postgresql_data + ``` + +11. 
Clean up pre-upgrade snapshots: + + ```bash + backup$ zfs destroy space/mediacloud/vol_postgresql_data@11_initial + backup$ zfs destroy space/mediacloud/vol_postgresql_data@11_intermediate + backup$ zfs destroy space/mediacloud/vol_postgresql_data@11_final + + production$ zfs destroy space/mediacloud/vol_postgresql_data@11_initial + production$ zfs destroy space/mediacloud/vol_postgresql_data@11_intermediate + production$ zfs destroy space/mediacloud/vol_postgresql_data@11_final + ``` diff --git a/doc/pycharm.markdown b/doc/pycharm.markdown index 67051e8175..df10ad0dfb 100644 --- a/doc/pycharm.markdown +++ b/doc/pycharm.markdown @@ -229,11 +229,18 @@ Without the SQL schema dialect and data source configuration, PyCharm will compl ![](https://github.com/mediacloud/backend-docs-images/raw/master/docker-pycharm/schema-01-dialect-warning.png) +To generate the latest version of the schema, navigate to the project root in your terminal and run `./dev/get_schema.sh`. + To configure SQL schema dialect and data source: -1. To generate the latest version of the schema, navigate to the project root in your terminal and run `./dev/get_schema.sh`. +1. Symlink `apps/postgresql-server/schema/mediawords.sql` to `.idea/` directory: + + ```bash + cd apps/your_app/.idea/ + ln -s ../../postgresql-server/schema/mediawords.sql . + ``` -2. In PyCharm, go to *Languages & Frameworks* -> *SQL Dialects*, set the *Project SQL Dialect:* to *PostgreSQL* and click *OK*: +2. Go to *Languages & Frameworks* -> *SQL Dialects*, set the *Project SQL Dialect:* to *PostgreSQL* and click *OK*: ![](https://github.com/mediacloud/backend-docs-images/raw/master/docker-pycharm/schema-02-dialect-project.png) @@ -245,7 +252,7 @@ To configure SQL schema dialect and data source: ![](https://github.com/mediacloud/backend-docs-images/raw/master/docker-pycharm/schema-04-data-source-ddl.png) -5. 
Leave the default value of the *Name:* field intact and under *DDL Files*, add the `mediawords.sql` file (located in `postgresql-server/schema`) and click *OK*: +5. Leave the default value of the *Name:* field intact and under *DDL Files*, add a `mediawords.sql` file located in `.idea/` directory (you might need to click on *Show Hidden Files and Directories* button first) and click *OK*: ![](https://github.com/mediacloud/backend-docs-images/raw/master/docker-pycharm/schema-05-data-source-mediawords.png) diff --git a/doc/workflows.markdown b/doc/workflows.markdown new file mode 100644 index 0000000000..45144830c4 --- /dev/null +++ b/doc/workflows.markdown @@ -0,0 +1,419 @@ + + +Table of Contents +================= + + * [Workflows](#workflows) + * [Samples](#samples) + * [Retry parameters](#retry-parameters) + * [Activity interface](#activity-interface) + * [Activity interface with custom retries](#activity-interface-with-custom-retries) + * [Workflow interface](#workflow-interface) + * [Running a workflow](#running-a-workflow) + * [Asynchronously](#asynchronously) + * [Synchronously](#synchronously) + * [Tips & tricks](#tips--tricks) + * [Name workflow (activity) interface as XYZWorkflow (XYZActivities), implementation as XYZWorkflowImpl (XYZActivitiesImpl)](#name-workflow-activity-interface-as-xyzworkflow-xyzactivities-implementation-as-xyzworkflowimpl-xyzactivitiesimpl) + * [Make activities idempotent](#make-activities-idempotent) + * [Limit activity invocations in a single workflow to 1000](#limit-activity-invocations-in-a-single-workflow-to-1000) + * [Limit the activity payload to 200 KB](#limit-the-activity-payload-to-200-kb) + * [Use positional arguments](#use-positional-arguments) + * [Make arguments serializable by encode_json()](#make-arguments-serializable-by-encode_json) + * [Use connect_to_db_or_raise() instead of connect_to_db()](#use-connect_to_db_or_raise-instead-of-connect_to_db) + * [Use stop_worker_faster() to stop local workers used in 
tests](#use-stop_worker_faster-to-stop-local-workers-used-in-tests) + * [Reuse WorkflowClient objects when possible](#reuse-workflowclient-objects-when-possible) + * [Links](#links) + +---- + + +# Workflows + + +## Samples + + +### Retry parameters + +```python +DEFAULT_RETRY_PARAMETERS = RetryParameters( + + # InitialInterval is a delay before the first retry. + initial_interval=timedelta(seconds=1), + + # BackoffCoefficient. Retry policies are exponential. The coefficient specifies how fast the retry interval is + # growing. The coefficient of 1 means that the retry interval is always equal to the InitialInterval. + backoff_coefficient=2, + + # MaximumInterval specifies the maximum interval between retries. Useful for coefficients more than 1. + maximum_interval=timedelta(hours=2), + + # MaximumAttempts specifies how many times to attempt to execute an Activity in the presence of failures. If this + # limit is exceeded, the error is returned back to the Workflow that invoked the Activity. + + # We start off with a huge default retry count for each individual activity (1000 attempts * 2 hour max. interval + # = about a month worth of retrying) to give us time to detect problems, fix them, deploy fixes and let the workflow + # system just handle the rest without us having to restart workflows manually. + # + # Activities for which retrying too much doesn't make sense (e.g. due to the cost) set their own "maximum_attempts". + maximum_attempts=1000, + + # NonRetryableErrorReasons allows you to specify errors that shouldn't be retried. For example retrying invalid + # arguments error doesn't make sense in some scenarios. 
+ non_retryable_error_types=[ + + # Counterintuitively, we *do* want to retry not only on transient errors but also on programming and + # configuration ones, because on programming / configuration bugs we can just fix up some code or + # configuration, deploy the fixes and let the workflow system automagically continue on with the workflow + # without us having to dig out what exactly has failed and restart things. + # + # However, on "permanent" errors (the ones when some action decides that it just can't proceed with this + # particular input, e.g. process a story that does not exist) there's no point in retrying + # anything anymore. + McPermanentError.__name__, + + ], +) +``` + + +### Activity interface + +```python +class SampleActivities(object): + + @activity_method( + task_queue=TASK_QUEUE, + + # ScheduleToStart is the maximum time from a Workflow requesting Activity execution to a worker starting its + # execution. The usual reason for this timeout to fire is all workers being down or not being able to keep up + # with the request rate. We recommend setting this timeout to the maximum time a Workflow is willing to wait for + # an Activity execution in the presence of all possible worker outages. + schedule_to_start_timeout=None, + + # StartToClose is the maximum time an Activity can execute after it was picked by a worker. + start_to_close_timeout=timedelta(seconds=60), + + # ScheduleToClose is the maximum time from the Workflow requesting an Activity execution to its completion. + schedule_to_close_timeout=None, + + # Heartbeat is the maximum time between heartbeat requests. See Long Running Activities.
+ # (https://docs.temporal.io/docs/concept-activities/#long-running-activities) + heartbeat_timeout=None, + + retry_parameters=DEFAULT_RETRY_PARAMETERS, + ) + async def sample_activity(self, stories_id: int) -> Optional[str]: + raise NotImplementedError +``` + + +### Activity interface with custom retries + +```python +class SampleActivities(object): + + @activity_method( + task_queue=TASK_QUEUE, + schedule_to_start_timeout=None, + start_to_close_timeout=timedelta(seconds=60), + schedule_to_close_timeout=None, + heartbeat_timeout=None, + retry_parameters=dataclasses.replace( + DEFAULT_RETRY_PARAMETERS, + + # Wait for a minute before trying again + initial_interval=timedelta(minutes=1), + + # Hope for the server to resurrect in a week + maximum_interval=timedelta(weeks=1), + + # Don't kill ourselves trying to hit a permanently dead server + maximum_attempts=50, + ), + ) + async def another_sample_activity_with_custom_retries(self, stories_id: int) -> Optional[str]: + raise NotImplementedError +``` + + +### Workflow interface + +```python +class SampleWorkflow(object): + + @workflow_method(task_queue=TASK_QUEUE) + async def sample_workflow_method(self, stories_id: int) -> None: + raise NotImplementedError +``` + + +### Running a workflow + + +#### Asynchronously + +"Fire and forget" about the workflow: + +```python +from mediawords.workflow.client import workflow_client + + +client = workflow_client() +workflow: SampleWorkflow = client.new_workflow_stub( + cls=SampleWorkflow, + workflow_options=WorkflowOptions(workflow_id=str(stories_id)), +) + +await WorkflowClient.start(workflow.sample_workflow_method, stories_id) +``` + + +#### Synchronously + +Start a workflow and wait for it to complete: + +```python +from mediawords.workflow.client import workflow_client + + +client = workflow_client() +workflow: SampleWorkflow = client.new_workflow_stub( + cls=SampleWorkflow, + workflow_options=WorkflowOptions(workflow_id=str(stories_id)), +) + +result = await 
workflow.sample_workflow_method(stories_id) +``` + + +## Tips & tricks + + +### Name workflow (activity) interface as `XYZWorkflow` (`XYZActivities`), implementation as `XYZWorkflowImpl` (`XYZActivitiesImpl`) + +Temporal's webapp uses the interface's class name as the workflow name by default, so that way the workflow names look better and are more easily searchable. + +```python +# Good! + +class KardashianActivities(object): + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + ) + async def add_new_kardashian(self) -> None: + # ... + +class KardashianActivitiesImpl(KardashianActivities): + + async def add_new_kardashian(self) -> None: + # ... + + +class KardashianWorkflow(object): + + @workflow_method(task_queue=TASK_QUEUE) + async def keep_up_with_kardashians(self) -> None: + # ... + +class KardashianWorkflowImpl(KardashianWorkflow): + + async def keep_up_with_kardashians(self) -> None: + # ... +``` + + +### Make activities idempotent + +Temporal guarantees at-least-once activity invocations, so some activities might have to be rerun occasionally: + +```python +# Bad! + +class KardashianActivitiesImpl(KardashianActivities): + + async def add_new_kardashian(self) -> None: + db = connect_to_db_or_raise() + + # If this activity gets run twice, we'll end up with two Kims in the + # "kardashians" table which is against our strategic goals + db.query(""" + INSERT INTO kardashians (name, surname) + VALUES ('Kim', 'Kardashian') + """) +``` + +Therefore, activities need to be "ready" to get run more than once: + +```python +# Good!
+ +class KardashianActivitiesImpl(KardashianActivities): + + async def add_new_kardashian(self) -> None: + db = connect_to_db_or_raise() + + # Here we're assuming that there's a unique index on (name, surname) + # and using the ON CONFLICT upsert: + # https://www.postgresql.org/docs/current/sql-insert.html#SQL-ON-CONFLICT + db.query(""" + INSERT INTO kardashians (name, surname) + VALUES ('Kim', 'Kardashian') + ON CONFLICT (name, surname) DO NOTHING + """) +``` + + +### Limit activity invocations in a single workflow to 1000 + +While workflow count itself is largely unlimited, the history size (where activity invocations get logged to) is [limited to 10 MB (soft limit) / 50 MB (hard limit)](https://github.com/temporalio/temporal/blob/v1.7.0/service/history/configs/config.go#L380-L381), and history count is limited to [10k (soft limit) / 50k (hard limit) entries](https://github.com/temporalio/temporal/blob/v1.7.0/service/history/configs/config.go#L382-L383). + +Given that an activity might get retried a few times, and those retries will end up in the workflow's history too, don't invoke too many activities in a single workflow run. + +Instead, go for **hierarchical workflows.** For example, if an activity fetches a URL, and you're planning on fetching 1 million URLs, you can make a parent workflow start 1000 child workflows (each fetching 1000 URLs) and wait for their completion. + + + + + +### Limit the activity payload to 200 KB + +Activity arguments get serialized into JSON, sent over the network and then deserialized, so passing around huge JSON payloads hurts performance. Also, payloads are visible in the web UI so loading a huge JSON file in Temporal's webapp is not practical. + +Instead of passing around huge chunks of data in payloads, store the data somewhere in the database. + + +### Use positional arguments + +At the time of writing, the Python SDK is unable to serialize named arguments (`**kwargs`) and pass them to workflow / activity methods: + +```python +# Bad!
+await workflow.transcribe_episode(stories_id=stories_id) +``` + +so positional arguments (`*args`) have to be used instead: + +```python +# Good! +await workflow.transcribe_episode(stories_id) +``` + + +### Make arguments serializable by `encode_json()` + +The Python SDK serializes arguments to workflow and individual activities with `encode_json()`, and the default `JSONEncoder` is [limited](https://docs.python.org/3/library/json.html#json.JSONEncoder) in what it's able to serialize: + +```python +# Bad! + +class FancyObject(object): + def __init__(self, fancy_argument: int): + self.fancy_argument = fancy_argument + +class FancyActivities(object): + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + ) + async def fancy_activity(self, fancy: FancyObject) -> bool: + # <...> +``` + +Instead, opt for simple dicts: + +```python +# Good! + +from typing import Dict, Any + +class FancyObject(object): + def __init__(self, fancy_argument: int): + self.fancy_argument = fancy_argument + + def to_dict(self) -> Dict[str, Any]: + return { + 'fancy_argument': self.fancy_argument, + } + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> 'FancyObject': + return cls(fancy_argument=input_dict['fancy_argument']) + +class FancyActivities(object): + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + ) + async def fancy_activity(self, fancy: Dict[str, Any]) -> bool: + # Convert back to an object + fancy = FancyObject.from_dict(fancy) + # <...> +``` + +or define a new `typing` type to make it more obvious what the activity method is supposed to find in the argument dictionary: + +```python +# Better (somewhat)!
+ +from typing import Dict, Any + +FancyObjectDict = Dict[str, Any] + +class FancyObject(object): + def __init__(self, fancy_argument: int): + self.fancy_argument = fancy_argument + + def to_dict(self) -> FancyObjectDict: + return { + 'fancy_argument': self.fancy_argument, + } + + @classmethod + def from_dict(cls, input_dict: FancyObjectDict) -> 'FancyObject': + return cls(fancy_argument=input_dict['fancy_argument']) + +class FancyActivities(object): + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=60), + ) + async def fancy_activity(self, fancy: FancyObjectDict) -> bool: + # Convert back to an object + fancy = FancyObject.from_dict(fancy) + # <...> +``` + + +### Use `connect_to_db_or_raise()` instead of `connect_to_db()` + +By default, `connect_to_db()` will attempt connecting to the database quite a few times, and if it fails to do so, it will call `fatal_error()` thus stopping the whole application that has called the function. + +Temporal implements retries itself, plus it's not beneficial to quit the worker on database connection issues (as the worker then should continue on retrying), so instead of `connect_to_db()` go for `connect_to_db_or_raise()` which attempts connecting to PostgreSQL only once, and raises a simple exception on failures instead of stopping the whole application. + + +### Use `stop_worker_faster()` to stop local workers used in tests + +The default implementation of `worker.stop()` waits a full 5 seconds between attempts to stop all the worker threads. Our own hack implemented in `stop_worker_faster()` checks every 0.5 seconds whether the workers have managed to stop. + +This is useful in tests in which we run local workers and want to stop them afterwards. + + +### Reuse `WorkflowClient` objects when possible + +Avoid creating new `WorkflowClient` objects frequently, as ["it is a heavyweight object that establishes persistent TCP connections"](https://github.com/uber/cadence/issues/2528#issuecomment-530894674).
+ + +## Links + +* [Main Temporal website](https://temporal.io/) +* [Temporal Python SDK](https://github.com/firdaus/temporal-python-sdk) + * [Tests with many usage samples](https://github.com/firdaus/temporal-python-sdk/tree/master/tests) +* ["Workflows in Python using Temporal"](https://onepointzero.app/workflows-in-python-using-temporal/), a blog post by the author of the Python SDK with many examples +* [Workflow samples in Go](https://github.com/temporalio/samples-go), many of which are adaptable to Python + * [Mutex workflow sample](https://github.com/temporalio/samples-go/tree/master/mutex) diff --git a/provision/roles/docker/tasks/iptables.yml b/provision/roles/docker/tasks/iptables.yml index aa1c1b858f..c374ba04b4 100644 --- a/provision/roles/docker/tasks/iptables.yml +++ b/provision/roles/docker/tasks/iptables.yml @@ -161,6 +161,34 @@ - docker - iptables +- name: Deny connections to Temporal webapp + iptables: + # Insert before RETURN + action: insert + chain: DOCKER-USER + protocol: tcp + destination_port: 8088 + jump: DROP + become: true + become_user: root + tags: + - docker + - iptables + +- name: Deny connections to Temporal Grafana + iptables: + # Insert before RETURN + action: insert + chain: DOCKER-USER + protocol: tcp + destination_port: 3000 + jump: DROP + become: true + become_user: root + tags: + - docker + - iptables + - name: Save IPv4 rules community.general.iptables_state: ip_version: ipv4