From 8b031f5682bbf862a4cf53f1098f1797cd011cb7 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Wed, 17 Feb 2021 18:36:09 -0400 Subject: [PATCH 001/175] add comments --- apps/mail-postfix-server/Dockerfile | 4 ++++ apps/mail-postfix-server/docker-compose.tests.yml | 9 ++++++--- apps/mail-postfix-server/header_checks | 4 ++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/apps/mail-postfix-server/Dockerfile b/apps/mail-postfix-server/Dockerfile index 6c475f995c..203f04f669 100644 --- a/apps/mail-postfix-server/Dockerfile +++ b/apps/mail-postfix-server/Dockerfile @@ -57,6 +57,10 @@ RUN \ postconf -e smtp_tls_security_level=may && \ postconf -e smtpd_tls_security_level=none && \ # + # Make sure default headers (e.g. Message-Id, date) are present + postconf -e always_add_missing_headers=yes && \ + postconf -e local_header_rewrite_clients=permit_inet_interfaces && \ + # # Disable chroot on all services as it doesn't play well with a mounted # volume, e.g. "smtpd" is unable to access libnss after a chroot and thus # resolve OpenDKIM container. 
diff --git a/apps/mail-postfix-server/docker-compose.tests.yml b/apps/mail-postfix-server/docker-compose.tests.yml index babdaa376d..ac3225256c 100644 --- a/apps/mail-postfix-server/docker-compose.tests.yml +++ b/apps/mail-postfix-server/docker-compose.tests.yml @@ -6,8 +6,11 @@ services: # # Usage: # - # host$ ./dev/run.py mail-postfix-server bash - # container$ sendmail "your@email.com" + # - host$ ./dev/run.py mail-postfix-server bash + # - (new terminal window) host$ docker ps + # - find container with name ending in 'mail-postfix-server-actual_1' + # - host$ docker exec -it some_string_mail-postfix-server-actual_1 bash + # - container$ sendmail "your@email.com" # mail-postfix-server: image: gcr.io/mcback/common:latest @@ -21,7 +24,7 @@ services: image: gcr.io/mcback/mail-postfix-server:latest init: true stop_signal: SIGKILL - # "docker exec" into a container and run Postfix manually (/postfix.sh): + # "docker exec" into a container and run Postfix manually (./postfix.sh): command: sleep infinity # To be able to set /proc/sys/kernel/yama/ptrace_scope: privileged: true diff --git a/apps/mail-postfix-server/header_checks b/apps/mail-postfix-server/header_checks index 0b5347f5fe..01f9d130b2 100644 --- a/apps/mail-postfix-server/header_checks +++ b/apps/mail-postfix-server/header_checks @@ -1,5 +1,5 @@ /^Received:.*with ESMTP / IGNORE /^X-Originating-IP:/ IGNORE /^X-Mailer:/ IGNORE -/^Mime-Version:/ IGNORE -/^User-Agent:/ IGNORE \ No newline at end of file +/^User-Agent:/ IGNORE +/^Content-Transfer-Encoding:/i PREPEND List-Unsubscribe: mailto:support@mediacloud.org?subject=delete%20account%20and%20unsubscribe \ No newline at end of file From 8cc71a8cbb15d562a7675569949fecd7133f76f1 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Wed, 17 Feb 2021 20:04:46 -0400 Subject: [PATCH 002/175] try adding unsub header --- apps/mail-postfix-server/Dockerfile | 1 - apps/mail-postfix-server/header_checks | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git 
a/apps/mail-postfix-server/Dockerfile b/apps/mail-postfix-server/Dockerfile index 203f04f669..039bf1b029 100644 --- a/apps/mail-postfix-server/Dockerfile +++ b/apps/mail-postfix-server/Dockerfile @@ -50,7 +50,6 @@ RUN \ # # Filter out "Received:" and some other headers postconf -e header_checks=regexp:/etc/postfix/header_checks && \ - postconf -e mime_header_checks=regexp:/etc/postfix/header_checks && \ postconf -e smtp_header_checks=regexp:/etc/postfix/header_checks && \ # # Don't require TLS as local clients are trusted diff --git a/apps/mail-postfix-server/header_checks b/apps/mail-postfix-server/header_checks index 01f9d130b2..0cc6875eea 100644 --- a/apps/mail-postfix-server/header_checks +++ b/apps/mail-postfix-server/header_checks @@ -2,4 +2,4 @@ /^X-Originating-IP:/ IGNORE /^X-Mailer:/ IGNORE /^User-Agent:/ IGNORE -/^Content-Transfer-Encoding:/i PREPEND List-Unsubscribe: mailto:support@mediacloud.org?subject=delete%20account%20and%20unsubscribe \ No newline at end of file +/^Content-Transfer-Encoding:/i PREPEND List-Unsubscribe: mailto:support@mediacloud.org?subject=Delete%20account%20and%20unsubscribe \ No newline at end of file From e9cf5a2308007844f9cbd283c113ba4ebeb2aaa4 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 18 Feb 2021 14:45:24 -0400 Subject: [PATCH 003/175] add comments in docker-compose.test.yml --- apps/mail-postfix-server/docker-compose.tests.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/mail-postfix-server/docker-compose.tests.yml b/apps/mail-postfix-server/docker-compose.tests.yml index ac3225256c..7633966bc2 100644 --- a/apps/mail-postfix-server/docker-compose.tests.yml +++ b/apps/mail-postfix-server/docker-compose.tests.yml @@ -10,7 +10,11 @@ services: # - (new terminal window) host$ docker ps # - find container with name ending in 'mail-postfix-server-actual_1' # - host$ docker exec -it some_string_mail-postfix-server-actual_1 bash - # - container$ sendmail "your@email.com" + # - 
mail-postfix-server-actual_1_container$ ./postfix.sh + # - open new terminal window on your host machine + # - host$ docker exec -it some_string_mail-postfix-server-actual_1 bash + # - follow instructions at URL below to create a test mail.txt file and send to your email address from the container + # https://clients.javapipe.com/knowledgebase/132/How-to-Test-Sendmail-From-Command-Line-on-Linux.html # mail-postfix-server: image: gcr.io/mcback/common:latest From 6216aaaffda57ce6a6b64ccf7534e645880d0859 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 18 Feb 2021 16:49:31 -0400 Subject: [PATCH 004/175] add list-unsubscribe in mail.py --- apps/common/src/python/mediawords/util/mail.py | 7 ++++++- apps/mail-postfix-server/header_checks | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/apps/common/src/python/mediawords/util/mail.py b/apps/common/src/python/mediawords/util/mail.py index 3e75702f6d..41df71d68c 100644 --- a/apps/common/src/python/mediawords/util/mail.py +++ b/apps/common/src/python/mediawords/util/mail.py @@ -117,11 +117,14 @@ def send_email(message: Message) -> bool: message_part = MIMEText(message.text_body, 'plain', 'utf-8') mime_message.attach(message_part) + mime_message.add_header('List-Unsubscribe', + 'mailto:support@mediacloud.org?subject=Delete%20account%20and%20unsubscribe') + # HTML gets attached last, thus making it a preferred part as per RFC if message.html_body: message_part = MIMEText(message.html_body, 'html', 'utf-8') mime_message.attach(message_part) - + print(mime_message) if test_mode_is_enabled(): log.info("Test mode is enabled, not actually sending any email.") log.debug("Omitted email:\n\n%s" % mime_message.as_string()) @@ -136,6 +139,7 @@ def send_email(message: Message) -> bool: # Send message refused_recipients = smtp.sendmail(mime_message['From'], mime_message['To'], mime_message.as_string()) + if len(refused_recipients): log.warning("Unable to send email to the following recipients: %s" % 
str(refused_recipients)) @@ -160,4 +164,5 @@ def send_text_email(to: str, subject: str, body: str) -> bool: body = decode_object_from_bytes_if_needed(body) message = Message(to=to, subject=subject, text_body=body) + return send_email(message) diff --git a/apps/mail-postfix-server/header_checks b/apps/mail-postfix-server/header_checks index 0cc6875eea..d23d0795d6 100644 --- a/apps/mail-postfix-server/header_checks +++ b/apps/mail-postfix-server/header_checks @@ -1,5 +1,4 @@ /^Received:.*with ESMTP / IGNORE /^X-Originating-IP:/ IGNORE /^X-Mailer:/ IGNORE -/^User-Agent:/ IGNORE -/^Content-Transfer-Encoding:/i PREPEND List-Unsubscribe: mailto:support@mediacloud.org?subject=Delete%20account%20and%20unsubscribe \ No newline at end of file +/^User-Agent:/ IGNORE \ No newline at end of file From 9b7ee46bb1fb2be9fe9f3bb1e58691594dcb4b00 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 18 Feb 2021 16:59:50 -0400 Subject: [PATCH 005/175] remove print --- apps/common/src/python/mediawords/util/mail.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/common/src/python/mediawords/util/mail.py b/apps/common/src/python/mediawords/util/mail.py index 41df71d68c..0c7bf8d413 100644 --- a/apps/common/src/python/mediawords/util/mail.py +++ b/apps/common/src/python/mediawords/util/mail.py @@ -124,7 +124,7 @@ def send_email(message: Message) -> bool: if message.html_body: message_part = MIMEText(message.html_body, 'html', 'utf-8') mime_message.attach(message_part) - print(mime_message) + if test_mode_is_enabled(): log.info("Test mode is enabled, not actually sending any email.") log.debug("Omitted email:\n\n%s" % mime_message.as_string()) From 62592938ba7be96fc7c88e3283286f15c3e15e57 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 18 Feb 2021 17:18:08 -0400 Subject: [PATCH 006/175] cleanup --- apps/common/src/python/mediawords/util/mail.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/apps/common/src/python/mediawords/util/mail.py 
b/apps/common/src/python/mediawords/util/mail.py index 0c7bf8d413..3402e237fe 100644 --- a/apps/common/src/python/mediawords/util/mail.py +++ b/apps/common/src/python/mediawords/util/mail.py @@ -124,7 +124,7 @@ def send_email(message: Message) -> bool: if message.html_body: message_part = MIMEText(message.html_body, 'html', 'utf-8') mime_message.attach(message_part) - + if test_mode_is_enabled(): log.info("Test mode is enabled, not actually sending any email.") log.debug("Omitted email:\n\n%s" % mime_message.as_string()) @@ -139,7 +139,6 @@ def send_email(message: Message) -> bool: # Send message refused_recipients = smtp.sendmail(mime_message['From'], mime_message['To'], mime_message.as_string()) - if len(refused_recipients): log.warning("Unable to send email to the following recipients: %s" % str(refused_recipients)) @@ -164,5 +163,4 @@ def send_text_email(to: str, subject: str, body: str) -> bool: body = decode_object_from_bytes_if_needed(body) message = Message(to=to, subject=subject, text_body=body) - return send_email(message) From 6b1b343ef68bb4146102e23cf47493077b286110 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Fri, 19 Feb 2021 14:33:17 -0400 Subject: [PATCH 007/175] add volume for common in mail-postfix-server --- apps/mail-postfix-server/docker-compose.tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/mail-postfix-server/docker-compose.tests.yml b/apps/mail-postfix-server/docker-compose.tests.yml index 7633966bc2..f059ce57da 100644 --- a/apps/mail-postfix-server/docker-compose.tests.yml +++ b/apps/mail-postfix-server/docker-compose.tests.yml @@ -20,6 +20,10 @@ services: image: gcr.io/mcback/common:latest init: true stop_signal: SIGKILL + volumes: + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ depends_on: - mail-postfix-server-actual From 26705248d9b8a50feb594110a7511143b058e7a1 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Fri, 19 Feb 2021 15:46:28 -0400 Subject: [PATCH 008/175] fix topics-map build 
--- apps/topics-map/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/topics-map/Dockerfile b/apps/topics-map/Dockerfile index 208b99b8fd..9d7080f811 100644 --- a/apps/topics-map/Dockerfile +++ b/apps/topics-map/Dockerfile @@ -5,7 +5,10 @@ FROM gcr.io/mcback/common:latest # Install Java -RUN apt-get -y --no-install-recommends install openjdk-8-jre-headless +RUN \ + apt-get update && \ + apt-get -y --no-install-recommends install openjdk-8-jre-headless && \ + true # Install fa2l Java libs RUN \ From d4633af4efc39911d5e1778e7d717bdf816b3786 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Fri, 19 Feb 2021 15:59:02 -0400 Subject: [PATCH 009/175] expand postfix testing instructions --- .../docker-compose.tests.yml | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/apps/mail-postfix-server/docker-compose.tests.yml b/apps/mail-postfix-server/docker-compose.tests.yml index f059ce57da..1c51ca0677 100644 --- a/apps/mail-postfix-server/docker-compose.tests.yml +++ b/apps/mail-postfix-server/docker-compose.tests.yml @@ -4,18 +4,28 @@ services: # Service to use for testing the mail service # - # Usage: + # Steps to test: # - # - host$ ./dev/run.py mail-postfix-server bash - # - (new terminal window) host$ docker ps - # - find container with name ending in 'mail-postfix-server-actual_1' - # - host$ docker exec -it some_string_mail-postfix-server-actual_1 bash - # - mail-postfix-server-actual_1_container$ ./postfix.sh - # - open new terminal window on your host machine - # - host$ docker exec -it some_string_mail-postfix-server-actual_1 bash - # - follow instructions at URL below to create a test mail.txt file and send to your email address from the container + # 1) host$ ./dev/run.py mail-postfix-server bash + # 2) (new terminal window) host$ docker ps + # 3) find container with name ending in 'mail-postfix-server-actual_1' + # 4) host$ docker exec -it some_string_mail-postfix-server-actual_1 bash + # 5) 
container$ ./postfix.sh + # 6) open new terminal window on your host machine + # 7) host$ docker exec -it some_string_mail-postfix-server-actual_1 bash + # 8) follow instructions at URL below to create a test mail.txt file and send to your email address from the container # https://clients.javapipe.com/knowledgebase/132/How-to-Test-Sendmail-From-Command-Line-on-Linux.html # + # Alternatively, if you want to test via the send_email() method (https://github.com/mediacloud/backend/blob/master/apps/common/src/python/mediawords/util/mail.py#L73), + # or test changes to said method, you can disregard steps 7-8 above and instead: + # 7) host$ docker ps + # 8) Find mail-postfix-server container ID + # 9) host$ docker exec -it some_string_mail-postfix-server bash + # 10) container$ python3 + # 11) >> from mediawords.util.mail import * + # 12) >> test_message = Message(to='your@email.com', subject='test postfix', text_body=None, html_body='

hi

') + # 13) >> send_email(test_message) + # mail-postfix-server: image: gcr.io/mcback/common:latest init: true From 2f82c8cfe4de6f7f31bcb89534457418e62c4520 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 25 Feb 2021 23:32:21 +0200 Subject: [PATCH 010/175] Update PyCharm project --- apps/nytlabels-annotator/.idea/misc.xml | 2 +- apps/nytlabels-annotator/.idea/nytlabels-annotator.iml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/nytlabels-annotator/.idea/misc.xml b/apps/nytlabels-annotator/.idea/misc.xml index 62c91cd3b1..dd2f82cf96 100644 --- a/apps/nytlabels-annotator/.idea/misc.xml +++ b/apps/nytlabels-annotator/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml b/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml index f1dab97a30..3e3e8c191c 100644 --- a/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml +++ b/apps/nytlabels-annotator/.idea/nytlabels-annotator.iml @@ -2,7 +2,7 @@ - + From 8f3e784832cbb6fd8b89e34faa01134b4c02fb8a Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 25 Feb 2021 23:59:45 +0200 Subject: [PATCH 011/175] Make number of threads to be used into an argument --- apps/nytlabels-annotator/Dockerfile | 2 +- .../crappy-predict-news-labels/nytlabels.py | 13 +++-- .../crappy-predict-news-labels/nytlabels.sh | 8 +++ .../nytlabels_http_server.py | 50 ++++++++++++++----- 4 files changed, 57 insertions(+), 16 deletions(-) create mode 100755 apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.sh diff --git a/apps/nytlabels-annotator/Dockerfile b/apps/nytlabels-annotator/Dockerfile index 4cd13da3ba..09dc564bb5 100644 --- a/apps/nytlabels-annotator/Dockerfile +++ b/apps/nytlabels-annotator/Dockerfile @@ -103,4 +103,4 @@ STOPSIGNAL SIGTERM USER nobody -CMD ["nytlabels_http_server.py"] +CMD ["nytlabels.sh"] diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py 
b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py index c07629c68c..1bd7de5260 100644 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py @@ -1,7 +1,8 @@ import dataclasses +import multiprocessing import os import shelve -from typing import List +from typing import List, Optional from nltk.data import load as load_nltk_data from nltk.tokenize.destructive import NLTKWordTokenizer @@ -87,13 +88,19 @@ class MultiLabelPredict(object): '_embedding_size', ] - def __init__(self, model_path: str, labels_path: str): + def __init__(self, model_path: str, labels_path: str, num_threads: Optional[int] = None): if not os.path.isfile(model_path): raise RuntimeError(f"Model was not found in {model_path}") if not os.path.isfile(labels_path): raise RuntimeError(f"Model labels were not found in {labels_path}") - self._model = onnxruntime.InferenceSession(model_path) + if num_threads is None: + num_threads = multiprocessing.cpu_count() + + options = onnxruntime.SessionOptions() + options.intra_op_num_threads = num_threads + + self._model = onnxruntime.InferenceSession(path_or_bytes=model_path) self._labels = open(labels_path, 'r').read().splitlines() _, self._sample_length, self._embedding_size = self._model.get_inputs()[0].shape diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.sh b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.sh new file mode 100755 index 0000000000..73263c2105 --- /dev/null +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -u +set -e + +PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +exec "$PWD/nytlabels_http_server.py" --num_threads "$(/container_cpu_limit.sh)" diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py 
b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py index 0f0205ae41..46d75336ce 100755 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py @@ -4,14 +4,14 @@ NYTLabels annotator HTTP service. """ +import argparse import dataclasses import json import operator import os from http import HTTPStatus from http.server import HTTPServer, BaseHTTPRequestHandler -from sys import argv -from typing import Union, Dict, List +from typing import Union, Dict, List, Optional, Type from self_test_input import SELF_TEST_INPUT @@ -38,7 +38,7 @@ class _Predictor(object): _ModelDescriptor(basename='just_taxonomies', json_key='taxonomies'), ] - def __init__(self): + def __init__(self, num_threads: Optional[int]): pwd = os.path.dirname(os.path.abspath(__file__)) models_dir = os.path.join(pwd, 'models') @@ -64,6 +64,7 @@ def __init__(self): model = MultiLabelPredict( model_path=os.path.join(models_dir, f"{model_descriptor.basename}.onnx"), labels_path=os.path.join(models_dir, f"{model_descriptor.basename}.txt"), + num_threads=num_threads, ) if sample_length and embedding_size: @@ -94,7 +95,16 @@ def __init__(self): # noinspection PyPep8Naming class NYTLabelsRequestHandler(BaseHTTPRequestHandler): - _PREDICTOR = _Predictor() + _PREDICTOR = None + + @classmethod + def initialize_predictor(cls, num_threads: Optional[int]) -> None: + assert not cls._PREDICTOR, "Predictor is already initialized." + cls._PREDICTOR = _Predictor(num_threads=num_threads) + + def __init__(self, *args, **kwargs): + assert self._PREDICTOR, "You need to initialize the predictor before setting this class as a request handler." 
+ super(NYTLabelsRequestHandler, self).__init__(*args, **kwargs) def __respond(self, http_status: int, response: Union[dict, list]): self.send_response(http_status) @@ -115,6 +125,8 @@ def do_HEAD(self): def _predict(self, text: str) -> Dict[str, List[Dict[str, str]]]: + assert self._PREDICTOR, "Predictor is not initialized, are you using the class factory?" + # Sample length / embedding size is the same for all models first_model = self._PREDICTOR.models[list(self._PREDICTOR.models.keys())[0]] sample_length = first_model.sample_length() @@ -187,15 +199,29 @@ def do_POST(self): self.__respond(http_status=HTTPStatus.OK, response=result) -def run(port: int = 8080): - server_address = ('', port) - httpd = HTTPServer(server_address, NYTLabelsRequestHandler) - print(f'Starting NYTLabels annotator on port {port}...') +def make_nytlabels_request_handler_class(num_threads: Optional[int]) -> Type[NYTLabelsRequestHandler]: + class CustomNYTLabelsRequestHandler(NYTLabelsRequestHandler): + pass + + CustomNYTLabelsRequestHandler.initialize_predictor(num_threads=num_threads) + + return CustomNYTLabelsRequestHandler + + +def main(): + parser = argparse.ArgumentParser(description="Start NYTLabels annotator web service.") + parser.add_argument("-p", "--port", type=int, required=False, default=8080, + help="Port to listen to") + parser.add_argument("-t", "--num_threads", type=int, required=False, + help="Threads that the model runtime should spawn") + args = parser.parse_args() + + server_address = ('', args.port) + handler_class = make_nytlabels_request_handler_class(num_threads=args.num_threads) + httpd = HTTPServer(server_address, handler_class) + print(f'Starting NYTLabels annotator on port {args.port}...') httpd.serve_forever() if __name__ == "__main__": - if len(argv) == 2: - run(port=int(argv[1])) - else: - run() + main() From 9f96dde0a6551f5179393994d15579700469e138 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 00:03:58 +0200 Subject: [PATCH 012/175] 
Hardcode execution_mode and graph_optimization_level to get predictable performance --- .../src/crappy-predict-news-labels/nytlabels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py index 1bd7de5260..2728ffeb54 100644 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py @@ -99,6 +99,8 @@ def __init__(self, model_path: str, labels_path: str, num_threads: Optional[int] options = onnxruntime.SessionOptions() options.intra_op_num_threads = num_threads + options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL + options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL self._model = onnxruntime.InferenceSession(path_or_bytes=model_path) self._labels = open(labels_path, 'r').read().splitlines() From 84c522f247a4fb53397c27e796862c0482f222a1 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 00:15:15 +0200 Subject: [PATCH 013/175] Don't duplicate model client code in self-test --- .../nytlabels_http_server.py | 72 ++++++++----------- 1 file changed, 30 insertions(+), 42 deletions(-) diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py index 46d75336ce..bb09135998 100755 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py @@ -7,8 +7,8 @@ import argparse import dataclasses import json -import operator import os +import pprint from http import HTTPStatus from http.server import HTTPServer, BaseHTTPRequestHandler from typing import Union, Dict, List, Optional, Type @@ -26,8 +26,8 @@ class _ModelDescriptor(object): class _Predictor(object): 
__slots__ = [ - 'text2vectors', - 'models', + '__text2vectors', + '__models', ] _MODEL_DESCRIPTORS = [ @@ -46,14 +46,14 @@ def __init__(self, num_threads: Optional[int]): raise RuntimeError(f"Models path should be directory: {models_dir}") print("Loading scaler and word2vec...") - self.text2vectors = Text2ScaledVectors( + self.__text2vectors = Text2ScaledVectors( word2vec_shelve_path=os.path.join(models_dir, 'GoogleNews-vectors-negative300.stripped.shelve'), scaler_path=os.path.join(models_dir, 'scaler.onnx'), ) print("Scaler and word2vec loaded.") print("Loading models...") - self.models = dict() + self.__models = dict() # Make sure all models have the sample sample length and embedding size as we vector text only once sample_length = None @@ -73,24 +73,37 @@ def __init__(self, num_threads: Optional[int]): sample_length = model.sample_length() embedding_size = model.embedding_size() - self.models[model_descriptor] = model + self.__models[model_descriptor] = model print("Models loaded.") print("Running self-test...\n") - vectors = self.text2vectors.transform( - text=SELF_TEST_INPUT, + test_result = self.predict(text=SELF_TEST_INPUT) + pp = pprint.PrettyPrinter(indent=4, width=1024) + pp.pprint(test_result) + print("Done running self-test.") + + def predict(self, text: str) -> Dict[str, List[Dict[str, str]]]: + + # Sample length / embedding size is the same for all models + first_model = self.__models[list(self.__models.keys())[0]] + sample_length = first_model.sample_length() + embedding_size = first_model.embedding_size() + + vectors = self.__text2vectors.transform( + text=text, sample_length=sample_length, embedding_size=embedding_size, ) - for model_descriptor in sorted(self._MODEL_DESCRIPTORS, key=operator.attrgetter('basename')): - print(f"Model '{model_descriptor.basename}':") - model = self.models[model_descriptor] + + result = dict() + + for model_descriptor, model in self.__models.items(): predictions = model.predict(x_matrix=vectors) - for prediction in 
predictions: - print(f" * Label: {prediction.label}, score: {prediction.score:.6f}") - assert len(predictions), f"Some predictions should be returned by {model.__class__.__name__}" - print() - print("Done running self-test.") + result[model_descriptor.json_key] = [ + {'label': x.label, 'score': "{0:.5f}".format(x.score)} for x in predictions + ] + + return result # noinspection PyPep8Naming @@ -123,31 +136,6 @@ def do_HEAD(self): # noinspection PyUnresolvedReferences self.__respond_with_error(http_status=HTTPStatus.BAD_REQUEST.value, message='HEAD requests are not supported.') - def _predict(self, text: str) -> Dict[str, List[Dict[str, str]]]: - - assert self._PREDICTOR, "Predictor is not initialized, are you using the class factory?" - - # Sample length / embedding size is the same for all models - first_model = self._PREDICTOR.models[list(self._PREDICTOR.models.keys())[0]] - sample_length = first_model.sample_length() - embedding_size = first_model.embedding_size() - - vectors = self._PREDICTOR.text2vectors.transform( - text=text, - sample_length=sample_length, - embedding_size=embedding_size, - ) - - result = dict() - - for model_descriptor, model in self._PREDICTOR.models.items(): - predictions = model.predict(x_matrix=vectors) - result[model_descriptor.json_key] = [ - {'label': x.label, 'score': "{0:.5f}".format(x.score)} for x in predictions - ] - - return result - def do_POST(self): content_length = int(self.headers.get('Content-Length', 0)) if not content_length: @@ -187,7 +175,7 @@ def do_POST(self): return try: - result = self._predict(text) + result = self._PREDICTOR.predict(text) except Exception as ex: # noinspection PyUnresolvedReferences self.__respond_with_error( From f9c45ffdc6812039b9a3bcacd35a661a2ec4d5ff Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 00:39:41 +0200 Subject: [PATCH 014/175] Set execution mode to ORT_SEQUENTIAL as it's slightly faster --- .../src/crappy-predict-news-labels/nytlabels.py | 11 ++++++++++- 1 file 
changed, 10 insertions(+), 1 deletion(-) diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py index 2728ffeb54..28283b5db1 100644 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py @@ -98,8 +98,17 @@ def __init__(self, model_path: str, labels_path: str, num_threads: Optional[int] num_threads = multiprocessing.cpu_count() options = onnxruntime.SessionOptions() + + # 11.265 ms + options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + + # 11.464 ms + # options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL + + # Not really used without ORT_PARALLEL: + options.inter_op_num_threads = num_threads options.intra_op_num_threads = num_threads - options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL + options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL self._model = onnxruntime.InferenceSession(path_or_bytes=model_path) From 78d2df4ca7afa04e48da89e4c2e9a638593cb060 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 01:11:42 +0200 Subject: [PATCH 015/175] Install OpenMP --- apps/nytlabels-annotator/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/nytlabels-annotator/Dockerfile b/apps/nytlabels-annotator/Dockerfile index 09dc564bb5..b980bbe7cf 100644 --- a/apps/nytlabels-annotator/Dockerfile +++ b/apps/nytlabels-annotator/Dockerfile @@ -84,6 +84,11 @@ RUN \ WORKDIR /usr/src/crappy-predict-news-labels/ COPY src/crappy-predict-news-labels/requirements.txt /usr/src/crappy-predict-news-labels/ RUN \ + # + # OpenMP for onnxruntime speed up + apt-get -y --no-install-recommends install libgomp1 && \ + # + # The rest pip3 install -r requirements.txt && \ rm -rf /root/.cache/ && \ true From 7f46e5f505192762f8d39043b534f32dc5d1482f Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 
Feb 2021 01:43:47 +0200 Subject: [PATCH 016/175] Add "apt-get update" hack as base image is outdated --- apps/nytlabels-annotator/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/nytlabels-annotator/Dockerfile b/apps/nytlabels-annotator/Dockerfile index b980bbe7cf..e4b1eab83b 100644 --- a/apps/nytlabels-annotator/Dockerfile +++ b/apps/nytlabels-annotator/Dockerfile @@ -5,6 +5,8 @@ FROM gcr.io/mcback/base:latest RUN \ + # FIXME remove once the base image gets updated + apt-get -y update && \ # # Install model fetch dependencies apt-get -y --no-install-recommends install brotli && \ From f7c51705da613e2c7f1cb2d491345f0fc4ff26d8 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 02:01:47 +0200 Subject: [PATCH 017/175] Make it possible to process input only with a few models --- apps/nytlabels-annotator/Dockerfile | 20 ++--- apps/nytlabels-annotator/README.md | 8 ++ .../nytlabels_http_server.py | 74 +++++++++++++------ 3 files changed, 68 insertions(+), 34 deletions(-) diff --git a/apps/nytlabels-annotator/Dockerfile b/apps/nytlabels-annotator/Dockerfile index e4b1eab83b..0d33d8845b 100644 --- a/apps/nytlabels-annotator/Dockerfile +++ b/apps/nytlabels-annotator/Dockerfile @@ -27,29 +27,29 @@ RUN /dl_to_stdout.sh "$MODEL_URL/GoogleNews-vectors-negative300.stripped.shelve. 
RUN /dl_to_stdout.sh "$MODEL_URL/scaler.onnx" > scaler.onnx RUN /dl_to_stdout.sh "$MODEL_URL/all_descriptors.onnx.br" | \ - brotli -d > all_descriptors.onnx + brotli -d > allDescriptors.onnx RUN /dl_to_stdout.sh "$MODEL_URL/all_descriptors.txt.br" | \ - brotli -d > all_descriptors.txt + brotli -d > allDescriptors.txt RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_3000.onnx.br" | \ - brotli -d > descriptors_3000.onnx + brotli -d > descriptors3000.onnx RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_3000.txt.br" | \ - brotli -d > descriptors_3000.txt + brotli -d > descriptors3000.txt RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_600.onnx.br" | \ - brotli -d > descriptors_600.onnx + brotli -d > descriptors600.onnx RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_600.txt.br" | \ - brotli -d > descriptors_600.txt + brotli -d > descriptors600.txt RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_with_taxonomies.onnx.br" | \ - brotli -d > descriptors_with_taxonomies.onnx + brotli -d > descriptorsAndTaxonomies.onnx RUN /dl_to_stdout.sh "$MODEL_URL/descriptors_with_taxonomies.txt.br" | \ - brotli -d > descriptors_with_taxonomies.txt + brotli -d > descriptorsAndTaxonomies.txt RUN /dl_to_stdout.sh "$MODEL_URL/just_taxonomies.onnx.br" | \ - brotli -d > just_taxonomies.onnx + brotli -d > taxonomies.onnx RUN /dl_to_stdout.sh "$MODEL_URL/just_taxonomies.txt.br" | \ - brotli -d > just_taxonomies.txt + brotli -d > taxonomies.txt # Install NLTK data RUN \ diff --git a/apps/nytlabels-annotator/README.md b/apps/nytlabels-annotator/README.md index 33df58c79f..c631393663 100644 --- a/apps/nytlabels-annotator/README.md +++ b/apps/nytlabels-annotator/README.md @@ -58,3 +58,11 @@ echo '{}' | \ curl --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json | \ jq ".descriptors600" ``` + +Alternatively, to try out just the `descriptors600` model: + +```bash +echo '{"models": ["descriptors600"]}' | \ + jq --arg key0 text --arg value0 "$(cat test.txt)" '. 
| .[$key0]=$value0' | \ + curl --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json +``` diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py index bb09135998..cb74243552 100755 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py @@ -5,7 +5,6 @@ """ import argparse -import dataclasses import json import os import pprint @@ -17,11 +16,14 @@ from nytlabels import Text2ScaledVectors, MultiLabelPredict - -@dataclasses.dataclass(frozen=True) -class _ModelDescriptor(object): - basename: str - json_key: str +# For each key there must exist a model ONNX file and a list of labels with a given basename +ALL_MODELS = [ + 'allDescriptors', + 'descriptors3000', + 'descriptors600', + 'descriptorsAndTaxonomies', + 'taxonomies', +] class _Predictor(object): @@ -30,14 +32,6 @@ class _Predictor(object): '__models', ] - _MODEL_DESCRIPTORS = [ - _ModelDescriptor(basename='all_descriptors', json_key='allDescriptors'), - _ModelDescriptor(basename='descriptors_3000', json_key='descriptors3000'), - _ModelDescriptor(basename='descriptors_600', json_key='descriptors600'), - _ModelDescriptor(basename='descriptors_with_taxonomies', json_key='descriptorsAndTaxonomies'), - _ModelDescriptor(basename='just_taxonomies', json_key='taxonomies'), - ] - def __init__(self, num_threads: Optional[int]): pwd = os.path.dirname(os.path.abspath(__file__)) @@ -59,11 +53,11 @@ def __init__(self, num_threads: Optional[int]): sample_length = None embedding_size = None - for model_descriptor in self._MODEL_DESCRIPTORS: - print(f" Loading '{model_descriptor.basename}'...") + for model_name in ALL_MODELS: + print(f" Loading '{model_name}'...") model = MultiLabelPredict( - model_path=os.path.join(models_dir, 
f"{model_descriptor.basename}.onnx"), - labels_path=os.path.join(models_dir, f"{model_descriptor.basename}.txt"), + model_path=os.path.join(models_dir, f"{model_name}.onnx"), + labels_path=os.path.join(models_dir, f"{model_name}.txt"), num_threads=num_threads, ) @@ -73,16 +67,16 @@ def __init__(self, num_threads: Optional[int]): sample_length = model.sample_length() embedding_size = model.embedding_size() - self.__models[model_descriptor] = model + self.__models[model_name] = model print("Models loaded.") print("Running self-test...\n") - test_result = self.predict(text=SELF_TEST_INPUT) + test_result = self.predict(text=SELF_TEST_INPUT, enabled_model_names=ALL_MODELS) pp = pprint.PrettyPrinter(indent=4, width=1024) pp.pprint(test_result) print("Done running self-test.") - def predict(self, text: str) -> Dict[str, List[Dict[str, str]]]: + def predict(self, text: str, enabled_model_names: List[str]) -> Dict[str, List[Dict[str, str]]]: # Sample length / embedding size is the same for all models first_model = self.__models[list(self.__models.keys())[0]] @@ -97,9 +91,10 @@ def predict(self, text: str) -> Dict[str, List[Dict[str, str]]]: result = dict() - for model_descriptor, model in self.__models.items(): + for model_name in enabled_model_names: + model = self.__models[model_name] predictions = model.predict(x_matrix=vectors) - result[model_descriptor.json_key] = [ + result[model_name] = [ {'label': x.label, 'score': "{0:.5f}".format(x.score)} for x in predictions ] @@ -174,8 +169,39 @@ def do_POST(self): ) return + models = payload.get('models', None) + if models is None: + enabled_model_names = ALL_MODELS + else: + enabled_model_names = [] + for model_name in models: + if model_name not in ALL_MODELS: + # noinspection PyUnresolvedReferences + self.__respond_with_error( + http_status=HTTPStatus.BAD_REQUEST.value, + message=f"Model '{model_name}' was not found.", + ) + return + if model_name in enabled_model_names: + # noinspection PyUnresolvedReferences + 
self.__respond_with_error( + http_status=HTTPStatus.BAD_REQUEST.value, + message=f"Model '{model_name}' is duplicate.", + ) + return + + enabled_model_names.append(model_name) + + if not enabled_model_names: + # noinspection PyUnresolvedReferences + self.__respond_with_error( + http_status=HTTPStatus.BAD_REQUEST.value, + message="List of enabled models is empty.", + ) + return + try: - result = self._PREDICTOR.predict(text) + result = self._PREDICTOR.predict(text=text, enabled_model_names=enabled_model_names) except Exception as ex: # noinspection PyUnresolvedReferences self.__respond_with_error( From ba195de553419acdf6f186f52bc02940bf0e0ec1 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 02:02:23 +0200 Subject: [PATCH 018/175] Query only for "descriptors600" labels --- .../src/python/nytlabels_fetch_annotation/fetcher.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/apps/nytlabels-fetch-annotation/src/python/nytlabels_fetch_annotation/fetcher.py b/apps/nytlabels-fetch-annotation/src/python/nytlabels_fetch_annotation/fetcher.py index a9ad25d3f6..f932fba3b1 100644 --- a/apps/nytlabels-fetch-annotation/src/python/nytlabels_fetch_annotation/fetcher.py +++ b/apps/nytlabels-fetch-annotation/src/python/nytlabels_fetch_annotation/fetcher.py @@ -14,6 +14,8 @@ class NYTLabelsAnnotatorFetcher(JSONAnnotationFetcher): """NYT labels annotation fetcher.""" + _ENABLED_MODEL = 'descriptors600' + def __init__(self, fetcher_config: NYTLabelsFetcherConfig = None): self.__fetcher_config = fetcher_config @@ -34,7 +36,7 @@ def _request_for_text(self, text: str) -> Request: # Create JSON request log.debug("Converting text to JSON request...") try: - text_json = encode_json({'text': text}) + text_json = encode_json({'text': text, 'models': [self._ENABLED_MODEL]}) except Exception as ex: # Not critical, might happen to some stories, no need to shut down the annotator raise McJSONAnnotationFetcherException( @@ -64,8 +66,8 @@ def 
_fetched_annotation_is_valid(self, annotation: Union[dict, list]) -> bool: log.warning("Annotation is not dict: %s" % str(annotation)) return False - if 'descriptors600' not in annotation: - log.warning("Annotation doesn't have 'descriptors600' key: %s" % str(annotation)) + if self._ENABLED_MODEL not in annotation: + log.warning(f"Annotation doesn't have '{self._ENABLED_MODEL}' key: {annotation}") return False return True From 37957b61b1c29939db96c3b133531c05b9ca3d17 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 02:16:02 +0200 Subject: [PATCH 019/175] Get rid of misleading comment --- .../src/crappy-predict-news-labels/nytlabels.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py index 28283b5db1..d7a2911689 100644 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels.py @@ -98,11 +98,9 @@ def __init__(self, model_path: str, labels_path: str, num_threads: Optional[int] num_threads = multiprocessing.cpu_count() options = onnxruntime.SessionOptions() - - # 11.265 ms options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL - # 11.464 ms + # Seems to be slightly slower: # options.execution_mode = onnxruntime.ExecutionMode.ORT_PARALLEL # Not really used without ORT_PARALLEL: From 9c1cdf2c3cc6b48ec47a54d1006e64ccafb0fdfe Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Feb 2021 03:16:26 +0200 Subject: [PATCH 020/175] Enable HTTP/1.1 in nytlabels-annotator request to not wait on "Expect:" --- .../nytlabels_http_server.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py index 
cb74243552..951de5e6b5 100755 --- a/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py +++ b/apps/nytlabels-annotator/src/crappy-predict-news-labels/nytlabels_http_server.py @@ -103,6 +103,9 @@ def predict(self, text: str, enabled_model_names: List[str]) -> Dict[str, List[D # noinspection PyPep8Naming class NYTLabelsRequestHandler(BaseHTTPRequestHandler): + # Allow HTTP/1.1 connections and so don't wait up on "Expect:" headers + protocol_version = "HTTP/1.1" + _PREDICTOR = None @classmethod @@ -115,14 +118,31 @@ def __init__(self, *args, **kwargs): super(NYTLabelsRequestHandler, self).__init__(*args, **kwargs) def __respond(self, http_status: int, response: Union[dict, list]): + raw_response = json.dumps(response).encode('utf-8') self.send_response(http_status) self.send_header('Content-Type', 'application/json; charset=UTF-8') + self.send_header('Content-Length', str(len(raw_response))) self.end_headers() - self.wfile.write(json.dumps(response).encode('utf-8')) + self.wfile.write(raw_response) def __respond_with_error(self, http_status: int, message: str): self.__respond(http_status=http_status, response={'error': message}) + # If the request handler's protocol_version is set to "HTTP/1.0" (the default) and the client tries connecting via + # HTTP/1.1 and sends an "Expect: 100-continue" header, the client will then wait for a bit (curl waits for a second) + # for "100 Continue" which the server will never send (due to it being configured to support HTTP/1.0 only), + # therefore the whole request will take a one whole second more. + # + # Please note that when enabling HTTP/1.1, one has to send Content-Length in their responses. 
+ def __check_expect_header(self): + expect = self.headers.get('Expect', "") + if expect.lower() == "100-continue": + if not (self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): + print(( + "WARNING: due to server / client misconfiguration, client sent Expect: header " + "and is waiting for a response, possibly delaying the whole request.""" + )) + def do_GET(self): # noinspection PyUnresolvedReferences self.__respond_with_error(http_status=HTTPStatus.BAD_REQUEST.value, message='GET requests are not supported.') @@ -132,6 +152,9 @@ def do_HEAD(self): self.__respond_with_error(http_status=HTTPStatus.BAD_REQUEST.value, message='HEAD requests are not supported.') def do_POST(self): + + self.__check_expect_header() + content_length = int(self.headers.get('Content-Length', 0)) if not content_length: # noinspection PyUnresolvedReferences From 4cca8ccf9a1568bf7fe8997f2b9e0cf4a771050d Mon Sep 17 00:00:00 2001 From: jtotoole Date: Fri, 26 Feb 2021 15:22:57 -0400 Subject: [PATCH 021/175] put unsubscribe address in config --- apps/common/docker-compose.tests.yml | 6 ++++++ apps/common/src/python/mediawords/util/config/common.py | 8 ++++++++ apps/common/src/python/mediawords/util/mail.py | 3 +-- apps/docker-compose.dist.yml | 8 ++++++++ apps/topics-map/Dockerfile | 2 +- 5 files changed, 24 insertions(+), 3 deletions(-) diff --git a/apps/common/docker-compose.tests.yml b/apps/common/docker-compose.tests.yml index f822f335f5..29794cece7 100644 --- a/apps/common/docker-compose.tests.yml +++ b/apps/common/docker-compose.tests.yml @@ -13,6 +13,12 @@ services: MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME: "${MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME}" MC_PUBLIC_STORE_TYPE: "postgresql" MC_PUBLIC_STORE_SALT: "foo" + # Email address to point to in List-Unsubscribe email header. 
+ # Technically we don't have a straightforward "unsubscribe" endpoint, but our + # emails are more likely to be marked spam if we don't have such a header, so + # we make the email subject "Delete account and unsubscribe" in + # mediawords/util/config/common.py + MC_UNSUBSCRIBE_MAILTO_LINK: "support@example.com" volumes: - type: bind source: ./src/ diff --git a/apps/common/src/python/mediawords/util/config/common.py b/apps/common/src/python/mediawords/util/config/common.py index 114514a52c..e04c75b210 100644 --- a/apps/common/src/python/mediawords/util/config/common.py +++ b/apps/common/src/python/mediawords/util/config/common.py @@ -155,6 +155,14 @@ def password() -> str: """Password.""" return '' + @staticmethod + def unsubscribe_address() -> str: + """Mailto link for email to which unsubscribe/account deletion requests should be sent""" + address = env_value('MC_UNSUBSCRIBE_MAILTO_LINK', required=False, allow_empty_string=True) + if address is None or '@' not in address: + address = 'support@example.com' + return f'mailto:{address}?subject=Delete%20account%20and%20unsubscribe' + class DownloadStorageConfig(object): """Download storage configuration.""" diff --git a/apps/common/src/python/mediawords/util/mail.py b/apps/common/src/python/mediawords/util/mail.py index 3402e237fe..d7effc037a 100644 --- a/apps/common/src/python/mediawords/util/mail.py +++ b/apps/common/src/python/mediawords/util/mail.py @@ -117,8 +117,7 @@ def send_email(message: Message) -> bool: message_part = MIMEText(message.text_body, 'plain', 'utf-8') mime_message.attach(message_part) - mime_message.add_header('List-Unsubscribe', - 'mailto:support@mediacloud.org?subject=Delete%20account%20and%20unsubscribe') + mime_message.add_header('List-Unsubscribe', CommonConfig.smtp().unsubscribe_address()) # HTML gets attached last, thus making it a preferred part as per RFC if message.html_body: diff --git a/apps/docker-compose.dist.yml b/apps/docker-compose.dist.yml index eb587cb664..cd4312fcf1 100644 
--- a/apps/docker-compose.dist.yml +++ b/apps/docker-compose.dist.yml @@ -74,6 +74,14 @@ x-common-configuration: &common-configuration # "From:" email address when sending emails MC_EMAIL_FROM_ADDRESS: "info@mediacloud.org" + # Email address to point to in List-Unsubscribe email header. + # Technically we don't have a straightforward "unsubscribe" endpoint, but our + # emails are more likely to be marked spam if we don't have such a header, so + # we make the email subject "Delete account and unsubscribe" in + # mediawords/util/config/common.py + # example value = support@example.com + MC_UNSUBSCRIBE_MAILTO_LINK: "support@example.com" + # Fail all HTTP requests that match the following pattern, e.g. # "^https?://[^/]*some-website.com" MC_USERAGENT_BLACKLIST_URL_PATTERN: "" diff --git a/apps/topics-map/Dockerfile b/apps/topics-map/Dockerfile index 9d7080f811..3b61e8bd37 100644 --- a/apps/topics-map/Dockerfile +++ b/apps/topics-map/Dockerfile @@ -6,7 +6,7 @@ FROM gcr.io/mcback/common:latest # Install Java RUN \ - apt-get update && \ + apt-get -y update && \ apt-get -y --no-install-recommends install openjdk-8-jre-headless && \ true From 24f4c9dddffaaac9ef73dae21ff86b07d2a12478 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 1 Mar 2021 18:20:07 +0200 Subject: [PATCH 022/175] Enable HTTP/1.1 in all BaseHTTPRequestHandlers --- apps/common/src/python/mediawords/test/hash_server.py | 2 ++ .../bin/extract_article_from_page_http_server.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/apps/common/src/python/mediawords/test/hash_server.py b/apps/common/src/python/mediawords/test/hash_server.py index 9038d3f57e..dfe07703f0 100644 --- a/apps/common/src/python/mediawords/test/hash_server.py +++ b/apps/common/src/python/mediawords/test/hash_server.py @@ -171,6 +171,8 @@ def serve_forever(self, _=0.5): # noinspection PyPep8Naming class _HTTPHandler(BaseHTTPRequestHandler): + # Allow HTTP/1.1 connections and so don't wait up on "Expect:" headers + protocol_version 
= "HTTP/1.1" def _set_port(self, port: int): self._port = port diff --git a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py index 21fd9c1078..4b99f4cce9 100755 --- a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py +++ b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py @@ -44,6 +44,9 @@ class ServerHandler(BaseHTTPRequestHandler): + # Allow HTTP/1.1 connections and so don't wait up on "Expect:" headers + protocol_version = "HTTP/1.1" + _API_ENDPOINT_PATH = "/extract" def __json_response(self, status: int, response: dict) -> bytes: From d645daf9de5b5de12ec10dd3a268f0580e7dcd57 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 1 Mar 2021 19:55:13 +0200 Subject: [PATCH 023/175] Add PyCharm project --- apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml | 2 +- .../.idea/nytlabels-fetch-annotation-and-tag.iml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml b/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml index bd61294576..2ac35808ab 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml +++ b/apps/nytlabels-fetch-annotation-and-tag/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml b/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml index 10163454cb..9a2244a452 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml +++ b/apps/nytlabels-fetch-annotation-and-tag/.idea/nytlabels-fetch-annotation-and-tag.iml @@ -2,7 +2,7 @@ - + From 38f167811f03c289817cbfdcc5dccbd4abc3374b Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 1 Mar 2021 19:55:19 +0200 Subject: [PATCH 024/175] Reformat file --- 
.../tests/python/test_nytlabels_tags_from_annotation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py b/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py index 2ee0c9d654..b8027b64e9 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py +++ b/apps/nytlabels-fetch-annotation-and-tag/tests/python/test_nytlabels_tags_from_annotation.py @@ -6,6 +6,7 @@ from mediawords.util.network import random_unused_port from mediawords.util.parse_json import encode_json from mediawords.util.sql import sql_now + from nytlabels_fetch_annotation_and_tag.config import NYTLabelsTagsFromAnnotationConfig from nytlabels_fetch_annotation_and_tag.nytlabels_tags_from_annotation import NYTLabelsTagsFromAnnotation from nytlabels_fetch_annotation_and_tag.sample_data import sample_nytlabels_response, expected_nytlabels_tags @@ -14,7 +15,6 @@ class TestNYTLabelsTagsFromAnnotation(TestCase): def test_tagging(self): - db = connect_to_db() media = db.create(table='media', insert_hash={ From 0690fae2f2b677f167d883132209dfcad630e97e Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 1 Mar 2021 20:04:47 +0200 Subject: [PATCH 025/175] Print timestamps together with headers --- apps/nytlabels-annotator/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/nytlabels-annotator/README.md b/apps/nytlabels-annotator/README.md index c631393663..f20423ebe3 100644 --- a/apps/nytlabels-annotator/README.md +++ b/apps/nytlabels-annotator/README.md @@ -55,7 +55,7 @@ and then `POST` said file as JSON to the annotator: ```bash echo '{}' | \ jq --arg key0 text --arg value0 "$(cat test.txt)" '. 
| .[$key0]=$value0' | \ - curl --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json | \ + curl --verbose --silent --trace-time --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json | \ jq ".descriptors600" ``` @@ -64,5 +64,5 @@ Alternatively, to try out just the `descriptors600` model: ```bash echo '{"models": ["descriptors600"]}' | \ jq --arg key0 text --arg value0 "$(cat test.txt)" '. | .[$key0]=$value0' | \ - curl --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json + curl --verbose --silent --trace-time --header "Content-Type: application/json" -X POST --data-binary @- http://127.0.0.1:8080/predict.json ``` From adc0ae1793501e32f4c11acf0969ede2eefbad29 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Mon, 1 Mar 2021 14:32:04 -0400 Subject: [PATCH 026/175] change env var name + format --- apps/common/docker-compose.tests.yml | 2 +- apps/common/src/python/mediawords/util/config/common.py | 5 +++-- apps/common/src/python/mediawords/util/mail.py | 6 +++++- apps/docker-compose.dist.yml | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/apps/common/docker-compose.tests.yml b/apps/common/docker-compose.tests.yml index 29794cece7..6b68b76f7f 100644 --- a/apps/common/docker-compose.tests.yml +++ b/apps/common/docker-compose.tests.yml @@ -18,7 +18,7 @@ services: # emails are more likely to be marked spam if we don't have such a header, so # we make the email subject "Delete account and unsubscribe" in # mediawords/util/config/common.py - MC_UNSUBSCRIBE_MAILTO_LINK: "support@example.com" + MC_EMAIL_UNSUBSCRIBE: "support@example.com" volumes: - type: bind source: ./src/ diff --git a/apps/common/src/python/mediawords/util/config/common.py b/apps/common/src/python/mediawords/util/config/common.py index e04c75b210..97ce03718d 100644 --- a/apps/common/src/python/mediawords/util/config/common.py +++ 
b/apps/common/src/python/mediawords/util/config/common.py @@ -158,10 +158,11 @@ def password() -> str: @staticmethod def unsubscribe_address() -> str: """Mailto link for email to which unsubscribe/account deletion requests should be sent""" - address = env_value('MC_UNSUBSCRIBE_MAILTO_LINK', required=False, allow_empty_string=True) + address = env_value('MC_EMAIL_UNSUBSCRIBE', required=False, allow_empty_string=True) if address is None or '@' not in address: address = 'support@example.com' - return f'mailto:{address}?subject=Delete%20account%20and%20unsubscribe' + return address + # return f'mailto:{address}?subject=Delete%20account%20and%20unsubscribe' class DownloadStorageConfig(object): diff --git a/apps/common/src/python/mediawords/util/mail.py b/apps/common/src/python/mediawords/util/mail.py index d7effc037a..9bea5ec1ed 100644 --- a/apps/common/src/python/mediawords/util/mail.py +++ b/apps/common/src/python/mediawords/util/mail.py @@ -117,7 +117,11 @@ def send_email(message: Message) -> bool: message_part = MIMEText(message.text_body, 'plain', 'utf-8') mime_message.attach(message_part) - mime_message.add_header('List-Unsubscribe', CommonConfig.smtp().unsubscribe_address()) + unsubscribe_address = CommonConfig.smtp().unsubscribe_address() + + mime_message.add_header( + 'List-Unsubscribe', + f'mailto:{unsubscribe_address}?subject=Delete%20account%20and%20unsubscribe') # HTML gets attached last, thus making it a preferred part as per RFC if message.html_body: diff --git a/apps/docker-compose.dist.yml b/apps/docker-compose.dist.yml index cd4312fcf1..27ebbf9c6f 100644 --- a/apps/docker-compose.dist.yml +++ b/apps/docker-compose.dist.yml @@ -80,7 +80,7 @@ x-common-configuration: &common-configuration # we make the email subject "Delete account and unsubscribe" in # mediawords/util/config/common.py # example value = support@example.com - MC_UNSUBSCRIBE_MAILTO_LINK: "support@example.com" + MC_EMAIL_UNSUBSCRIBE: "support@example.com" # Fail all HTTP requests that match 
the following pattern, e.g. # "^https?://[^/]*some-website.com" From e571a8bc9b453ca620ce0812aa263a29fe8bbb53 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 1 Mar 2021 20:36:11 +0200 Subject: [PATCH 027/175] Revert "Enable HTTP/1.1 in all BaseHTTPRequestHandlers" This reverts commit 24f4c9dddffaaac9ef73dae21ff86b07d2a12478. --- apps/common/src/python/mediawords/test/hash_server.py | 2 -- .../bin/extract_article_from_page_http_server.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/apps/common/src/python/mediawords/test/hash_server.py b/apps/common/src/python/mediawords/test/hash_server.py index dfe07703f0..9038d3f57e 100644 --- a/apps/common/src/python/mediawords/test/hash_server.py +++ b/apps/common/src/python/mediawords/test/hash_server.py @@ -171,8 +171,6 @@ def serve_forever(self, _=0.5): # noinspection PyPep8Naming class _HTTPHandler(BaseHTTPRequestHandler): - # Allow HTTP/1.1 connections and so don't wait up on "Expect:" headers - protocol_version = "HTTP/1.1" def _set_port(self, port: int): self._port = port diff --git a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py index 4b99f4cce9..21fd9c1078 100755 --- a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py +++ b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py @@ -44,9 +44,6 @@ class ServerHandler(BaseHTTPRequestHandler): - # Allow HTTP/1.1 connections and so don't wait up on "Expect:" headers - protocol_version = "HTTP/1.1" - _API_ENDPOINT_PATH = "/extract" def __json_response(self, status: int, response: dict) -> bytes: From 7435f93aa20b3ccfc34efef432fd59f44deb927f Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 1 Mar 2021 20:38:13 +0200 Subject: [PATCH 028/175] Enable HTTP/1.1 for extractor as it always sends Content-Length --- .../bin/extract_article_from_page_http_server.py | 3 +++ 1 file changed, 3 insertions(+) diff 
--git a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py index 21fd9c1078..4b99f4cce9 100755 --- a/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py +++ b/apps/extract-article-from-page/bin/extract_article_from_page_http_server.py @@ -44,6 +44,9 @@ class ServerHandler(BaseHTTPRequestHandler): + # Allow HTTP/1.1 connections and so don't wait up on "Expect:" headers + protocol_version = "HTTP/1.1" + _API_ENDPOINT_PATH = "/extract" def __json_response(self, status: int, response: dict) -> bytes: From ef58f261b3aeba8bb8c0e0898022f3a355f7f54b Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 4 Mar 2021 12:46:35 -0400 Subject: [PATCH 029/175] fix comments --- apps/common/src/python/mediawords/util/config/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/common/src/python/mediawords/util/config/common.py b/apps/common/src/python/mediawords/util/config/common.py index 97ce03718d..c515847ef6 100644 --- a/apps/common/src/python/mediawords/util/config/common.py +++ b/apps/common/src/python/mediawords/util/config/common.py @@ -157,12 +157,11 @@ def password() -> str: @staticmethod def unsubscribe_address() -> str: - """Mailto link for email to which unsubscribe/account deletion requests should be sent""" + """Email to which unsubscribe/account deletion requests should be sent""" address = env_value('MC_EMAIL_UNSUBSCRIBE', required=False, allow_empty_string=True) if address is None or '@' not in address: address = 'support@example.com' return address - # return f'mailto:{address}?subject=Delete%20account%20and%20unsubscribe' class DownloadStorageConfig(object): From 1db5e3f16fac95087360bffce70cb3ef1d4a0c7b Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 16 Mar 2021 23:47:25 +0200 Subject: [PATCH 030/175] Create elasticsearch-base image to reuse ES elsewhere --- apps/elasticsearch-base/.dockerignore | 92 
+++++++++++++++++++ apps/elasticsearch-base/Dockerfile | 69 ++++++++++++++ apps/elasticsearch-base/bin/elasticsearch.sh | 34 +++++++ apps/elasticsearch-base/config/.dockerignore | 1 + apps/elasticsearch-base/config/.gitignore | 3 + .../config/elasticsearch-base.yml} | 7 -- .../config/java.policy | 0 .../config/jvm.options | 2 +- .../config/log4j2.properties | 0 apps/elk-elasticsearch/Dockerfile | 64 ++----------- ...{elasticsearch.sh => elk-elasticsearch.sh} | 31 +------ .../config/elk-elasticsearch.yml | 7 ++ 12 files changed, 219 insertions(+), 91 deletions(-) create mode 100644 apps/elasticsearch-base/.dockerignore create mode 100644 apps/elasticsearch-base/Dockerfile create mode 100755 apps/elasticsearch-base/bin/elasticsearch.sh create mode 100644 apps/elasticsearch-base/config/.dockerignore create mode 100644 apps/elasticsearch-base/config/.gitignore rename apps/{elk-elasticsearch/config/elasticsearch.yml => elasticsearch-base/config/elasticsearch-base.yml} (68%) rename apps/{elk-elasticsearch => elasticsearch-base}/config/java.policy (100%) rename apps/{elk-elasticsearch => elasticsearch-base}/config/jvm.options (95%) rename apps/{elk-elasticsearch => elasticsearch-base}/config/log4j2.properties (100%) rename apps/elk-elasticsearch/bin/{elasticsearch.sh => elk-elasticsearch.sh} (56%) create mode 100644 apps/elk-elasticsearch/config/elk-elasticsearch.yml diff --git a/apps/elasticsearch-base/.dockerignore b/apps/elasticsearch-base/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/elasticsearch-base/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. 
+# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/elasticsearch-base/Dockerfile b/apps/elasticsearch-base/Dockerfile new file mode 100644 index 0000000000..e8e07fdd8b --- /dev/null +++ b/apps/elasticsearch-base/Dockerfile @@ -0,0 +1,69 @@ +# +# Base image for Elasticsearch +# + +FROM gcr.io/mcback/java-base:latest + +# Install Elasticsearch +# (https://www.elastic.co/downloads/elasticsearch-no-jdk) +ENV MC_ELASTICSEARCH_VERSION=7.10.2 +RUN \ + mkdir -p /opt/elasticsearch/ && \ + curl --fail --location --retry 3 --retry-delay 5 "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${MC_ELASTICSEARCH_VERSION}-no-jdk-linux-x86_64.tar.gz" | \ + tar -zx -C /opt/elasticsearch/ --strip 1 && \ + true + +# Add 
unprivileged user the service will run as +RUN useradd -ms /bin/bash elasticsearch + +RUN \ + # + # Data directory + mkdir -p /var/lib/elasticsearch/ && \ + mkdir -p /var/lib/elasticsearch/jvm-heapdumps/ && \ + mkdir -p /var/lib/elasticsearch/jvm-gc-logs/ && \ + chown -R elasticsearch:elasticsearch /var/lib/elasticsearch/ && \ + # + # JVM options directory + mkdir -p /opt/elasticsearch/config/jvm.options.d/ && \ + chmod 775 /opt/elasticsearch/config/jvm.options.d/ && \ + # + true + +COPY config/* /opt/elasticsearch/config/ +COPY bin/* /opt/elasticsearch/bin/ + +# Create keystore and move it to data volume +RUN \ + rm -f /opt/elasticsearch/config/elasticsearch.keystore && \ + rm -f /var/lib/elasticsearch/elasticsearch.keystore && \ + /opt/elasticsearch/bin/elasticsearch-keystore create && \ + mv /opt/elasticsearch/config/elasticsearch.keystore /var/lib/elasticsearch/ && \ + ln -s /var/lib/elasticsearch/elasticsearch.keystore /opt/elasticsearch/config/elasticsearch.keystore && \ + chown elasticsearch:elasticsearch /var/lib/elasticsearch/elasticsearch.keystore && \ + # + # Keystore tool will want to write a "temporary" keystore: + # + # ERROR: unable to create temporary keystore at + # [/opt/elasticsearch/config/elasticsearch.keystore.tmp], + # write permissions required for [/opt/elasticsearch/config] + # or run [elasticsearch-keystore upgrade] + # + # Plus the S3 plugin insists at writing to other locations too. 
+ # + chown -R elasticsearch:elasticsearch /opt/elasticsearch/config/ && \ + # + true + +USER elasticsearch + +# Elasticsearch HTTP +EXPOSE 9200 + +# Elasticsearch TCP transport +EXPOSE 9300 + +# Elasticsearch data +VOLUME /var/lib/elasticsearch + +CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] diff --git a/apps/elasticsearch-base/bin/elasticsearch.sh b/apps/elasticsearch-base/bin/elasticsearch.sh new file mode 100755 index 0000000000..4dcc391452 --- /dev/null +++ b/apps/elasticsearch-base/bin/elasticsearch.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e +set -u + +# https://www.elastic.co/guide/en/elasticsearch/reference/current/max-number-of-threads.html +if [ "$(ulimit -u)" != "unlimited" ] && [ $(ulimit -u) -lt 4096 ]; then + echo "Process limit (ulimit -u) is too low." + exit 1 +fi + +# https://www.elastic.co/guide/en/elasticsearch/reference/current/file-descriptors.html +if [ "$(ulimit -n -S)" != "unlimited" ] && [ $(ulimit -n -S) -lt 65535 ]; then + echo "Soft open file limit (ulimit -n -S) is too low." + exit 1 +fi +if [ "$(ulimit -n -H)" != "unlimited" ] && [ $(ulimit -n -H) -lt 65535 ]; then + echo "Hard open file limit (ulimit -n -H) is too low." + exit 1 +fi + +# "Set Xmx and Xms to no more than 50% of your physical RAM." 
+MC_RAM_SIZE=$(/container_memory_limit.sh) +MC_ELASTICSEARCH_MS=$((MC_RAM_SIZE / 10 * 4)) +MC_ELASTICSEARCH_MX="${MC_ELASTICSEARCH_MS}" + +export ES_JAVA_OPTS="" + +# Memory limits +export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xms${MC_ELASTICSEARCH_MS}m" +export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xmx${MC_ELASTICSEARCH_MX}m" + +# Run Elasticsearch +exec /opt/elasticsearch/bin/elasticsearch diff --git a/apps/elasticsearch-base/config/.dockerignore b/apps/elasticsearch-base/config/.dockerignore new file mode 100644 index 0000000000..b3c0a37b66 --- /dev/null +++ b/apps/elasticsearch-base/config/.dockerignore @@ -0,0 +1 @@ +elasticsearch.keystore diff --git a/apps/elasticsearch-base/config/.gitignore b/apps/elasticsearch-base/config/.gitignore new file mode 100644 index 0000000000..3eb03f777e --- /dev/null +++ b/apps/elasticsearch-base/config/.gitignore @@ -0,0 +1,3 @@ +# Might get created by a Docker container +elasticsearch.keystore + diff --git a/apps/elk-elasticsearch/config/elasticsearch.yml b/apps/elasticsearch-base/config/elasticsearch-base.yml similarity index 68% rename from apps/elk-elasticsearch/config/elasticsearch.yml rename to apps/elasticsearch-base/config/elasticsearch-base.yml index 12fc1f5b1c..3e7ad2dfea 100644 --- a/apps/elk-elasticsearch/config/elasticsearch.yml +++ b/apps/elasticsearch-base/config/elasticsearch-base.yml @@ -1,5 +1,3 @@ -cluster.name: elk-elasticsearch -node.name: elk-elasticsearch path.data: /var/lib/elasticsearch network.host: 0.0.0.0 http.port: 9200 @@ -8,8 +6,3 @@ transport.port: 9300 # Use single node discovery in order to disable production mode and avoid bootstrap checks # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html discovery.type: single-node - -# Define S3 client for log snapshots -s3.client: - elk_logs: - protocol: https diff --git a/apps/elk-elasticsearch/config/java.policy b/apps/elasticsearch-base/config/java.policy similarity index 100% rename from 
apps/elk-elasticsearch/config/java.policy rename to apps/elasticsearch-base/config/java.policy diff --git a/apps/elk-elasticsearch/config/jvm.options b/apps/elasticsearch-base/config/jvm.options similarity index 95% rename from apps/elk-elasticsearch/config/jvm.options rename to apps/elasticsearch-base/config/jvm.options index c15568722a..3590c3bb45 100644 --- a/apps/elk-elasticsearch/config/jvm.options +++ b/apps/elasticsearch-base/config/jvm.options @@ -18,7 +18,7 @@ # has sufficient space -XX:HeapDumpPath=/var/lib/elasticsearch/jvm-heapdumps/ -# Update policy for S3 plugin to work +# Update policy for plugins to work -Djava.security.policy=/opt/elasticsearch/config/java.policy # Log JVM errors to STDERR diff --git a/apps/elk-elasticsearch/config/log4j2.properties b/apps/elasticsearch-base/config/log4j2.properties similarity index 100% rename from apps/elk-elasticsearch/config/log4j2.properties rename to apps/elasticsearch-base/config/log4j2.properties diff --git a/apps/elk-elasticsearch/Dockerfile b/apps/elk-elasticsearch/Dockerfile index 4dd588dc44..9e9a5595e6 100644 --- a/apps/elk-elasticsearch/Dockerfile +++ b/apps/elk-elasticsearch/Dockerfile @@ -2,77 +2,33 @@ # Elasticsearch for ELK logging stack # -FROM gcr.io/mcback/java-base:latest +FROM gcr.io/mcback/elasticsearch-base:latest -# Install Elasticsearch -# (https://www.elastic.co/downloads/elasticsearch-no-jdk) -ENV ELK_ELASTICSEARCH_VERSION=7.10.2 -RUN \ - mkdir -p /opt/elasticsearch/ && \ - curl --fail --location --retry 3 --retry-delay 5 "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ELK_ELASTICSEARCH_VERSION}-no-jdk-linux-x86_64.tar.gz" | \ - tar -zx -C /opt/elasticsearch/ --strip 1 && \ - true +USER root # Install Elasticsearch Amazon S3 plugin for ILS archival # (we use curl to be able to configure retries and such) RUN \ - curl --fail --location --retry 3 --retry-delay 5 
"https://artifacts.elastic.co/downloads/elasticsearch-plugins/repository-s3/repository-s3-${ELK_ELASTICSEARCH_VERSION}.zip" > \ + curl --fail --location --retry 3 --retry-delay 5 "https://artifacts.elastic.co/downloads/elasticsearch-plugins/repository-s3/repository-s3-${MC_ELASTICSEARCH_VERSION}.zip" > \ /var/tmp/es-s3-plugin.zip && \ /opt/elasticsearch/bin/elasticsearch-plugin install --batch file:///var/tmp/es-s3-plugin.zip && \ rm /var/tmp/es-s3-plugin.zip && \ true -# Add unprivileged user the service will run as -RUN useradd -ms /bin/bash elk - -RUN \ - # - # Data directory - mkdir -p /var/lib/elasticsearch/ && \ - mkdir -p /var/lib/elasticsearch/jvm-heapdumps/ && \ - mkdir -p /var/lib/elasticsearch/jvm-gc-logs/ && \ - chown -R elk:elk /var/lib/elasticsearch/ && \ - # - # JVM options directory - mkdir -p /opt/elasticsearch/config/jvm.options.d/ && \ - chmod 775 /opt/elasticsearch/config/jvm.options.d/ && \ - # - true - COPY config/* /opt/elasticsearch/config/ COPY bin/* /opt/elasticsearch/bin/ # Create keystore and move it to data volume RUN \ - rm -f /opt/elasticsearch/config/elasticsearch.keystore && \ - rm -f /var/lib/elasticsearch/elasticsearch.keystore && \ - /opt/elasticsearch/bin/elasticsearch-keystore create && \ - mv /opt/elasticsearch/config/elasticsearch.keystore /var/lib/elasticsearch/ && \ - ln -s /var/lib/elasticsearch/elasticsearch.keystore /opt/elasticsearch/config/elasticsearch.keystore && \ - chown elk:elk /var/lib/elasticsearch/elasticsearch.keystore && \ - # - # Keystore tool will want to write a "temporary" keystore: - # - # ERROR: unable to create temporary keystore at - # [/opt/elasticsearch/config/elasticsearch.keystore.tmp], - # write permissions required for [/opt/elasticsearch/config] - # or run [elasticsearch-keystore upgrade] - # - # Plus the S3 plugin insists at writing to other locations too. 
# - chown -R elk:elk /opt/elasticsearch/config/ && \ + # Merge base and ELK configs into one + cat \ + /opt/elasticsearch/config/elasticsearch-base.yml \ + /opt/elasticsearch/config/elk-elasticsearch.yml \ + > /opt/elasticsearch/config/elasticsearch.yml && \ # true -USER elk - -# Elasticsearch HTTP -EXPOSE 9200 - -# Elasticsearch TCP transport -EXPOSE 9300 - -# Elasticsearch data -VOLUME /var/lib/elasticsearch +USER elasticsearch -CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] +CMD ["/opt/elasticsearch/bin/elk-elasticsearch.sh"] diff --git a/apps/elk-elasticsearch/bin/elasticsearch.sh b/apps/elk-elasticsearch/bin/elk-elasticsearch.sh similarity index 56% rename from apps/elk-elasticsearch/bin/elasticsearch.sh rename to apps/elk-elasticsearch/bin/elk-elasticsearch.sh index 5681a63dc0..677a2dcd4d 100755 --- a/apps/elk-elasticsearch/bin/elasticsearch.sh +++ b/apps/elk-elasticsearch/bin/elk-elasticsearch.sh @@ -24,22 +24,6 @@ fi set -u -# https://www.elastic.co/guide/en/elasticsearch/reference/current/max-number-of-threads.html -if [ "$(ulimit -u)" != "unlimited" ] && [ $(ulimit -u) -lt 4096 ]; then - echo "Process limit (ulimit -u) is too low." - exit 1 -fi - -# https://www.elastic.co/guide/en/elasticsearch/reference/current/file-descriptors.html -if [ "$(ulimit -n -S)" != "unlimited" ] && [ $(ulimit -n -S) -lt 65535 ]; then - echo "Soft open file limit (ulimit -n -S) is too low." - exit 1 -fi -if [ "$(ulimit -n -H)" != "unlimited" ] && [ $(ulimit -n -H) -lt 65535 ]; then - echo "Hard open file limit (ulimit -n -H) is too low." - exit 1 -fi - # Update AWS credentials in a keystore echo "Update AWS credentials in a keystore..." echo -n "${MC_ELK_ELASTICSEARCH_SNAPSHOT_S3_ACCESS_KEY_ID}" | \ @@ -55,16 +39,5 @@ if [ ! -f /var/lib/elasticsearch/s3-snapshots-setup ]; then touch /var/lib/elasticsearch/s3-snapshots-setup fi -# "Set Xmx and Xms to no more than 50% of your physical RAM." 
-MC_RAM_SIZE=$(/container_memory_limit.sh) -MC_ELASTICSEARCH_MS=$((MC_RAM_SIZE / 10 * 4)) -MC_ELASTICSEARCH_MX="${MC_ELASTICSEARCH_MS}" - -export ES_JAVA_OPTS="" - -# Memory limits -export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xms${MC_ELASTICSEARCH_MS}m" -export ES_JAVA_OPTS="${ES_JAVA_OPTS} -Xmx${MC_ELASTICSEARCH_MX}m" - -# Run Elasticsearch -exec /opt/elasticsearch/bin/elasticsearch +# Run Elasticsearch wrapper script +exec /opt/elasticsearch/bin/elasticsearch.sh diff --git a/apps/elk-elasticsearch/config/elk-elasticsearch.yml b/apps/elk-elasticsearch/config/elk-elasticsearch.yml new file mode 100644 index 0000000000..68c42c5625 --- /dev/null +++ b/apps/elk-elasticsearch/config/elk-elasticsearch.yml @@ -0,0 +1,7 @@ +cluster.name: elk-elasticsearch +node.name: elk-elasticsearch + +# Define S3 client for log snapshots +s3.client: + elk_logs: + protocol: https From 1b1702c72bbdb52b89d92f2e1382576a21621a66 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 20 Mar 2021 01:53:55 +0000 Subject: [PATCH 031/175] Bump jinja2 from 2.11.2 to 2.11.3 in /apps/common/src Bumps [jinja2](https://github.com/pallets/jinja) from 2.11.2 to 2.11.3. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/master/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/2.11.2...2.11.3) Signed-off-by: dependabot[bot] --- apps/common/src/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/common/src/requirements.txt b/apps/common/src/requirements.txt index 96c3385b19..f9396bd942 100644 --- a/apps/common/src/requirements.txt +++ b/apps/common/src/requirements.txt @@ -44,7 +44,7 @@ furl==2.1.0 jieba==0.42.1 # Parsing email templates -Jinja2==2.11.2 +Jinja2==2.11.3 # One of Celery's dependencies (here just for PyCharm to stop complaining) # Upgrade together with Celery and not separately. 
From 3702088599e97373912e4c3ec83cd790afca86e6 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Wed, 17 Mar 2021 02:04:02 +0200 Subject: [PATCH 032/175] Add apps that make up Temporal --- .../docker-compose.tests.yml | 2 +- apps/common/docker-compose.tests.yml | 2 +- apps/common/src/requirements.txt | 3 + apps/crawler-ap/docker-compose.tests.yml | 2 +- apps/crawler-fetcher/docker-compose.tests.yml | 2 +- .../crawler-provider/docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- apps/dump-table/docker-compose.tests.yml | 2 +- apps/elk-kibana/docker-compose.tests.yml | 9 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../import-solr-data/docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- apps/munin-cron/docker-compose.tests.yml | 2 +- apps/munin-httpd/docker-compose.tests.yml | 2 +- apps/munin-node/docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- apps/postgresql-base/Dockerfile | 86 +++++++- apps/postgresql-base/bin/postgresql.sh | 16 ++ .../bin/update_memory_config.sh | 0 .../conf/environment | 0 .../conf/pg_ctl.conf | 0 .../conf/pg_hba.conf | 2 +- .../conf/pg_ident.conf | 0 .../conf/postgresql.conf | 0 .../conf/start.conf | 0 apps/postgresql-pgbouncer/Dockerfile | 2 +- apps/postgresql-repo-base/.dockerignore | 92 ++++++++ apps/postgresql-repo-base/Dockerfile | 19 ++ apps/postgresql-server/Dockerfile | 71 ++----- 
.../postgresql-server/bin/apply_migrations.sh | 2 +- .../bin/initialize_schema.sh | 17 +- .../{postgresql_server.sh => postgresql.sh} | 12 +- apps/postgresql-server/bin/pps | 23 +- .../docker-compose.tests.yml | 2 +- apps/rescrape-media/docker-compose.tests.yml | 2 +- apps/temporal-elasticsearch/.dockerignore | 92 ++++++++ apps/temporal-elasticsearch/Dockerfile | 32 +++ .../config/.dockerignore | 1 + apps/temporal-elasticsearch/config/.gitignore | 3 + .../config/temporal-elasticsearch.yml | 2 + .../index_template.json | 89 ++++++++ .../setup_index_template.sh | 34 +++ apps/temporal-postgresql/.dockerignore | 92 ++++++++ apps/temporal-postgresql/Dockerfile | 74 +++++++ .../bin/apply_migrations.sh | 45 ++++ .../bin/initialize_schema.sh | 68 ++++++ apps/temporal-postgresql/bin/postgresql.sh | 22 ++ apps/temporal-server/.dockerignore | 92 ++++++++ apps/temporal-server/Dockerfile | 79 +++++++ apps/temporal-server/bin/temporal.sh | 56 +++++ .../temporal-server/config/dynamicconfig.yaml | 60 ++++++ .../config/mediacloud_template.yaml | 199 ++++++++++++++++++ apps/temporal-server/docker-compose.tests.yml | 132 ++++++++++++ apps/temporal-webapp/.dockerignore | 92 ++++++++ apps/temporal-webapp/Dockerfile | 85 ++++++++ apps/tools/docker-compose.tests.yml | 2 +- apps/topics-base/docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- apps/topics-map/docker-compose.tests.yml | 2 +- apps/topics-mine/docker-compose.tests.yml | 2 +- apps/topics-snapshot/docker-compose.tests.yml | 2 +- apps/webapp-api/docker-compose.tests.yml | 2 +- apps/webapp-httpd/docker-compose.tests.yml | 2 +- .../docker-compose.tests.yml | 2 +- 83 files changed, 1653 insertions(+), 138 deletions(-) create mode 100755 apps/postgresql-base/bin/postgresql.sh rename apps/{postgresql-server => postgresql-base}/bin/update_memory_config.sh (100%) rename apps/{postgresql-server => postgresql-base}/conf/environment (100%) rename 
apps/{postgresql-server => postgresql-base}/conf/pg_ctl.conf (100%) rename apps/{postgresql-server => postgresql-base}/conf/pg_hba.conf (89%) rename apps/{postgresql-server => postgresql-base}/conf/pg_ident.conf (100%) rename apps/{postgresql-server => postgresql-base}/conf/postgresql.conf (100%) rename apps/{postgresql-server => postgresql-base}/conf/start.conf (100%) create mode 100644 apps/postgresql-repo-base/.dockerignore create mode 100644 apps/postgresql-repo-base/Dockerfile rename apps/postgresql-server/bin/{postgresql_server.sh => postgresql.sh} (58%) create mode 100644 apps/temporal-elasticsearch/.dockerignore create mode 100644 apps/temporal-elasticsearch/Dockerfile create mode 100644 apps/temporal-elasticsearch/config/.dockerignore create mode 100644 apps/temporal-elasticsearch/config/.gitignore create mode 100644 apps/temporal-elasticsearch/config/temporal-elasticsearch.yml create mode 100644 apps/temporal-elasticsearch/index_template.json create mode 100755 apps/temporal-elasticsearch/setup_index_template.sh create mode 100644 apps/temporal-postgresql/.dockerignore create mode 100644 apps/temporal-postgresql/Dockerfile create mode 100755 apps/temporal-postgresql/bin/apply_migrations.sh create mode 100755 apps/temporal-postgresql/bin/initialize_schema.sh create mode 100755 apps/temporal-postgresql/bin/postgresql.sh create mode 100644 apps/temporal-server/.dockerignore create mode 100644 apps/temporal-server/Dockerfile create mode 100755 apps/temporal-server/bin/temporal.sh create mode 100644 apps/temporal-server/config/dynamicconfig.yaml create mode 100644 apps/temporal-server/config/mediacloud_template.yaml create mode 100644 apps/temporal-server/docker-compose.tests.yml create mode 100644 apps/temporal-webapp/.dockerignore create mode 100644 apps/temporal-webapp/Dockerfile diff --git a/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml b/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml index 5b589005cd..fcecfafd01 100644 --- 
a/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml +++ b/apps/cliff-fetch-annotation-and-tag/docker-compose.tests.yml @@ -54,5 +54,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/common/docker-compose.tests.yml b/apps/common/docker-compose.tests.yml index f822f335f5..fd34c9e11d 100644 --- a/apps/common/docker-compose.tests.yml +++ b/apps/common/docker-compose.tests.yml @@ -91,7 +91,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/common/src/requirements.txt b/apps/common/src/requirements.txt index 96c3385b19..e6b3d08ecb 100644 --- a/apps/common/src/requirements.txt +++ b/apps/common/src/requirements.txt @@ -89,6 +89,9 @@ sentence_splitter==1.4 # Celery PostgreSQL result backend support sqlalchemy==1.3.22 +# Temporal's Python SDK +git+https://github.com/firdaus/temporal-python-sdk.git@8604d025ae1272b592d3d4dd430acd15eeb6562a#egg=temporal-python-sdk + # Normalizing URLs url_normalize==1.4.3 diff --git a/apps/crawler-ap/docker-compose.tests.yml b/apps/crawler-ap/docker-compose.tests.yml index 0c7ecbf4f3..a08abb051c 100644 --- a/apps/crawler-ap/docker-compose.tests.yml +++ b/apps/crawler-ap/docker-compose.tests.yml @@ -93,7 +93,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/crawler-fetcher/docker-compose.tests.yml b/apps/crawler-fetcher/docker-compose.tests.yml index bce5615b12..e13efb607b 100644 --- a/apps/crawler-fetcher/docker-compose.tests.yml +++ b/apps/crawler-fetcher/docker-compose.tests.yml @@ 
-56,7 +56,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/crawler-provider/docker-compose.tests.yml b/apps/crawler-provider/docker-compose.tests.yml index e1c6fa25b4..a31a8ceb87 100644 --- a/apps/crawler-provider/docker-compose.tests.yml +++ b/apps/crawler-provider/docker-compose.tests.yml @@ -49,5 +49,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/create-missing-partitions/docker-compose.tests.yml b/apps/create-missing-partitions/docker-compose.tests.yml index cbafcefd70..254208231a 100644 --- a/apps/create-missing-partitions/docker-compose.tests.yml +++ b/apps/create-missing-partitions/docker-compose.tests.yml @@ -43,5 +43,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml b/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml index 80a2254651..a7c029b9c0 100644 --- a/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml +++ b/apps/cron-generate-daily-rss-dumps/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-generate-media-health/docker-compose.tests.yml b/apps/cron-generate-media-health/docker-compose.tests.yml index d0e3004326..5e689e3a70 100644 --- a/apps/cron-generate-media-health/docker-compose.tests.yml +++ 
b/apps/cron-generate-media-health/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-generate-user-summary/docker-compose.tests.yml b/apps/cron-generate-user-summary/docker-compose.tests.yml index c02b92f977..da857d568a 100644 --- a/apps/cron-generate-user-summary/docker-compose.tests.yml +++ b/apps/cron-generate-user-summary/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-print-long-running-job-states/docker-compose.tests.yml b/apps/cron-print-long-running-job-states/docker-compose.tests.yml index f84022e5b3..17a8962be0 100644 --- a/apps/cron-print-long-running-job-states/docker-compose.tests.yml +++ b/apps/cron-print-long-running-job-states/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-refresh-stats/docker-compose.tests.yml b/apps/cron-refresh-stats/docker-compose.tests.yml index 2ccefa0ed7..a697d94982 100644 --- a/apps/cron-refresh-stats/docker-compose.tests.yml +++ b/apps/cron-refresh-stats/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-rescrape-due-media/docker-compose.tests.yml b/apps/cron-rescrape-due-media/docker-compose.tests.yml index 9284439ac1..d208555376 100644 --- 
a/apps/cron-rescrape-due-media/docker-compose.tests.yml +++ b/apps/cron-rescrape-due-media/docker-compose.tests.yml @@ -47,7 +47,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/cron-rescraping-changes/docker-compose.tests.yml b/apps/cron-rescraping-changes/docker-compose.tests.yml index 6437093c02..4c43bf8319 100644 --- a/apps/cron-rescraping-changes/docker-compose.tests.yml +++ b/apps/cron-rescraping-changes/docker-compose.tests.yml @@ -46,5 +46,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-set-media-primary-language/docker-compose.tests.yml b/apps/cron-set-media-primary-language/docker-compose.tests.yml index 69c7cd43a6..8864332bb0 100644 --- a/apps/cron-set-media-primary-language/docker-compose.tests.yml +++ b/apps/cron-set-media-primary-language/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/cron-set-media-subject-country/docker-compose.tests.yml b/apps/cron-set-media-subject-country/docker-compose.tests.yml index 99c543ee36..5d6c2b40d7 100644 --- a/apps/cron-set-media-subject-country/docker-compose.tests.yml +++ b/apps/cron-set-media-subject-country/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/dump-table/docker-compose.tests.yml 
b/apps/dump-table/docker-compose.tests.yml index 946442019d..21805412ce 100644 --- a/apps/dump-table/docker-compose.tests.yml +++ b/apps/dump-table/docker-compose.tests.yml @@ -43,5 +43,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/elk-kibana/docker-compose.tests.yml b/apps/elk-kibana/docker-compose.tests.yml index 26eb31d2c5..d98bca2078 100644 --- a/apps/elk-kibana/docker-compose.tests.yml +++ b/apps/elk-kibana/docker-compose.tests.yml @@ -49,11 +49,12 @@ services: - "9300:9300" volumes: - type: bind - source: ./../elk-elasticsearch/bin/elasticsearch.sh - target: /opt/elasticsearch/bin/elasticsearch.sh + source: ./../elk-elasticsearch/bin/elk-elasticsearch.sh + target: /opt/elasticsearch/bin/elk-elasticsearch.sh - type: bind - source: ./../elk-elasticsearch/config/ - target: /opt/elasticsearch/config/ + source: ./../elasticsearch-base/bin/elasticsearch.sh + target: /opt/elasticsearch/bin/elasticsearch.sh + # Not mounting config as it gets concatenated into a single file # Limit CPUs and RAM for the process to not get too greedy deploy: resources: diff --git a/apps/export-tables-to-backup-crawler/docker-compose.tests.yml b/apps/export-tables-to-backup-crawler/docker-compose.tests.yml index fa6ca3e3dc..520977affe 100644 --- a/apps/export-tables-to-backup-crawler/docker-compose.tests.yml +++ b/apps/export-tables-to-backup-crawler/docker-compose.tests.yml @@ -49,5 +49,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/extract-and-vector/docker-compose.tests.yml b/apps/extract-and-vector/docker-compose.tests.yml index b13ba25133..49d239eafc 100644 --- a/apps/extract-and-vector/docker-compose.tests.yml +++ 
b/apps/extract-and-vector/docker-compose.tests.yml @@ -69,7 +69,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/facebook-fetch-story-stats/docker-compose.tests.yml b/apps/facebook-fetch-story-stats/docker-compose.tests.yml index 52fd7b1b3d..2be7cdbdf4 100644 --- a/apps/facebook-fetch-story-stats/docker-compose.tests.yml +++ b/apps/facebook-fetch-story-stats/docker-compose.tests.yml @@ -53,7 +53,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/import-solr-data-for-testing/docker-compose.tests.yml b/apps/import-solr-data-for-testing/docker-compose.tests.yml index 679bb5b161..2bdbe20e94 100644 --- a/apps/import-solr-data-for-testing/docker-compose.tests.yml +++ b/apps/import-solr-data-for-testing/docker-compose.tests.yml @@ -50,7 +50,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/import-solr-data/docker-compose.tests.yml b/apps/import-solr-data/docker-compose.tests.yml index bb70f29a81..ffb28df709 100644 --- a/apps/import-solr-data/docker-compose.tests.yml +++ b/apps/import-solr-data/docker-compose.tests.yml @@ -75,7 +75,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/import-stories-feedly/docker-compose.tests.yml b/apps/import-stories-feedly/docker-compose.tests.yml index a0b0c338b0..be21298935 100644 --- 
a/apps/import-stories-feedly/docker-compose.tests.yml +++ b/apps/import-stories-feedly/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/import-stories-scrapehtml/docker-compose.tests.yml b/apps/import-stories-scrapehtml/docker-compose.tests.yml index 7feb9e03b1..dd31bd936d 100644 --- a/apps/import-stories-scrapehtml/docker-compose.tests.yml +++ b/apps/import-stories-scrapehtml/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/munin-cron/docker-compose.tests.yml b/apps/munin-cron/docker-compose.tests.yml index e24e465693..65fac73f80 100644 --- a/apps/munin-cron/docker-compose.tests.yml +++ b/apps/munin-cron/docker-compose.tests.yml @@ -58,7 +58,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/munin-httpd/docker-compose.tests.yml b/apps/munin-httpd/docker-compose.tests.yml index 17c380dfb8..b1d2c1a61c 100644 --- a/apps/munin-httpd/docker-compose.tests.yml +++ b/apps/munin-httpd/docker-compose.tests.yml @@ -88,7 +88,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/munin-node/docker-compose.tests.yml b/apps/munin-node/docker-compose.tests.yml index 9a99b0065b..e626f8bd25 100644 --- a/apps/munin-node/docker-compose.tests.yml +++ b/apps/munin-node/docker-compose.tests.yml 
@@ -43,7 +43,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml b/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml index bcabb80022..8f7aa3ef47 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml +++ b/apps/nytlabels-fetch-annotation-and-tag/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/podcast-fetch-episode/docker-compose.tests.yml b/apps/podcast-fetch-episode/docker-compose.tests.yml index 599c00c076..fd1eeaa610 100644 --- a/apps/podcast-fetch-episode/docker-compose.tests.yml +++ b/apps/podcast-fetch-episode/docker-compose.tests.yml @@ -55,5 +55,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/podcast-fetch-transcript/docker-compose.tests.yml b/apps/podcast-fetch-transcript/docker-compose.tests.yml index ea93f92b0a..3865e9d91f 100644 --- a/apps/podcast-fetch-transcript/docker-compose.tests.yml +++ b/apps/podcast-fetch-transcript/docker-compose.tests.yml @@ -101,7 +101,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/podcast-poll-due-operations/docker-compose.tests.yml b/apps/podcast-poll-due-operations/docker-compose.tests.yml index 912d4cc95e..584b501fef 100644 --- 
a/apps/podcast-poll-due-operations/docker-compose.tests.yml +++ b/apps/podcast-poll-due-operations/docker-compose.tests.yml @@ -50,5 +50,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/podcast-submit-operation/docker-compose.tests.yml b/apps/podcast-submit-operation/docker-compose.tests.yml index 349eaa1c3b..85d255dad3 100644 --- a/apps/podcast-submit-operation/docker-compose.tests.yml +++ b/apps/podcast-submit-operation/docker-compose.tests.yml @@ -52,5 +52,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/postgresql-base/Dockerfile b/apps/postgresql-base/Dockerfile index 327d9ecede..57f81ab8d0 100644 --- a/apps/postgresql-base/Dockerfile +++ b/apps/postgresql-base/Dockerfile @@ -1,14 +1,84 @@ # -# PostgreSQL base +# PostgreSQL base server # -FROM gcr.io/mcback/base:latest +FROM gcr.io/mcback/postgresql-repo-base:latest -# Add Add PostgreSQL GPG key -RUN curl -L https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - +# Install packages +RUN \ + # + # Install PostgreSQL + apt-get -y --no-install-recommends install \ + postgresql-11 \ + postgresql-client-11 \ + postgresql-contrib-11 \ + postgresql-plperl-11 \ + && \ + true -# Add PostgreSQL APT repository -RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ focal-pgdg main" > /etc/apt/sources.list.d/pgdg.list +# Make some run directories +RUN \ + mkdir -p /var/run/postgresql/11-main.pg_stat_tmp && \ + chown -R postgres:postgres /var/run/postgresql/11-main.pg_stat_tmp && \ + true -# Fetch new repositories -RUN apt-get -y update +# Write our own configuration +RUN rm -rf /etc/postgresql/11/main/ +COPY conf/ /etc/postgresql/11/main/ + +# This is where 
"update_memory_config.sh" script will write its memory settings +# which it will auto-determine from available RAM on every run. +RUN \ + touch /var/run/postgresql/postgresql-memory.conf && \ + chown postgres:postgres /var/run/postgresql/postgresql-memory.conf && \ + true + +# Copy helper scripts +RUN mkdir -p /opt/postgresql-base/ +COPY bin/* /opt/postgresql-base/bin/ + +USER postgres + +RUN \ + # + # Remove APT-initialized data directory because it doesn't have the right + # locale, doesn't use checksums etc. + rm -rf /var/lib/postgresql/11/main/ && \ + # + # Update memory configuration in case we decide to start PostgreSQL at + # build time + # Update memory configuration + /opt/postgresql-base/bin/update_memory_config.sh && \ + # + # Run initdb + mkdir -p /var/lib/postgresql/11/main/ && \ + /usr/lib/postgresql/11/bin/initdb \ + --pgdata=/var/lib/postgresql/11/main/ \ + --data-checksums \ + --encoding=UTF-8 \ + --lc-collate='en_US.UTF-8' \ + --lc-ctype='en_US.UTF-8' \ + && \ + true + +# VOLUME doesn't get set here as children of this image might amend the initial +# data directory somehow (e.g. pre-initialize it with some schema). Once you do +# that in the sub-image, don't forget to define VOLUME afterwards! + +# SIGTERM (Docker's default) will initiate PostgreSQL's "Smart Shutdown" mode +# which will then wait for the current transactions to finish. If there are +# active long-running queries, Docker will wait for "stop_grace_period", run +# out of patience and SIGKILL the process, forcing PostgreSQL to recover the +# database on restart. +# So, instead we stop the database with SIGINT which triggers "Fast Shutdown": +# active connections get terminated, and PostgreSQL shuts down considerably +# faster and safer. 
+STOPSIGNAL SIGINT + +# Server +EXPOSE 5432 + +# *Not* adding /opt/postgresql-base/ to $PATH so that users get to pick which +# specific version of "postgresql.sh" to run + +CMD ["/opt/postgresql-base/bin/postgresql.sh"] diff --git a/apps/postgresql-base/bin/postgresql.sh b/apps/postgresql-base/bin/postgresql.sh new file mode 100755 index 0000000000..824d60f8da --- /dev/null +++ b/apps/postgresql-base/bin/postgresql.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -u +set -e + +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" + +# Update memory configuration +/opt/postgresql-base/bin/update_memory_config.sh + +# Start PostgreSQL +exec "${MC_POSTGRESQL_BIN_DIR}/postgres" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -c "config_file=${MC_POSTGRESQL_CONF_PATH}" diff --git a/apps/postgresql-server/bin/update_memory_config.sh b/apps/postgresql-base/bin/update_memory_config.sh similarity index 100% rename from apps/postgresql-server/bin/update_memory_config.sh rename to apps/postgresql-base/bin/update_memory_config.sh diff --git a/apps/postgresql-server/conf/environment b/apps/postgresql-base/conf/environment similarity index 100% rename from apps/postgresql-server/conf/environment rename to apps/postgresql-base/conf/environment diff --git a/apps/postgresql-server/conf/pg_ctl.conf b/apps/postgresql-base/conf/pg_ctl.conf similarity index 100% rename from apps/postgresql-server/conf/pg_ctl.conf rename to apps/postgresql-base/conf/pg_ctl.conf diff --git a/apps/postgresql-server/conf/pg_hba.conf b/apps/postgresql-base/conf/pg_hba.conf similarity index 89% rename from apps/postgresql-server/conf/pg_hba.conf rename to apps/postgresql-base/conf/pg_hba.conf index d734d1ffe9..1338cb83eb 100644 --- a/apps/postgresql-server/conf/pg_hba.conf +++ b/apps/postgresql-base/conf/pg_hba.conf @@ -6,4 +6,4 @@ host all all ::1/128 md5 local replication all peer host replication all 
127.0.0.1/32 md5 host replication all ::1/128 md5 -host all mediacloud samenet md5 +host all all samenet md5 diff --git a/apps/postgresql-server/conf/pg_ident.conf b/apps/postgresql-base/conf/pg_ident.conf similarity index 100% rename from apps/postgresql-server/conf/pg_ident.conf rename to apps/postgresql-base/conf/pg_ident.conf diff --git a/apps/postgresql-server/conf/postgresql.conf b/apps/postgresql-base/conf/postgresql.conf similarity index 100% rename from apps/postgresql-server/conf/postgresql.conf rename to apps/postgresql-base/conf/postgresql.conf diff --git a/apps/postgresql-server/conf/start.conf b/apps/postgresql-base/conf/start.conf similarity index 100% rename from apps/postgresql-server/conf/start.conf rename to apps/postgresql-base/conf/start.conf diff --git a/apps/postgresql-pgbouncer/Dockerfile b/apps/postgresql-pgbouncer/Dockerfile index 37d2dbc4c0..a2496d679f 100644 --- a/apps/postgresql-pgbouncer/Dockerfile +++ b/apps/postgresql-pgbouncer/Dockerfile @@ -2,7 +2,7 @@ # PgBouncer # -FROM gcr.io/mcback/postgresql-base:latest +FROM gcr.io/mcback/postgresql-repo-base:latest # Install PgBouncer RUN \ diff --git a/apps/postgresql-repo-base/.dockerignore b/apps/postgresql-repo-base/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/postgresql-repo-base/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/postgresql-repo-base/Dockerfile b/apps/postgresql-repo-base/Dockerfile new file mode 100644 index 0000000000..43c9660011 --- /dev/null +++ b/apps/postgresql-repo-base/Dockerfile @@ -0,0 +1,19 @@ +# +# PostgreSQL repository base +# + +FROM gcr.io/mcback/base:latest + +RUN \ + # + # Add Add PostgreSQL GPG key + curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ + # + # Add PostgreSQL APT repository + echo "deb http://apt.postgresql.org/pub/repos/apt/ focal-pgdg main" \ + > /etc/apt/sources.list.d/pgdg.list && \ + # + # Fetch new repositories + apt-get -y update && \ + # + true diff --git a/apps/postgresql-server/Dockerfile b/apps/postgresql-server/Dockerfile index a82b832a4f..e5609decb9 100644 --- a/apps/postgresql-server/Dockerfile +++ b/apps/postgresql-server/Dockerfile @@ -1,53 +1,36 @@ # -# PostgreSQL server +# Main backend PostgreSQL server # FROM gcr.io/mcback/postgresql-base:latest -# Install packages 
+USER root RUN \ - # - # Install PostgreSQL - apt-get -y --no-install-recommends install \ - postgresql-11 \ - postgresql-client-11 \ - postgresql-contrib-11 \ - postgresql-plperl-11 \ + mkdir -p \ + /opt/postgresql-server/bin/ \ + /opt/postgresql-server/schema/ \ && \ true -# Make some run directories -RUN \ - mkdir -p /var/run/postgresql/11-main.pg_stat_tmp && \ - chown -R postgres:postgres /var/run/postgresql/11-main.pg_stat_tmp && \ - true - -# Write our own configuration -RUN rm -rf /etc/postgresql/11/main/ -COPY conf/ /etc/postgresql/11/main/ - -# This is where "update_memory_config.sh" script will write its memory settings -# which it will auto-determine from available RAM on every run. -RUN \ - touch /var/run/postgresql/postgresql-memory.conf && \ - chown postgres:postgres /var/run/postgresql/postgresql-memory.conf && \ - true - # Copy helper scripts, schema, migrations -RUN mkdir -p /opt/mediacloud/ -COPY bin/ /opt/mediacloud/bin/ -COPY schema/ /opt/mediacloud/schema/ - -USER postgres +COPY bin/* /opt/postgresql-server/bin/ +COPY schema/ /opt/postgresql-server/schema/ # Initialize data volume, create users, a database, and initialize it with # schema # If a new empty volume gets mounted to /var/lib/postgresql/ upon # container start, Docker will copy the files from the container to the volume -RUN /opt/mediacloud/bin/initialize_schema.sh +USER postgres +RUN /opt/postgresql-server/bin/initialize_schema.sh + +# Remove the init script so that someone doesn't accidentally run it in +# production +USER root +RUN rm /opt/postgresql-server/bin/initialize_schema.sh +USER postgres ENV \ - PATH="/opt/mediacloud/bin:${PATH}" \ + PATH="/opt/postgresql-server/bin:${PATH}" \ # # Make sure that we can connect via "psql" without sudoing into "postgres" user PGHOST=localhost \ @@ -56,26 +39,8 @@ ENV \ PGPASSWORD=mediacloud \ PGDATABASE=mediacloud -# Remove the init script so that someone doesn't accidentally run it in -# production -USER root -RUN rm 
/opt/mediacloud/bin/initialize_schema.sh -USER postgres - # PostgreSQL data VOLUME /var/lib/postgresql/ -# SIGTERM (Docker's default) will initiate PostgreSQL's "Smart Shutdown" mode -# which will then wait for the current transactions to finish. If there are -# active long-running queries, Docker will wait for "stop_grace_period", run -# out of patience and SIGKILL the process, forcing PostgreSQL to recover the -# database on restart. -# So, instead we stop the database with SIGINT which triggers "Fast Shutdown": -# active connections get terminated, and PostgreSQL shuts down considerably -# faster and safer. -STOPSIGNAL SIGINT - -# Server -EXPOSE 5432 - -CMD ["/opt/mediacloud/bin/postgresql_server.sh"] +# Use our own wrapper script which runs schema upgrades first +CMD ["/opt/postgresql-server/bin/postgresql.sh"] diff --git a/apps/postgresql-server/bin/apply_migrations.sh b/apps/postgresql-server/bin/apply_migrations.sh index 11535e0d90..b69ab04a18 100755 --- a/apps/postgresql-server/bin/apply_migrations.sh +++ b/apps/postgresql-server/bin/apply_migrations.sh @@ -7,7 +7,7 @@ MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" -SCHEMA_DIR="/opt/mediacloud/schema/" +SCHEMA_DIR="/opt/postgresql-server/schema/" SCHEMA_PATH="${SCHEMA_DIR}/mediawords.sql" MIGRATIONS_DIR="${SCHEMA_DIR}/migrations/" diff --git a/apps/postgresql-server/bin/initialize_schema.sh b/apps/postgresql-server/bin/initialize_schema.sh index e4dafb9a82..de545a0a48 100755 --- a/apps/postgresql-server/bin/initialize_schema.sh +++ b/apps/postgresql-server/bin/initialize_schema.sh @@ -8,20 +8,7 @@ MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" # Update memory configuration -/opt/mediacloud/bin/update_memory_config.sh - -# Remove APT-initialized data directory because it doesn't have the right -# locale, doesn't 
use checksums etc. -rm -rf /var/lib/postgresql/11/main/ - -# Run initdb -mkdir -p "${MC_POSTGRESQL_DATA_DIR}" -"${MC_POSTGRESQL_BIN_DIR}/initdb" \ - --pgdata="${MC_POSTGRESQL_DATA_DIR}" \ - --data-checksums \ - --encoding=UTF-8 \ - --lc-collate='en_US.UTF-8' \ - --lc-ctype='en_US.UTF-8' +/opt/postgresql-base/bin/update_memory_config.sh "${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ -o "-c config_file=${MC_POSTGRESQL_CONF_PATH}" \ @@ -49,7 +36,7 @@ EOF psql -v ON_ERROR_STOP=1 -c "${CREATE_DB_SQL}" # Initialize with schema -psql -v ON_ERROR_STOP=1 -d mediacloud -f /opt/mediacloud/schema/mediawords.sql +psql -v ON_ERROR_STOP=1 -d mediacloud -f /opt/postgresql-server/schema/mediawords.sql # Stop PostgreSQL "${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ diff --git a/apps/postgresql-server/bin/postgresql_server.sh b/apps/postgresql-server/bin/postgresql.sh similarity index 58% rename from apps/postgresql-server/bin/postgresql_server.sh rename to apps/postgresql-server/bin/postgresql.sh index 50661ff1dd..cf7e7c5c57 100755 --- a/apps/postgresql-server/bin/postgresql_server.sh +++ b/apps/postgresql-server/bin/postgresql.sh @@ -3,12 +3,8 @@ set -u set -e -MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" -MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" -MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" - # Update memory configuration -/opt/mediacloud/bin/update_memory_config.sh +/opt/postgresql-base/bin/update_memory_config.sh # Run schema migrations if needed if [ -e /var/lib/postgresql/first_run ]; then @@ -19,11 +15,9 @@ elif [ ! -z ${MC_POSTGRESQL_SKIP_MIGRATIONS+x} ]; then echo "Skipping schema migrations because 'MC_POSTGRESQL_SKIP_MIGRATIONS' is set." else echo "Applying schema migrations..." - /opt/mediacloud/bin/apply_migrations.sh + /opt/postgresql-server/bin/apply_migrations.sh echo "Done applying schema migrations." 
fi # Start PostgreSQL -exec "${MC_POSTGRESQL_BIN_DIR}/postgres" \ - -D "${MC_POSTGRESQL_DATA_DIR}" \ - -c "config_file=${MC_POSTGRESQL_CONF_PATH}" +exec /opt/postgresql-base/bin/postgresql.sh diff --git a/apps/postgresql-server/bin/pps b/apps/postgresql-server/bin/pps index f007f3dfb8..ff24e59a45 100755 --- a/apps/postgresql-server/bin/pps +++ b/apps/postgresql-server/bin/pps @@ -6,7 +6,26 @@ else COLS=`tput cols` fi -echo "select psa.pid, min(application_name) as client, substr(query_start::text, 0, 20) as date, granted as l, regexp_replace(query, E'[\\n\\r ]+', ' ', 'g' ) q from pg_stat_activity psa left join pg_locks pl on ( psa.pid = pl.pid and pl.granted = 'f' ) where state not like 'idle%' group by psa.pid, usename, state, query_start, granted, q order by query_start desc" | psql mediacloud | cut -c 1-$COLS - +cat < /opt/elasticsearch/config/elasticsearch.yml && \ + # + true + +USER elasticsearch + +# Preload with Temporal index template +# (https://github.com/temporalio/temporal/blob/v1.7.0/schema/elasticsearch/v7/visibility/index_template.json) +COPY index_template.json setup_index_template.sh / +RUN /setup_index_template.sh +USER root +RUN rm /index_template.json /setup_index_template.sh +USER elasticsearch + +CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] diff --git a/apps/temporal-elasticsearch/config/.dockerignore b/apps/temporal-elasticsearch/config/.dockerignore new file mode 100644 index 0000000000..b3c0a37b66 --- /dev/null +++ b/apps/temporal-elasticsearch/config/.dockerignore @@ -0,0 +1 @@ +elasticsearch.keystore diff --git a/apps/temporal-elasticsearch/config/.gitignore b/apps/temporal-elasticsearch/config/.gitignore new file mode 100644 index 0000000000..3eb03f777e --- /dev/null +++ b/apps/temporal-elasticsearch/config/.gitignore @@ -0,0 +1,3 @@ +# Might get created by a Docker container +elasticsearch.keystore + diff --git a/apps/temporal-elasticsearch/config/temporal-elasticsearch.yml 
b/apps/temporal-elasticsearch/config/temporal-elasticsearch.yml new file mode 100644 index 0000000000..e96f46b92d --- /dev/null +++ b/apps/temporal-elasticsearch/config/temporal-elasticsearch.yml @@ -0,0 +1,2 @@ +cluster.name: temporal-elasticsearch +node.name: temporal-elasticsearch diff --git a/apps/temporal-elasticsearch/index_template.json b/apps/temporal-elasticsearch/index_template.json new file mode 100644 index 0000000000..c89ab7cd89 --- /dev/null +++ b/apps/temporal-elasticsearch/index_template.json @@ -0,0 +1,89 @@ +{ + "order": 0, + "index_patterns": [ + "temporal-visibility-*" + ], + "settings": { + "index": { + "number_of_shards": "5", + "number_of_replicas": "0", + "search.idle.after": "365d" + } + }, + "mappings": { + "dynamic": "false", + "properties": { + "NamespaceId": { + "type": "keyword" + }, + "WorkflowId": { + "type": "keyword" + }, + "RunId": { + "type": "keyword" + }, + "WorkflowType": { + "type": "keyword" + }, + "StartTime": { + "type": "long" + }, + "ExecutionTime": { + "type": "long" + }, + "CloseTime": { + "type": "long" + }, + "ExecutionStatus": { + "type": "long" + }, + "HistoryLength": { + "type": "long" + }, + "KafkaKey": { + "type": "keyword" + }, + "Encoding": { + "type": "keyword" + }, + "TaskQueue": { + "type": "keyword" + }, + "Attr": { + "properties": { + "TemporalChangeVersion": { + "type": "keyword" + }, + "CustomStringField": { + "type": "text" + }, + "CustomKeywordField": { + "type": "keyword" + }, + "CustomIntField": { + "type": "long" + }, + "CustomDoubleField": { + "type": "double" + }, + "CustomBoolField": { + "type": "boolean" + }, + "CustomDatetimeField": { + "type": "date" + }, + "CustomNamespace": { + "type": "keyword" + }, + "Operator": { + "type": "keyword" + }, + "BinaryChecksums": { + "type": "keyword" + } + } + } + } + }, + "aliases": {} +} diff --git a/apps/temporal-elasticsearch/setup_index_template.sh b/apps/temporal-elasticsearch/setup_index_template.sh new file mode 100755 index 0000000000..ef42765eec 
--- /dev/null +++ b/apps/temporal-elasticsearch/setup_index_template.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -u +set -e + + +echo "Starting Elasticsearch for index setup..." +/opt/elasticsearch/bin/elasticsearch & + +for i in {1..120}; do + echo "Waiting for Elasticsearch to start..." + if curl --silent --show-error --fail "http://127.0.0.1:9200/_cluster/health"; then + break + else + sleep 1 + fi +done + + +echo "Creating Temporal index template..." +curl -XPUT "http://127.0.0.1:9200/_template/temporal-visibility-template" \ + --fail \ + --silent \ + --show-error \ + -H "Content-Type: application/json" \ + -d @index_template.json +echo "Done creating Temporal index template." + + +echo "Stopping Elasticsearch..." +killall java +while pgrep java > /dev/null; do + sleep 0.5 +done diff --git a/apps/temporal-postgresql/.dockerignore b/apps/temporal-postgresql/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-postgresql/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-postgresql/Dockerfile b/apps/temporal-postgresql/Dockerfile new file mode 100644 index 0000000000..4c175ff017 --- /dev/null +++ b/apps/temporal-postgresql/Dockerfile @@ -0,0 +1,74 @@ +# +# PostgreSQL server for Temporal's workflow storage +# + +FROM gcr.io/mcback/postgresql-base:latest + +USER root + +RUN \ + mkdir -p \ + /opt/temporal-postgresql/bin/ \ + /opt/temporal-postgresql/schema/ \ + && \ + # + # Install temporal-sql-tool + # FIXME use upstream CI builds once they start working + # Keep version that's being used in sync with temporal-server + mkdir -p /var/tmp/temporal/ && \ + /dl_to_stdout.sh "https://github.com/mediacloud/temporal-ci-builds/releases/download/v1.7.0-gr/temporal-ci-builds_1.7.0-gr_linux_amd64.tar.gz" | \ + tar -zx -C /var/tmp/temporal/ && \ + mv /var/tmp/temporal/temporal-sql-tool /usr/bin/ && \ + rm -rf /var/tmp/temporal/ && \ + true + +# Check out schema +RUN \ + apt-get -y --no-install-recommends install 
git && \ + mkdir -p /var/tmp/temporal/ && \ + cd /var/tmp/temporal/ && \ + git init && \ + git remote add origin https://github.com/temporalio/temporal.git && \ + # HEAD of "v1.7.0" tag: + git fetch --depth 1 origin e3496b1c51bfaaae8142b78e4032cc791de8a76f && \ + git checkout FETCH_HEAD && \ + mv schema/postgresql/* /opt/temporal-postgresql/schema/ && \ + cd / && \ + rm -rf /var/tmp/temporal/ && \ + apt-get -y remove git && \ + apt-get -y autoremove && \ + apt-get -y clean && \ + true + +# Copy helper scripts +COPY bin/* /opt/temporal-postgresql/bin/ + +USER postgres + +# Initialize data volume, create users, a database, and initialize it with +# schema +# If a new empty volume gets mounted to /var/lib/postgresql/ upon +# container start, Docker will copy the files from the container to the volume +RUN /opt/temporal-postgresql/bin/initialize_schema.sh + +# Remove the init script so that someone doesn't accidentally run it in +# production +USER root +RUN rm /opt/temporal-postgresql/bin/initialize_schema.sh +USER postgres + +ENV \ + PATH="/opt/temporal-postgresql/bin:${PATH}" \ + # + # Make sure that we can connect via "psql" without sudoing into "postgres" user + PGHOST=localhost \ + PGPORT=5432 \ + PGUSER=temporal \ + PGPASSWORD=temporal \ + PGDATABASE=temporal + +# PostgreSQL data +VOLUME /var/lib/postgresql/ + +# Use our own wrapper script which runs schema upgrades first +CMD ["/opt/temporal-postgresql/bin/postgresql.sh"] diff --git a/apps/temporal-postgresql/bin/apply_migrations.sh b/apps/temporal-postgresql/bin/apply_migrations.sh new file mode 100755 index 0000000000..fa9a9f8b94 --- /dev/null +++ b/apps/temporal-postgresql/bin/apply_migrations.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +set -u +set -e + +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" + +# Apply migrations when running on a different port so that clients don't end +# up 
connecting in the middle of migrating +TEMP_PORT=12345 + +# In case the database is in recovery, wait for up to 1 hour for it to complete +PGCTL_START_TIMEOUT=3600 + +# Start PostgreSQL on a temporary port +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -o "-c config_file=${MC_POSTGRESQL_CONF_PATH} -p ${TEMP_PORT}" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -t "${PGCTL_START_TIMEOUT}" \ + -w \ + start + +SCHEMAS_DIR="/opt/temporal-postgresql/schema/v96" +TSQL="temporal-sql-tool \ + --plugin postgres \ + --ep 127.0.0.1 \ + -p 12345 \ + -u temporal \ + --pw temporal \ +" + +MAIN_SCHEMA_DIR="${SCHEMAS_DIR}/temporal/versioned" +$TSQL --db temporal update-schema -d "${MAIN_SCHEMA_DIR}" + +VISIBILITY_SCHEMA_DIR="${SCHEMAS_DIR}/visibility/versioned" +$TSQL --db temporal_visibility update-schema -d "${VISIBILITY_SCHEMA_DIR}" + +# Stop PostgreSQL +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -m fast \ + -w \ + stop diff --git a/apps/temporal-postgresql/bin/initialize_schema.sh b/apps/temporal-postgresql/bin/initialize_schema.sh new file mode 100755 index 0000000000..e9620d6ae8 --- /dev/null +++ b/apps/temporal-postgresql/bin/initialize_schema.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# +# FIXME reuse code between "initialize_schema.sh" and "apply_migrations.sh" +# + +set -u +set -e + +MC_POSTGRESQL_BIN_DIR="/usr/lib/postgresql/11/bin/" +MC_POSTGRESQL_DATA_DIR="/var/lib/postgresql/11/main/" +MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" + +# Update memory configuration +/opt/postgresql-base/bin/update_memory_config.sh + +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -o "-c config_file=${MC_POSTGRESQL_CONF_PATH}" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -w \ + -t 1200 \ + start + +psql -v ON_ERROR_STOP=1 -c "CREATE USER temporal WITH PASSWORD 'temporal' SUPERUSER;" + +SCHEMAS_DIR="/opt/temporal-postgresql/schema/v96" +TSQL="temporal-sql-tool \ + --plugin postgres \ + --ep 127.0.0.1 \ + -p 5432 \ + -u temporal \ + --pw temporal \ +" + 
+MAIN_SCHEMA_DIR="${SCHEMAS_DIR}/temporal/versioned" +$TSQL create --db temporal +$TSQL --db temporal setup-schema -v 0.0 +$TSQL --db temporal update-schema -d "${MAIN_SCHEMA_DIR}" + +VISIBILITY_SCHEMA_DIR="${SCHEMAS_DIR}/visibility/versioned" +$TSQL create --db temporal_visibility +$TSQL --db temporal_visibility setup-schema -v 0.0 +$TSQL --db temporal_visibility update-schema -d "${VISIBILITY_SCHEMA_DIR}" + +# Given that "temporal-sql-tool" returns 0 on errors too, make sure that something got created +psql -d temporal -c 'SELECT * FROM visibility_tasks' > /dev/null +psql -d temporal_visibility -c 'SELECT * FROM schema_version' > /dev/null + +# Stop PostgreSQL +"${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ + -D "${MC_POSTGRESQL_DATA_DIR}" \ + -m fast \ + -w \ + -t 1200 \ + stop + +# Create a file that will denote that we're running off a fresh data volume and +# it's the first time ever that we've started the server +cat > /var/lib/postgresql/first_run << EOF +If this file exists, it means that a fresh data volume was just mounted to the +container, and the container is about to run for the first time ever, so +there's no point in attempting to check the schema version and apply +migrations. + +After the first time this container gets run, this file will get deleted and +every subsequent run of the same container will then attempt to apply +migrations in order to upgrade the schema before continuing with anything else. +EOF +chown postgres:postgres /var/lib/postgresql/first_run diff --git a/apps/temporal-postgresql/bin/postgresql.sh b/apps/temporal-postgresql/bin/postgresql.sh new file mode 100755 index 0000000000..4b7af3a946 --- /dev/null +++ b/apps/temporal-postgresql/bin/postgresql.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -u +set -e + +# Update memory configuration +/opt/postgresql-base/bin/update_memory_config.sh + +# Run schema migrations if needed +if [ -e /var/lib/postgresql/first_run ]; then + echo "Skipping schema migrations on first run..." 
+ rm /var/lib/postgresql/first_run +elif [ ! -z ${MC_TEMPORAL_SKIP_MIGRATIONS+x} ]; then + echo "Skipping schema migrations because 'MC_TEMPORAL_SKIP_MIGRATIONS' is set." +else + echo "Applying schema migrations..." + /opt/temporal-postgresql/bin/apply_migrations.sh + echo "Done applying schema migrations." +fi + +# Start PostgreSQL +exec /opt/postgresql-base/bin/postgresql.sh diff --git a/apps/temporal-server/.dockerignore b/apps/temporal-server/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-server/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-server/Dockerfile b/apps/temporal-server/Dockerfile new file mode 100644 index 0000000000..501162a99e --- /dev/null +++ b/apps/temporal-server/Dockerfile @@ -0,0 +1,79 @@ +# +# Temporal server +# + +FROM gcr.io/mcback/base:latest + +# FIXME +RUN apt-get -y update + +# Install dependencies +RUN \ + apt-get -y --no-install-recommends install \ + libprotobuf17 \ + && \ + true + +# Install Temporal server +RUN \ + # FIXME use upstream CI builds once they start working + # Keep version that's being used in sync with temporal-postgresql + mkdir -p /var/tmp/temporal/ && \ + /dl_to_stdout.sh "https://github.com/mediacloud/temporal-ci-builds/releases/download/v1.7.0-gr/temporal-ci-builds_1.7.0-gr_linux_amd64.tar.gz" | \ + tar -zx -C /var/tmp/temporal/ && \ + mv /var/tmp/temporal/temporal-server /var/tmp/temporal/tctl /usr/bin/ && \ + cd / && \ + rm -rf /var/tmp/temporal/ && \ + true + +RUN \ + # + # Install envsubst for generating 
configuration + apt-get -y --no-install-recommends install \ + gettext-base \ + && \ + # + # Install utilities useful for tctl + apt-get -y --no-install-recommends install \ + jq \ + && \ + # + # Add unprivileged user the service will run as + useradd -ms /bin/bash temporal && \ + # + # Directory for wrapper scripts + mkdir -p /opt/temporal-server/bin/ && \ + # + # Directory for configuration (has to be writable to generate final + # configuration files from templates) + mkdir -p /opt/temporal-server/config/ && \ + chown temporal:temporal /opt/temporal-server/config/ && \ + # + # Directories for first run shim and archival + mkdir -p /var/lib/temporal/archival/temporal/ && \ + mkdir -p /var/lib/temporal/archival/visibility/ && \ + chown -R temporal:temporal /var/lib/temporal/ && \ + # + # Create a file that will denote whether it's the first run of this service + # mounted to a specific volume + touch /var/lib/temporal/first_run && \ + # + true + +COPY bin/* /opt/temporal-server/bin/ +COPY config/* /opt/temporal-server/config/ + +ENV PATH="/opt/temporal-server/bin:${PATH}" \ + # https://docs.temporal.io/docs/tctl/#environment-variables + TEMPORAL_CLI_ADDRESS="temporal-server:7233" \ + TEMPORAL_CLI_NAMESPACE="default" + +# Archives and first run shim +VOLUME /var/lib/temporal/ + +# Port descriptions: https://docs.temporal.io/docs/server-architecture/ +EXPOSE 6933 6934 6935 6939 7233 7234 7235 7239 + +USER temporal + +CMD ["temporal.sh"] diff --git a/apps/temporal-server/bin/temporal.sh b/apps/temporal-server/bin/temporal.sh new file mode 100755 index 0000000000..f103074fd6 --- /dev/null +++ b/apps/temporal-server/bin/temporal.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +set -u +set -e + +# Hostname for binding configuration +export MC_TEMPORAL_HOST_IP=$(hostname -i) + +# Generate final config +envsubst \ + < /opt/temporal-server/config/mediacloud_template.yaml \ + > /opt/temporal-server/config/mediacloud.yaml + +# FIXME give up and crash after a while + +while true; do + 
echo "Waiting for PostgreSQL to start..." + if nc -z -w 10 temporal-postgresql 5432; then + break + else + sleep 1 + fi +done + +while true; do + echo "Waiting for Elasticsearch to start..." + if curl --silent --show-error --fail "http://temporal-elasticsearch:9200/_cluster/health"; then + break + else + sleep 1 + fi +done + +register_default_namespace() { + echo "Registering default namespace: $DEFAULT_NAMESPACE" + until tctl --ns default namespace describe < /dev/null; do + echo "Default namespace not found. Creating..." + sleep 1 + # FIXME doesn't work + # FIXME retention period super short + tctl --ns default namespace register --rd 1 --desc "Default namespace for Temporal Server" || echo "Creating default namespace failed." + done + echo "Default namespace registration complete." +} + +if [ -e /var/lib/temporal/first_run ]; then + echo "Registering default namespace on first run..." + # FIXME not that great to run it in the background + register_default_namespace & + rm /var/lib/temporal/first_run +fi + +exec temporal-server \ + --root /opt/temporal-server \ + --env mediacloud \ + start diff --git a/apps/temporal-server/config/dynamicconfig.yaml b/apps/temporal-server/config/dynamicconfig.yaml new file mode 100644 index 0000000000..0576043ba0 --- /dev/null +++ b/apps/temporal-server/config/dynamicconfig.yaml @@ -0,0 +1,60 @@ +frontend.enableClientVersionCheck: +- value: true + constraints: {} +history.persistenceMaxQPS: +- value: 3000 + constraints: {} +frontend.persistenceMaxQPS: +- value: 3000 + constraints: {} +frontend.historyMgrNumConns: +- value: 10 + constraints: {} +frontend.throttledLogRPS: +- value: 20 + constraints: {} +history.historyMgrNumConns: +- value: 50 + constraints: {} +history.defaultActivityRetryPolicy: +- value: + InitialIntervalInSeconds: 1 + MaximumIntervalCoefficient: 100.0 + BackoffCoefficient: 2.0 + MaximumAttempts: 0 +history.defaultWorkflowRetryPolicy: +- value: + InitialIntervalInSeconds: 1 + MaximumIntervalCoefficient: 100.0 + 
BackoffCoefficient: 2.0 + MaximumAttempts: 0 +system.advancedVisibilityWritingMode: + - value: "on" + constraints: {} +system.enableReadVisibilityFromES: + - value: true + constraints: {} +frontend.validSearchAttributes: + - value: + NamespaceId: "Keyword" + WorkflowId: "Keyword" + RunId: "Keyword" + WorkflowType: "Keyword" + StartTime: "Int" + ExecutionTime: "Int" + CloseTime: "Int" + ExecutionStatus: "Int" + HistoryLength: "Int" + TaskQueue: "Keyword" + KafkaKey: "Keyword" + Encoding: "Keyword" + CustomStringField: "String" + CustomKeywordField: "Keyword" + CustomIntField: "Int" + CustomDoubleField: "Double" + CustomBoolField: "Bool" + CustomDatetimeField: "Datetime" + TemporalChangeVersion: "Keyword" + BinaryChecksums: "Keyword" + CustomNamespace: "Keyword" + Operator: "Keyword" diff --git a/apps/temporal-server/config/mediacloud_template.yaml b/apps/temporal-server/config/mediacloud_template.yaml new file mode 100644 index 0000000000..e872217ba6 --- /dev/null +++ b/apps/temporal-server/config/mediacloud_template.yaml @@ -0,0 +1,199 @@ +# FIXME adapt Helm chart configuration here + +log: + stdout: true + level: info + +persistence: + # FIXME is this right? 
+ numHistoryShards: 4096 + defaultStore: default + visibilityStore: visibility + advancedVisibilityStore: es-visibility + datastores: + + default: + sql: + pluginName: "postgres" + databaseName: "temporal" + + # We skip the PgBouncer and connect directly as PgBouncer's + # support is not too well documented + connectAddr: "temporal-postgresql:5432" + + connectProtocol: "tcp" + user: "temporal" + password: "temporal" + + # FIXME keep in sync with PostgreSQL configuration + maxConns: 20 + maxIdleConns: 20 + maxConnLifetime: 1h + + visibility: + sql: + pluginName: "postgres" + databaseName: "temporal_visibility" + connectAddr: "temporal-postgresql:5432" + connectProtocol: "tcp" + user: "temporal" + password: "temporal" + maxConns: 10 + maxIdleConns: 10 + maxConnLifetime: 1h + + es-visibility: + elasticsearch: + # Keep in sync with temporal-elasticsearch version: + version: "v7" + url: + scheme: http + host: "temporal-elasticsearch:9200" + username: "" + password: "" + indices: + # FIXME rename to "temporal-visibility" perhaps? + visibility: "temporal-visibility-dev" + +global: + membership: + maxJoinDuration: 30s + # broadcastAddress: "0.0.0.0" + tls: + internode: + # This server section configures the TLS certificate that internal temporal + # cluster nodes (history or matching) present to other clients within the Temporal Cluster. 
+ server: + requireClientAuth: false + + certFile: "" + keyFile: "" + clientCaFiles: + - "" + + certData: "" + keyData: "" + clientCaData: + - "" + + # This client section is used to configure the TLS clients within + # the Temporal Cluster that connect to an Internode (history or matching) + client: + serverName: "" + disableHostVerification: false + rootCaFiles: + - "" + rootCaData: + - "" + frontend: + # This server section configures the TLS certificate that the Frontend + # server presents to all clients (specifically the Worker role within + # the Temporal Cluster and all External SDKs connecting to the Cluster) + server: + requireClientAuth: false + certFile: "" + keyFile: "" + clientCaFiles: + - "" + - "" + + certData: "" + keyData: "" + clientCaData: + - "" + - "" + + # This client section is used to configure the TLS clients within + # the Temporal Cluster (specifically the Worker role) that connect to the Frontend service + client: + serverName: "" + disableHostVerification: false + rootCaFiles: + - "" + rootCaData: + - "" + + # FIXME collect statistics with either statsd or prometheus: + # metrics: + # statsd: + # hostPort: "temporal-statsd:8125" + # prefix: "temporal" + + metrics: + prometheus: + timerType: "histogram" + listenAddress: "temporal-prometheus:9090" + +services: + frontend: + rpc: + grpcPort: 7233 + membershipPort: 6933 + bindOnIP: "${MC_TEMPORAL_HOST_IP}" + + matching: + rpc: + grpcPort: 7235 + membershipPort: 6935 + bindOnIP: "${MC_TEMPORAL_HOST_IP}" + + history: + rpc: + grpcPort: 7234 + membershipPort: 6934 + bindOnIP: "${MC_TEMPORAL_HOST_IP}" + + worker: + rpc: + grpcPort: 7239 + membershipPort: 6939 + bindOnIP: "${MC_TEMPORAL_HOST_IP}" + +clusterMetadata: + enableGlobalNamespace: false + failoverVersionIncrement: 10 + masterClusterName: "active" + currentClusterName: "active" + clusterInformation: + active: + enabled: true + initialFailoverVersion: 1 + rpcName: "frontend" + rpcAddress: "127.0.0.1:7233" + +dcRedirectionPolicy: + 
policy: "noop" + toDC: "" + +archival: + history: + state: "enabled" + enableRead: true + provider: + filestore: + fileMode: "0666" + dirMode: "0766" + visibility: + state: "enabled" + enableRead: true + provider: + filestore: + fileMode: "0666" + dirMode: "0766" + +# FIXME archive workflows on S3: https://docs.temporal.io/docs/server-archive-data/ +namespaceDefaults: + archival: + history: + state: "enabled" + URI: "file:///var/lib/temporal/archival/temporal" + visibility: + state: "enabled" + URI: "file:///var/lib/temporal/archival/visibility" + +publicClient: + hostPort: "${MC_TEMPORAL_HOST_IP}:7233" + +dynamicConfigClient: + filepath: "/opt/temporal-server/config/dynamicconfig.yaml" + pollInterval: "60s" diff --git a/apps/temporal-server/docker-compose.tests.yml b/apps/temporal-server/docker-compose.tests.yml new file mode 100644 index 0000000000..1ab02ce88f --- /dev/null +++ b/apps/temporal-server/docker-compose.tests.yml @@ -0,0 +1,132 @@ +version: "3.7" + +services: + + # Service to use for testing the Temporal service + # + # Usage: + # + # host$ ./dev/run.py temporal-server bash + # container$ python3 + # + # ...and then submit a Temporal workflow somehow. 
+ # + temporal-server: + image: gcr.io/mcback/common:latest + init: true + stop_signal: SIGKILL + depends_on: + - temporal-server-actual + - temporal-webapp + + # Actual Temporal server, operating under "temporal-server" alias + temporal-server-actual: + image: gcr.io/mcback/temporal-server:latest + init: true + stop_signal: SIGKILL + depends_on: + - temporal-postgresql + - temporal-elasticsearch + networks: + default: + aliases: + - temporal-server + expose: + - 6933 + - 6934 + - 6935 + - 6939 + - 7233 + - 7234 + - 7235 + - 7239 + ports: + # Expose to host for debugging + - "6933:6933" + - "6934:6934" + - "6935:6935" + - "6939:6939" + - "7233:7233" + - "7234:7234" + - "7235:7235" + - "7239:7239" + volumes: + - type: bind + source: ./bin/ + target: /opt/temporal-server/bin/ + - type: bind + source: ./config/dynamicconfig.yaml + target: /opt/temporal-server/config/dynamicconfig.yaml + - type: bind + source: ./config/mediacloud_template.yaml + target: /opt/temporal-server/config/mediacloud_template.yaml + + temporal-postgresql: + image: gcr.io/mcback/temporal-postgresql:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - 5432 + ports: + # Expose to host for debugging + - "5432:5432" + volumes: + - type: bind + source: ./../temporal-postgresql/bin/ + target: /opt/temporal-postgresql/bin/ + - type: bind + source: ./../postgresql-base/conf/ + target: /etc/postgresql/11/main/ + + temporal-elasticsearch: + image: gcr.io/mcback/temporal-elasticsearch:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - "9200" + - "9300" + ports: + # Expose to host for debugging + - "9200:9200" + - "9300:9300" + volumes: + - type: bind + source: ./../elasticsearch-base/bin/elasticsearch.sh + target: /opt/elasticsearch/bin/elasticsearch.sh + # Not mounting config as it gets concatenated into a single file + # Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "2" + memory: "2G" + 
temporal-webapp: + image: gcr.io/mcback/temporal-webapp:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - "8088" + ports: + # Expose to host for debugging + - "8088:8088" + # Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "1" + memory: "2G" + +networks: + default: + attachable: true + ipam: + driver: default + config: + # Use same subnet as in production + - subnet: "10.1.0.0/16" diff --git a/apps/temporal-webapp/.dockerignore b/apps/temporal-webapp/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-webapp/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-webapp/Dockerfile b/apps/temporal-webapp/Dockerfile new file mode 100644 index 0000000000..cd338db5e5 --- /dev/null +++ b/apps/temporal-webapp/Dockerfile @@ -0,0 +1,85 @@ +# +# Temporal webapp +# + +FROM gcr.io/mcback/base:latest + +# FIXME +RUN apt-get -y update + +RUN \ + # + # Add NodeSource APT repository + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - && \ + echo "deb https://deb.nodesource.com/node_14.x focal main" \ + > /etc/apt/sources.list.d/nodesource.list && \ + apt-get -y update && \ + # + # Install Node.js + apt-get -y --no-install-recommends install nodejs && \ + # + true + +# FIXME Vue.js still gets built in development mode +ENV NODE_ENV=production \ + NPM_CONFIG_PRODUCTION=true \ + TEMPORAL_GRPC_ENDPOINT=temporal-server:7233 \ + TEMPORAL_PERMIT_WRITE_API=true + +RUN \ + # + # Install build dependencies + apt-get -y --no-install-recommends install git && \ + # + # Create target 
directory + mkdir -p /opt/temporal-webapp/ && \ + # + # Download Temporal webapp + # * We use Git instead of building a released package because we need + # the submodules for the build too; + # * We check out a specific commit hash instead of a version tag to prevent + # dependency confusion + # (https://medium.com/@alex.birsan/dependency-confusion-4a5d60fec610); + # * We do some extra trickery to do a shallow copy of just a single commit + # hash to save space + time (https://stackoverflow.com/a/43136160/200603); + # * Submodule is referred to as a SSH URI, so we need to make Git's SSH + # work first too. + # + cd /opt/temporal-webapp/ && \ + git init && \ + git remote add origin https://github.com/temporalio/web.git && \ + # HEAD of "v1.7.1" tag: + git fetch --depth 1 origin f5bfe968a5ebd6f6e2bd687355bb8b746dcf52c6 && \ + git checkout FETCH_HEAD && \ + # SSH checkout doesn't work with the build container's public key not + # registered with GitHub + sed -i 's/git@github.com:/https:\/\/github.com\//g' .gitmodules && \ + git submodule init && \ + git submodule sync && \ + git submodule update --init --recursive --depth 1 && \ + # + # Build the webapp + npm install --production && \ + npm run build-production && \ + # + # Remove build dependencies + apt-get -y remove git && \ + apt-get -y autoremove && \ + apt-get -y clean && \ + # + # Remove Git history as we won't need it + rm -rf .git/ && \ + # + # Add unprivileged user the service will run as + useradd -ms /bin/bash temporal && \ + # + true + +WORKDIR /opt/temporal-webapp/ + +# Webapp port +EXPOSE 8088 + +USER temporal + +CMD ["node", "server.js"] diff --git a/apps/tools/docker-compose.tests.yml b/apps/tools/docker-compose.tests.yml index 109d12142b..75fac4b678 100644 --- a/apps/tools/docker-compose.tests.yml +++ b/apps/tools/docker-compose.tests.yml @@ -45,7 +45,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: 
./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/topics-base/docker-compose.tests.yml b/apps/topics-base/docker-compose.tests.yml index 63dcc58b7d..e72597cc5d 100644 --- a/apps/topics-base/docker-compose.tests.yml +++ b/apps/topics-base/docker-compose.tests.yml @@ -93,7 +93,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/topics-extract-story-links/docker-compose.tests.yml b/apps/topics-extract-story-links/docker-compose.tests.yml index 80d21f6a60..136c527dce 100644 --- a/apps/topics-extract-story-links/docker-compose.tests.yml +++ b/apps/topics-extract-story-links/docker-compose.tests.yml @@ -75,5 +75,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/topics-fetch-link/docker-compose.tests.yml b/apps/topics-fetch-link/docker-compose.tests.yml index e5f84171bf..ff27233a1a 100644 --- a/apps/topics-fetch-link/docker-compose.tests.yml +++ b/apps/topics-fetch-link/docker-compose.tests.yml @@ -93,7 +93,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/topics-fetch-twitter-urls/docker-compose.tests.yml b/apps/topics-fetch-twitter-urls/docker-compose.tests.yml index 63fe1cd6d4..cff3365876 100644 --- a/apps/topics-fetch-twitter-urls/docker-compose.tests.yml +++ b/apps/topics-fetch-twitter-urls/docker-compose.tests.yml @@ -97,7 +97,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: 
./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/topics-map/docker-compose.tests.yml b/apps/topics-map/docker-compose.tests.yml index 82438fcb22..de0279d904 100644 --- a/apps/topics-map/docker-compose.tests.yml +++ b/apps/topics-map/docker-compose.tests.yml @@ -55,6 +55,6 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ diff --git a/apps/topics-mine/docker-compose.tests.yml b/apps/topics-mine/docker-compose.tests.yml index ed2e90a371..6d90fea73e 100644 --- a/apps/topics-mine/docker-compose.tests.yml +++ b/apps/topics-mine/docker-compose.tests.yml @@ -112,7 +112,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ rabbitmq-server: diff --git a/apps/topics-snapshot/docker-compose.tests.yml b/apps/topics-snapshot/docker-compose.tests.yml index d0dbe3db4c..b192ca1ef9 100644 --- a/apps/topics-snapshot/docker-compose.tests.yml +++ b/apps/topics-snapshot/docker-compose.tests.yml @@ -97,7 +97,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/webapp-api/docker-compose.tests.yml b/apps/webapp-api/docker-compose.tests.yml index 5f2963bc66..ff29f1e7f7 100644 --- a/apps/webapp-api/docker-compose.tests.yml +++ b/apps/webapp-api/docker-compose.tests.yml @@ -69,7 +69,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/webapp-httpd/docker-compose.tests.yml 
b/apps/webapp-httpd/docker-compose.tests.yml index 4569acad28..4a3050f291 100644 --- a/apps/webapp-httpd/docker-compose.tests.yml +++ b/apps/webapp-httpd/docker-compose.tests.yml @@ -98,7 +98,7 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ solr-shard-01: diff --git a/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml b/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml index 294d99757a..760d8f78c8 100644 --- a/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml +++ b/apps/word2vec-generate-snapshot-model/docker-compose.tests.yml @@ -49,5 +49,5 @@ services: source: ./../postgresql-server/schema/ target: /opt/mediacloud/schema/ - type: bind - source: ./../postgresql-server/conf/ + source: ./../postgresql-base/conf/ target: /etc/postgresql/11/main/ From 3823e8e814216e2bb7c9bd754864e4bc40ec547c Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 25 Mar 2021 21:04:41 +0200 Subject: [PATCH 033/175] Add some Temporal setup fixes --- apps/elasticsearch-base/Dockerfile | 4 ++-- apps/elk-elasticsearch/Dockerfile | 3 +++ apps/temporal-elasticsearch/Dockerfile | 3 +++ apps/temporal-server/bin/temporal.sh | 9 +++++---- apps/temporal-server/config/mediacloud_template.yaml | 10 +++++----- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/apps/elasticsearch-base/Dockerfile b/apps/elasticsearch-base/Dockerfile index e8e07fdd8b..07207b0d51 100644 --- a/apps/elasticsearch-base/Dockerfile +++ b/apps/elasticsearch-base/Dockerfile @@ -63,7 +63,7 @@ EXPOSE 9200 # Elasticsearch TCP transport EXPOSE 9300 -# Elasticsearch data -VOLUME /var/lib/elasticsearch +# No "VOLUME /var/lib/elasticsearch" here because sub-images might want to +# pre-init the volume with some data CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] diff --git a/apps/elk-elasticsearch/Dockerfile 
b/apps/elk-elasticsearch/Dockerfile index 9e9a5595e6..754ccb43b6 100644 --- a/apps/elk-elasticsearch/Dockerfile +++ b/apps/elk-elasticsearch/Dockerfile @@ -31,4 +31,7 @@ RUN \ USER elasticsearch +# Elasticsearch data +VOLUME /var/lib/elasticsearch + CMD ["/opt/elasticsearch/bin/elk-elasticsearch.sh"] diff --git a/apps/temporal-elasticsearch/Dockerfile b/apps/temporal-elasticsearch/Dockerfile index ee90d5bca6..f9dd3de72c 100644 --- a/apps/temporal-elasticsearch/Dockerfile +++ b/apps/temporal-elasticsearch/Dockerfile @@ -29,4 +29,7 @@ USER root RUN rm /index_template.json /setup_index_template.sh USER elasticsearch +# Elasticsearch data +VOLUME /var/lib/elasticsearch + CMD ["/opt/elasticsearch/bin/elasticsearch.sh"] diff --git a/apps/temporal-server/bin/temporal.sh b/apps/temporal-server/bin/temporal.sh index f103074fd6..4caa3890e9 100755 --- a/apps/temporal-server/bin/temporal.sh +++ b/apps/temporal-server/bin/temporal.sh @@ -32,12 +32,12 @@ while true; do done register_default_namespace() { - echo "Registering default namespace: $DEFAULT_NAMESPACE" + echo "Registering default namespace" until tctl --ns default namespace describe < /dev/null; do echo "Default namespace not found. Creating..." - sleep 1 - # FIXME doesn't work + sleep 0.2 # FIXME retention period super short + # FIXME doesn't work a few seconds after getting created tctl --ns default namespace register --rd 1 --desc "Default namespace for Temporal Server" || echo "Creating default namespace failed." done echo "Default namespace registration complete." 
@@ -50,7 +50,8 @@ if [ -e /var/lib/temporal/first_run ]; then rm /var/lib/temporal/first_run fi -exec temporal-server \ +# No "exec" because default namespace gets registered in the background +temporal-server \ --root /opt/temporal-server \ --env mediacloud \ start diff --git a/apps/temporal-server/config/mediacloud_template.yaml b/apps/temporal-server/config/mediacloud_template.yaml index e872217ba6..21358e1d99 100644 --- a/apps/temporal-server/config/mediacloud_template.yaml +++ b/apps/temporal-server/config/mediacloud_template.yaml @@ -6,7 +6,7 @@ log: persistence: # FIXME is this right? - numHistoryShards: 4096 + numHistoryShards: 16 defaultStore: default visibilityStore: visibility advancedVisibilityStore: es-visibility @@ -119,10 +119,10 @@ global: # hostPort: "temporal-statsd:8125" # prefix: "temporal" - metrics: - prometheus: - timerType: "histogram" - listenAddress: "temporal-prometheus:9090" + # metrics: + # prometheus: + # timerType: "histogram" + # listenAddress: "temporal-prometheus:9090" services: frontend: From 725e2d33b4bec7005c3306b339f36ce8cbccbeee Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 25 Mar 2021 21:26:22 +0200 Subject: [PATCH 034/175] Fix typo --- .../bin/apply_migrations.sh | 3 +- apps/temporal-postgresql/schemas/temporal.sql | 1038 +++++++++++++++++ .../schemas/temporal_visibility.sql | 181 +++ 3 files changed, 1220 insertions(+), 2 deletions(-) create mode 100644 apps/temporal-postgresql/schemas/temporal.sql create mode 100644 apps/temporal-postgresql/schemas/temporal_visibility.sql diff --git a/apps/temporal-postgresql/bin/apply_migrations.sh b/apps/temporal-postgresql/bin/apply_migrations.sh index fa9a9f8b94..d44cda5907 100755 --- a/apps/temporal-postgresql/bin/apply_migrations.sh +++ b/apps/temporal-postgresql/bin/apply_migrations.sh @@ -28,8 +28,7 @@ TSQL="temporal-sql-tool \ --ep 127.0.0.1 \ -p 12345 \ -u temporal \ - --pw temporal \ -" + --pw temporal" MAIN_SCHEMA_DIR="${SCHEMAS_DIR}/temporal/versioned" $TSQL --db 
temporal update-schema -d "${MAIN_SCHEMA_DIR}" diff --git a/apps/temporal-postgresql/schemas/temporal.sql b/apps/temporal-postgresql/schemas/temporal.sql new file mode 100644 index 0000000000..d32bb8cf1e --- /dev/null +++ b/apps/temporal-postgresql/schemas/temporal.sql @@ -0,0 +1,1038 @@ +-- +-- PostgreSQL database dump +-- + +-- Dumped from database version 11.11 (Ubuntu 11.11-1.pgdg20.04+1) +-- Dumped by pg_dump version 11.11 (Ubuntu 11.11-1.pgdg20.04+1) + +SET statement_timeout = 0; +SET lock_timeout = 0; +SET idle_in_transaction_session_timeout = 0; +SET client_encoding = 'UTF8'; +SET standard_conforming_strings = on; +SELECT pg_catalog.set_config('search_path', '', false); +SET check_function_bodies = false; +SET xmloption = content; +SET client_min_messages = warning; +SET row_security = off; + +SET default_tablespace = ''; + +SET default_with_oids = false; + +-- +-- Name: activity_info_maps; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.activity_info_maps ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + schedule_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) +); + + +ALTER TABLE public.activity_info_maps OWNER TO temporal; + +-- +-- Name: buffered_events; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.buffered_events ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.buffered_events OWNER TO temporal; + +-- +-- Name: buffered_events_id_seq; Type: SEQUENCE; Schema: public; Owner: temporal +-- + +CREATE SEQUENCE public.buffered_events_id_seq + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1; + + +ALTER TABLE public.buffered_events_id_seq OWNER TO temporal; + +-- +-- Name: 
buffered_events_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: temporal +-- + +ALTER SEQUENCE public.buffered_events_id_seq OWNED BY public.buffered_events.id; + + +-- +-- Name: child_execution_info_maps; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.child_execution_info_maps ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + initiated_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) +); + + +ALTER TABLE public.child_execution_info_maps OWNER TO temporal; + +-- +-- Name: cluster_membership; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.cluster_membership ( + membership_partition integer NOT NULL, + host_id bytea NOT NULL, + rpc_address character varying(15) NOT NULL, + rpc_port smallint NOT NULL, + role smallint NOT NULL, + session_start timestamp without time zone DEFAULT '1970-01-01 00:00:01'::timestamp without time zone, + last_heartbeat timestamp without time zone DEFAULT '1970-01-01 00:00:01'::timestamp without time zone, + record_expiry timestamp without time zone DEFAULT '1970-01-01 00:00:01'::timestamp without time zone +); + + +ALTER TABLE public.cluster_membership OWNER TO temporal; + +-- +-- Name: cluster_metadata; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.cluster_metadata ( + metadata_partition integer NOT NULL, + data bytea DEFAULT '\x'::bytea NOT NULL, + data_encoding character varying(16) DEFAULT 'Proto3'::character varying NOT NULL, + version bigint DEFAULT 1 NOT NULL +); + + +ALTER TABLE public.cluster_metadata OWNER TO temporal; + +-- +-- Name: current_executions; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.current_executions ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + create_request_id character varying(64) NOT NULL, + state 
integer NOT NULL, + status integer NOT NULL, + start_version bigint NOT NULL, + last_write_version bigint NOT NULL +); + + +ALTER TABLE public.current_executions OWNER TO temporal; + +-- +-- Name: executions; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.executions ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + next_event_id bigint NOT NULL, + last_write_version bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL, + state bytea NOT NULL, + state_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.executions OWNER TO temporal; + +-- +-- Name: history_node; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.history_node ( + shard_id integer NOT NULL, + tree_id bytea NOT NULL, + branch_id bytea NOT NULL, + node_id bigint NOT NULL, + txn_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.history_node OWNER TO temporal; + +-- +-- Name: history_tree; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.history_tree ( + shard_id integer NOT NULL, + tree_id bytea NOT NULL, + branch_id bytea NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.history_tree OWNER TO temporal; + +-- +-- Name: namespace_metadata; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.namespace_metadata ( + partition_id integer NOT NULL, + notification_version bigint NOT NULL +); + + +ALTER TABLE public.namespace_metadata OWNER TO temporal; + +-- +-- Name: namespaces; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.namespaces ( + partition_id integer NOT NULL, + id bytea NOT NULL, + name character varying(255) NOT NULL, + notification_version bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL, + 
is_global boolean NOT NULL +); + + +ALTER TABLE public.namespaces OWNER TO temporal; + +-- +-- Name: queue; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.queue ( + queue_type integer NOT NULL, + message_id bigint NOT NULL, + message_payload bytea NOT NULL, + message_encoding character varying(16) DEFAULT 'Json'::character varying NOT NULL +); + + +ALTER TABLE public.queue OWNER TO temporal; + +-- +-- Name: queue_metadata; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.queue_metadata ( + queue_type integer NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) DEFAULT 'Json'::character varying NOT NULL +); + + +ALTER TABLE public.queue_metadata OWNER TO temporal; + +-- +-- Name: replication_tasks; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.replication_tasks ( + shard_id integer NOT NULL, + task_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.replication_tasks OWNER TO temporal; + +-- +-- Name: replication_tasks_dlq; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.replication_tasks_dlq ( + source_cluster_name character varying(255) NOT NULL, + shard_id integer NOT NULL, + task_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.replication_tasks_dlq OWNER TO temporal; + +-- +-- Name: request_cancel_info_maps; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.request_cancel_info_maps ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + initiated_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) +); + + +ALTER TABLE public.request_cancel_info_maps OWNER TO temporal; + +-- +-- Name: schema_update_history; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE 
public.schema_update_history ( + version_partition integer NOT NULL, + year integer NOT NULL, + month integer NOT NULL, + update_time timestamp without time zone NOT NULL, + description character varying(255), + manifest_md5 character varying(64), + new_version character varying(64), + old_version character varying(64) +); + + +ALTER TABLE public.schema_update_history OWNER TO temporal; + +-- +-- Name: schema_version; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.schema_version ( + version_partition integer NOT NULL, + db_name character varying(255) NOT NULL, + creation_time timestamp without time zone, + curr_version character varying(64), + min_compatible_version character varying(64) +); + + +ALTER TABLE public.schema_version OWNER TO temporal; + +-- +-- Name: shards; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.shards ( + shard_id integer NOT NULL, + range_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.shards OWNER TO temporal; + +-- +-- Name: signal_info_maps; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.signal_info_maps ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + initiated_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) +); + + +ALTER TABLE public.signal_info_maps OWNER TO temporal; + +-- +-- Name: signals_requested_sets; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.signals_requested_sets ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + signal_id character varying(64) NOT NULL +); + + +ALTER TABLE public.signals_requested_sets OWNER TO temporal; + +-- +-- Name: task_queues; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.task_queues ( + range_hash 
bigint NOT NULL, + task_queue_id bytea NOT NULL, + range_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.task_queues OWNER TO temporal; + +-- +-- Name: tasks; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.tasks ( + range_hash bigint NOT NULL, + task_queue_id bytea NOT NULL, + task_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.tasks OWNER TO temporal; + +-- +-- Name: timer_info_maps; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.timer_info_maps ( + shard_id integer NOT NULL, + namespace_id bytea NOT NULL, + workflow_id character varying(255) NOT NULL, + run_id bytea NOT NULL, + timer_id character varying(255) NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) +); + + +ALTER TABLE public.timer_info_maps OWNER TO temporal; + +-- +-- Name: timer_tasks; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.timer_tasks ( + shard_id integer NOT NULL, + visibility_timestamp timestamp without time zone NOT NULL, + task_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.timer_tasks OWNER TO temporal; + +-- +-- Name: transfer_tasks; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.transfer_tasks ( + shard_id integer NOT NULL, + task_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.transfer_tasks OWNER TO temporal; + +-- +-- Name: visibility_tasks; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.visibility_tasks ( + shard_id integer NOT NULL, + task_id bigint NOT NULL, + data bytea NOT NULL, + data_encoding character varying(16) NOT NULL +); + + +ALTER TABLE public.visibility_tasks OWNER TO temporal; + +-- +-- Name: buffered_events id; Type: DEFAULT; Schema: public; 
Owner: temporal +-- + +ALTER TABLE ONLY public.buffered_events ALTER COLUMN id SET DEFAULT nextval('public.buffered_events_id_seq'::regclass); + + +-- +-- Data for Name: activity_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.activity_info_maps (shard_id, namespace_id, workflow_id, run_id, schedule_id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: buffered_events; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.buffered_events (shard_id, namespace_id, workflow_id, run_id, id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: child_execution_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.child_execution_info_maps (shard_id, namespace_id, workflow_id, run_id, initiated_id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: cluster_membership; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.cluster_membership (membership_partition, host_id, rpc_address, rpc_port, role, session_start, last_heartbeat, record_expiry) FROM stdin; +0 \\xd343f2848d9b11eba18c02420a010005 10.1.0.5 6933 1 2021-03-25 18:56:36.884362 2021-03-25 18:57:25.896779 2021-03-27 18:57:25.896779 +0 \\xd346e1178d9b11eba18c02420a010005 10.1.0.5 6935 3 2021-03-25 18:56:36.908823 2021-03-25 18:57:28.931428 2021-03-27 18:57:28.931428 +0 \\xd34bfa858d9b11eba18c02420a010005 10.1.0.5 6939 4 2021-03-25 18:56:36.936065 2021-03-25 18:57:36.950967 2021-03-27 18:57:36.950967 +0 \\xd3454ab18d9b11eba18c02420a010005 10.1.0.5 6934 2 2021-03-25 18:56:36.892785 2021-03-25 18:57:38.910999 2021-03-27 18:57:38.910999 +\. + + +-- +-- Data for Name: cluster_metadata; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.cluster_metadata (metadata_partition, data, data_encoding, version) FROM stdin; +0 \\x0a0661637469766510101a2432323465333031362d636566362d343963642d393565302d663962656330306438653934 Proto3 1 +\. 
+ + +-- +-- Data for Name: current_executions; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.current_executions (shard_id, namespace_id, workflow_id, run_id, create_request_id, state, status, start_version, last_write_version) FROM stdin; +8 \\x32049b68787240948e63d0dd59896a83 temporal-sys-tq-scanner \\x8b5622d072f44c068c5e4b04d4b3a410 766425e5-f5f5-4743-ba90-8ca7ebdcdfb7 1 10 0 +\. + + +-- +-- Data for Name: executions; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.executions (shard_id, namespace_id, workflow_id, run_id, next_event_id, last_write_version, data, data_encoding, state, state_encoding) FROM stdin; +8 \\x32049b68787240948e63d0dd59896a83 temporal-sys-tq-scanner \\x8b5622d072f44c068c5e4b04d4b3a410 2 0 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65724a2374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30522074656d706f72616c2d7379732d74712d7363616e6e65722d776f726b666c6f775a0062040880af1a6a02080a8801808040900101a2010c08e8b9f3820610ebfaefc803aa010c08e8b9f3820610ebfaefc803ca0100d00101fa0109656d70747955756964980201da020c30202a2f3132202a202a202ab2035412520a4c0a2438623536323264302d373266342d346330362d386335652d346230346434623361343130122438633465363161342d386136312d346263372d613234362d32373039363532303138326212020801ba032438623536323264302d373266342d346330362d386335652d346230346434623361343130c2030308f201ca030c0880f78e830610ebfaefc803 Proto3 \\x0a2437363634323565352d663566352d343734332d626139302d386361376562646364666237122438623536323264302d373266342d346330362d386335652d34623034643462336134313018012001 Proto3 +\. 
+ + +-- +-- Data for Name: history_node; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.history_node (shard_id, tree_id, branch_id, node_id, txn_id, data, data_encoding) FROM stdin; +8 \\x8b5622d072f44c068c5e4b04d4b3a410 \\x8c4e61a48a614bc7a24627096520182b 1 -1048577 \\x0aef010801120c08e8b9f3820610ebfaefc80318012880804032d6010a220a2074656d706f72616c2d7379732d74712d7363616e6e65722d776f726b666c6f772a270a2374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3010013a0042040880af1a4a02080a5803722438623536323264302d373266342d346330362d386335652d3462303464346233613431307a103435406561346434626234623434314082012438623536323264302d373266342d346330362d386335652d346230346434623361343130900101a2010c30202a2f3132202a202a202aaa010408988e01ca0100 Proto3 +\. + + +-- +-- Data for Name: history_tree; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.history_tree (shard_id, tree_id, branch_id, data, data_encoding) FROM stdin; +8 \\x8b5622d072f44c068c5e4b04d4b3a410 \\x8c4e61a48a614bc7a24627096520182b \\x0a4c0a2438623536323264302d373266342d346330362d386335652d346230346434623361343130122438633465363161342d386136312d346263372d613234362d323730393635323031383262120c08e8b9f3820610c89790c9031a6133323034396236382d373837322d343039342d386536332d6430646435393839366138333a74656d706f72616c2d7379732d74712d7363616e6e65723a38623536323264302d373266342d346330362d386335652d346230346434623361343130 Proto3 +\. + + +-- +-- Data for Name: namespace_metadata; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.namespace_metadata (partition_id, notification_version) FROM stdin; +54321 3 +\. 
+ + +-- +-- Data for Name: namespaces; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.namespaces (partition_id, id, name, notification_version, data, data_encoding, is_global) FROM stdin; +54321 \\x32049b68787240948e63d0dd59896a83 temporal-system 1 \\x0a780a2433323034396236382d373837322d343039342d386536332d64306464353938393661383310011a0f74656d706f72616c2d73797374656d222254656d706f72616c20696e7465726e616c2073797374656d206e616d6573706163652a1974656d706f72616c2d636f72654074656d706f72616c2e696f120a0a040880f524200130011a100a06616374697665120661637469766528ffffffffffffffffff01 Proto3 f +54321 \\x26f80ced0ead4534b085ff8c0643031c default 2 \\x0a580a2432366638306365642d306561642d343533342d623038352d66663863303634333033316310011a0764656661756c74222544656661756c74206e616d65737061636520666f722054656d706f72616c2053657276657212660a040880a3051a0020022a2a66696c653a2f2f2f7661722f6c69622f74656d706f72616c2f617263686976616c2f74656d706f72616c30023a2c66696c653a2f2f2f7661722f6c69622f74656d706f72616c2f617263686976616c2f7669736962696c6974791a100a066163746976651206616374697665 Proto3 f +\. + + +-- +-- Data for Name: queue; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.queue (queue_type, message_id, message_payload, message_encoding) FROM stdin; +\. + + +-- +-- Data for Name: queue_metadata; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.queue_metadata (queue_type, data, data_encoding) FROM stdin; +1 \\x7b7d Json +-1 \\x7b7d Json +\. + + +-- +-- Data for Name: replication_tasks; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.replication_tasks (shard_id, task_id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: replication_tasks_dlq; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.replication_tasks_dlq (source_cluster_name, shard_id, task_id, data, data_encoding) FROM stdin; +\. 
+ + +-- +-- Data for Name: request_cancel_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.request_cancel_info_maps (shard_id, namespace_id, workflow_id, run_id, initiated_id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: schema_update_history; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.schema_update_history (version_partition, year, month, update_time, description, manifest_md5, new_version, old_version) FROM stdin; +0 2021 3 2021-03-21 23:09:17.543434 initial version 0.0 0 +0 2021 3 2021-03-21 23:09:17.807128 base version of schema 55b84ca114ac34d84bdc5f52c198fa33 1.0 0.0 +0 2021 3 2021-03-21 23:09:17.80979 schema update for cluster metadata 58f06841bbb187cb210db32a090c21ee 1.1 1.0 +0 2021 3 2021-03-21 23:09:17.811408 schema update for RPC replication c6bdeea21882e2625038927a84929b16 1.2 1.1 +0 2021 3 2021-03-21 23:09:17.815148 schema update for kafka deprecation 3beee7d470421674194475f94b58d89b 1.3 1.2 +0 2021 3 2021-03-21 23:09:17.816468 schema update for cluster metadata cleanup c53e2e9cea5660c8a1f3b2ac73cdb138 1.4 1.3 +\. + + +-- +-- Data for Name: schema_version; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.schema_version (version_partition, db_name, creation_time, curr_version, min_compatible_version) FROM stdin; +0 temporal 2021-03-21 23:09:17.816194 1.4 1.0 +\. 
+ + +-- +-- Data for Name: shards; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.shards (shard_id, range_id, data, data_encoding) FROM stdin; +1 1 \\x080110011a0d31302e312e302e353a3732333430014802 Proto3 +3 1 \\x080310011a0d31302e312e302e353a3732333430014802 Proto3 +7 1 \\x080710011a0d31302e312e302e353a3732333430014802 Proto3 +10 1 \\x080a10011a0d31302e312e302e353a3732333430014802 Proto3 +5 1 \\x080510011a0d31302e312e302e353a3732333430014802 Proto3 +11 1 \\x080b10011a0d31302e312e302e353a3732333430014802 Proto3 +9 1 \\x080910011a0d31302e312e302e353a3732333430014802 Proto3 +6 1 \\x080610011a0d31302e312e302e353a3732333430014802 Proto3 +8 1 \\x080810011a0d31302e312e302e353a3732333430014802 Proto3 +2 1 \\x080210011a0d31302e312e302e353a3732333430014802 Proto3 +14 1 \\x080e10011a0d31302e312e302e353a3732333430014802 Proto3 +16 1 \\x081010011a0d31302e312e302e353a3732333430014802 Proto3 +4 1 \\x080410011a0d31302e312e302e353a3732333430014802 Proto3 +13 1 \\x080d10011a0d31302e312e302e353a3732333430014802 Proto3 +12 1 \\x080c10011a0d31302e312e302e353a3732333430014802 Proto3 +15 1 \\x080f10011a0d31302e312e302e353a3732333430014802 Proto3 +\. + + +-- +-- Data for Name: signal_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.signal_info_maps (shard_id, namespace_id, workflow_id, run_id, initiated_id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: signals_requested_sets; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.signals_requested_sets (shard_id, namespace_id, workflow_id, run_id, signal_id) FROM stdin; +\. 
+ + +-- +-- Data for Name: task_queues; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.task_queues (range_hash, task_queue_id, range_id, data, data_encoding) FROM stdin; +1502813099 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a66643265396231612d623363362d343662392d386333352d38343130363233323937613701 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a66643265396231612d623363362d343662392d386333352d38343130363233323937613718012002320b08a1ddf8820610a6a680223a0b08a1baf3820610b89f8022 Proto3 +940296977 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3101 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f31180120013a0b08a1baf3820610dc8bc822 Proto3 +653791233 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3102 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f31180220013a0b08a1baf3820610d194da22 Proto3 +1410825331 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3002 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30180220013a0b08a1baf3820610c28d8623 Proto3 +3095716534 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d626174636865722d7461736b717565756502 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121e74656d706f72616c2d7379732d626174636865722d7461736b7175657565180220013a0b08a1baf3820610f2ec8e23 Proto3 +2063506710 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d617263686976616c2d747101 1 
\\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121474656d706f72616c2d617263686976616c2d7471180120013a0b08a1baf38206109cd1ca23 Proto3 +1994493189 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a61616336633234302d393530662d343065332d396130352d38353531313136653639373301 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a61616336633234302d393530662d343065332d396130352d38353531313136653639373318012002320b08a1ddf8820610c2dffd223a0b08a1baf382061095d6fd22 Proto3 +4095103286 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3102 1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f31180220013a0b08a1baf38206109bcfeb23 Proto3 +1332228876 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a36326232346637372d393431352d343336342d616534332d31663134366537316561326601 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a36326232346637372d393431352d343336342d616534332d31663134366537316561326618012002320b08a1ddf8820610fa8293243a0b08a1baf3820610fefa9224 Proto3 +3469555445 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d626174636865722d7461736b717565756501 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121e74656d706f72616c2d7379732d626174636865722d7461736b7175657565180120013a0b08a1baf3820610dfaee824 Proto3 +1018868252 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3202 
1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f32180220013a0b08a1baf3820610cbab8c24 Proto3 +2528910666 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c69637901 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122a74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c696379180120013a0b08a1baf382061081b68324 Proto3 +3617866575 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3201 1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f32180120013a0b08a1baf3820610efcdba27 Proto3 +1177789365 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3302 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f33180220013a0b08a0baf3820610dad79721 Proto3 +3681167674 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3202 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f32180220013a0b08a0baf3820610b1849221 Proto3 +2662461164 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3102 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f31180220013a0b08a0baf38206109fd4a922 Proto3 +289827042 
\\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3102 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f31180220013a0b08a0baf3820610e9d7b922 Proto3 +430461988 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3301 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f33180120013a0b08a0baf3820610d493bd22 Proto3 +740397391 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3301 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f33180120013a0b08a0baf382061087d6b023 Proto3 +1693213168 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3302 1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f33180220013a0b08a0baf3820610ddb1fb23 Proto3 +739332331 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3201 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f32180120013a0b08a0baf3820610dde58024 Proto3 +481579558 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a30636231303935612d366365342d343133332d623631652d65366137323333313833336201 1 
\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a30636231303935612d366365342d343133332d623631652d65366137323333313833336218012002320b08a1ddf8820610f89fee1d3a0b08a1baf38206108499ee1d Proto3 +70786998 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3101 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f31180120013a0b08a1baf3820610fabec31f Proto3 +2358430835 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3202 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f32180220013a0b08a1baf3820610efe1eb1f Proto3 +3662736275 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3302 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f33180220013a0b08a1baf38206108f8eef1f Proto3 +3963122975 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3202 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f32180220013a0b08a1baf38206108cb38521 Proto3 +4214421317 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3001 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30180120013a0b08a1baf382061080949521 Proto3 +3597285451 
\\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3101 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f31180120013a0b08a1baf3820610909c9823 Proto3 +1688886821 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c69637902 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122a74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c696379180220013a0b08a1baf382061081f6e824 Proto3 +288707420 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d617263686976616c2d747102 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121474656d706f72616c2d617263686976616c2d7471180220013a0b08a1baf38206108190e924 Proto3 +\. + + +-- +-- Data for Name: tasks; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.tasks (range_hash, task_queue_id, task_id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: timer_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.timer_info_maps (shard_id, namespace_id, workflow_id, run_id, timer_id, data, data_encoding) FROM stdin; +\. 
+ + +-- +-- Data for Name: timer_tasks; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.timer_tasks (shard_id, visibility_timestamp, task_id, data, data_encoding) FROM stdin; +8 2021-03-31 00:00:00.958136 1048579 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65721a2438623536323264302d373266342d346330362d386335652d346230346434623361343130200f508380405a0c0880f78e830610ebfaefc803 Proto3 +8 2021-03-26 00:00:00.958136 1048580 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65721a2438623536323264302d373266342d346330362d386335652d34623034643462336134313020123002508480405a0c0880c8f4820610ebfaefc803 Proto3 +\. + + +-- +-- Data for Name: transfer_tasks; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.transfer_tasks (shard_id, task_id, data, data_encoding) FROM stdin; +\. + + +-- +-- Data for Name: visibility_tasks; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.visibility_tasks (shard_id, task_id, data, data_encoding) FROM stdin; +\. 
+ + +-- +-- Name: buffered_events_id_seq; Type: SEQUENCE SET; Schema: public; Owner: temporal +-- + +SELECT pg_catalog.setval('public.buffered_events_id_seq', 1, false); + + +-- +-- Name: activity_info_maps activity_info_maps_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.activity_info_maps + ADD CONSTRAINT activity_info_maps_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, schedule_id); + + +-- +-- Name: buffered_events buffered_events_id_key; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.buffered_events + ADD CONSTRAINT buffered_events_id_key UNIQUE (id); + + +-- +-- Name: buffered_events buffered_events_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.buffered_events + ADD CONSTRAINT buffered_events_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, id); + + +-- +-- Name: child_execution_info_maps child_execution_info_maps_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.child_execution_info_maps + ADD CONSTRAINT child_execution_info_maps_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, initiated_id); + + +-- +-- Name: cluster_membership cluster_membership_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.cluster_membership + ADD CONSTRAINT cluster_membership_pkey PRIMARY KEY (membership_partition, host_id); + + +-- +-- Name: cluster_metadata cluster_metadata_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.cluster_metadata + ADD CONSTRAINT cluster_metadata_pkey PRIMARY KEY (metadata_partition); + + +-- +-- Name: current_executions current_executions_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.current_executions + ADD CONSTRAINT current_executions_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id); + + +-- +-- Name: executions executions_pkey; Type: 
CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.executions + ADD CONSTRAINT executions_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id); + + +-- +-- Name: history_node history_node_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.history_node + ADD CONSTRAINT history_node_pkey PRIMARY KEY (shard_id, tree_id, branch_id, node_id, txn_id); + + +-- +-- Name: history_tree history_tree_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.history_tree + ADD CONSTRAINT history_tree_pkey PRIMARY KEY (shard_id, tree_id, branch_id); + + +-- +-- Name: namespace_metadata namespace_metadata_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.namespace_metadata + ADD CONSTRAINT namespace_metadata_pkey PRIMARY KEY (partition_id); + + +-- +-- Name: namespaces namespaces_name_key; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.namespaces + ADD CONSTRAINT namespaces_name_key UNIQUE (name); + + +-- +-- Name: namespaces namespaces_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.namespaces + ADD CONSTRAINT namespaces_pkey PRIMARY KEY (partition_id, id); + + +-- +-- Name: queue_metadata queue_metadata_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.queue_metadata + ADD CONSTRAINT queue_metadata_pkey PRIMARY KEY (queue_type); + + +-- +-- Name: queue queue_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.queue + ADD CONSTRAINT queue_pkey PRIMARY KEY (queue_type, message_id); + + +-- +-- Name: replication_tasks_dlq replication_tasks_dlq_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.replication_tasks_dlq + ADD CONSTRAINT replication_tasks_dlq_pkey PRIMARY KEY (source_cluster_name, shard_id, task_id); + + +-- +-- Name: replication_tasks replication_tasks_pkey; 
Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.replication_tasks + ADD CONSTRAINT replication_tasks_pkey PRIMARY KEY (shard_id, task_id); + + +-- +-- Name: request_cancel_info_maps request_cancel_info_maps_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.request_cancel_info_maps + ADD CONSTRAINT request_cancel_info_maps_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, initiated_id); + + +-- +-- Name: schema_update_history schema_update_history_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.schema_update_history + ADD CONSTRAINT schema_update_history_pkey PRIMARY KEY (version_partition, year, month, update_time); + + +-- +-- Name: schema_version schema_version_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.schema_version + ADD CONSTRAINT schema_version_pkey PRIMARY KEY (version_partition, db_name); + + +-- +-- Name: shards shards_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.shards + ADD CONSTRAINT shards_pkey PRIMARY KEY (shard_id); + + +-- +-- Name: signal_info_maps signal_info_maps_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.signal_info_maps + ADD CONSTRAINT signal_info_maps_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, initiated_id); + + +-- +-- Name: signals_requested_sets signals_requested_sets_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.signals_requested_sets + ADD CONSTRAINT signals_requested_sets_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, signal_id); + + +-- +-- Name: task_queues task_queues_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.task_queues + ADD CONSTRAINT task_queues_pkey PRIMARY KEY (range_hash, task_queue_id); + + +-- +-- Name: tasks tasks_pkey; Type: CONSTRAINT; Schema: public; 
Owner: temporal +-- + +ALTER TABLE ONLY public.tasks + ADD CONSTRAINT tasks_pkey PRIMARY KEY (range_hash, task_queue_id, task_id); + + +-- +-- Name: timer_info_maps timer_info_maps_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.timer_info_maps + ADD CONSTRAINT timer_info_maps_pkey PRIMARY KEY (shard_id, namespace_id, workflow_id, run_id, timer_id); + + +-- +-- Name: timer_tasks timer_tasks_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.timer_tasks + ADD CONSTRAINT timer_tasks_pkey PRIMARY KEY (shard_id, visibility_timestamp, task_id); + + +-- +-- Name: transfer_tasks transfer_tasks_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.transfer_tasks + ADD CONSTRAINT transfer_tasks_pkey PRIMARY KEY (shard_id, task_id); + + +-- +-- Name: visibility_tasks visibility_tasks_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.visibility_tasks + ADD CONSTRAINT visibility_tasks_pkey PRIMARY KEY (shard_id, task_id); + + +-- +-- Name: cm_idx_lasthb; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX cm_idx_lasthb ON public.cluster_membership USING btree (last_heartbeat); + + +-- +-- Name: cm_idx_recordexpiry; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX cm_idx_recordexpiry ON public.cluster_membership USING btree (record_expiry); + + +-- +-- Name: cm_idx_rolehost; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE UNIQUE INDEX cm_idx_rolehost ON public.cluster_membership USING btree (role, host_id); + + +-- +-- Name: cm_idx_rolelasthb; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX cm_idx_rolelasthb ON public.cluster_membership USING btree (role, last_heartbeat); + + +-- +-- Name: cm_idx_rpchost; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX cm_idx_rpchost ON public.cluster_membership USING btree (rpc_address, role); + + +-- +-- PostgreSQL database 
dump complete +-- + diff --git a/apps/temporal-postgresql/schemas/temporal_visibility.sql b/apps/temporal-postgresql/schemas/temporal_visibility.sql new file mode 100644 index 0000000000..cca1ec456c --- /dev/null +++ b/apps/temporal-postgresql/schemas/temporal_visibility.sql @@ -0,0 +1,181 @@ +-- +-- PostgreSQL database dump +-- + +-- Dumped from database version 11.11 (Ubuntu 11.11-1.pgdg20.04+1) +-- Dumped by pg_dump version 11.11 (Ubuntu 11.11-1.pgdg20.04+1) + +SET statement_timeout = 0; +SET lock_timeout = 0; +SET idle_in_transaction_session_timeout = 0; +SET client_encoding = 'UTF8'; +SET standard_conforming_strings = on; +SELECT pg_catalog.set_config('search_path', '', false); +SET check_function_bodies = false; +SET xmloption = content; +SET client_min_messages = warning; +SET row_security = off; + +SET default_tablespace = ''; + +SET default_with_oids = false; + +-- +-- Name: executions_visibility; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.executions_visibility ( + namespace_id character(64) NOT NULL, + run_id character(64) NOT NULL, + start_time timestamp without time zone NOT NULL, + execution_time timestamp without time zone NOT NULL, + workflow_id character varying(255) NOT NULL, + workflow_type_name character varying(255) NOT NULL, + status integer NOT NULL, + close_time timestamp without time zone, + history_length bigint, + memo bytea, + encoding character varying(64) NOT NULL, + task_queue character varying(255) DEFAULT ''::character varying NOT NULL +); + + +ALTER TABLE public.executions_visibility OWNER TO temporal; + +-- +-- Name: schema_update_history; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.schema_update_history ( + version_partition integer NOT NULL, + year integer NOT NULL, + month integer NOT NULL, + update_time timestamp without time zone NOT NULL, + description character varying(255), + manifest_md5 character varying(64), + new_version character varying(64), + old_version 
character varying(64) +); + + +ALTER TABLE public.schema_update_history OWNER TO temporal; + +-- +-- Name: schema_version; Type: TABLE; Schema: public; Owner: temporal +-- + +CREATE TABLE public.schema_version ( + version_partition integer NOT NULL, + db_name character varying(255) NOT NULL, + creation_time timestamp without time zone, + curr_version character varying(64), + min_compatible_version character varying(64) +); + + +ALTER TABLE public.schema_version OWNER TO temporal; + +-- +-- Data for Name: executions_visibility; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.executions_visibility (namespace_id, run_id, start_time, execution_time, workflow_id, workflow_type_name, status, close_time, history_length, memo, encoding, task_queue) FROM stdin; +\. + + +-- +-- Data for Name: schema_update_history; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.schema_update_history (version_partition, year, month, update_time, description, manifest_md5, new_version, old_version) FROM stdin; +0 2021 3 2021-03-21 23:09:18.317158 initial version 0.0 0 +0 2021 3 2021-03-21 23:09:18.40721 base version of visibility schema 698373883c1c0dd44607a446a62f2a79 1.0 0.0 +0 2021 3 2021-03-21 23:09:18.41287 add close time & status index e286f8af0a62e291b35189ce29d3fff3 1.1 1.0 +\. + + +-- +-- Data for Name: schema_version; Type: TABLE DATA; Schema: public; Owner: temporal +-- + +COPY public.schema_version (version_partition, db_name, creation_time, curr_version, min_compatible_version) FROM stdin; +0 temporal_visibility 2021-03-21 23:09:18.411815 1.1 0.1 +\. 
+ + +-- +-- Name: executions_visibility executions_visibility_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.executions_visibility + ADD CONSTRAINT executions_visibility_pkey PRIMARY KEY (namespace_id, run_id); + + +-- +-- Name: schema_update_history schema_update_history_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.schema_update_history + ADD CONSTRAINT schema_update_history_pkey PRIMARY KEY (version_partition, year, month, update_time); + + +-- +-- Name: schema_version schema_version_pkey; Type: CONSTRAINT; Schema: public; Owner: temporal +-- + +ALTER TABLE ONLY public.schema_version + ADD CONSTRAINT schema_version_pkey PRIMARY KEY (version_partition, db_name); + + +-- +-- Name: by_close_time_by_status; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX by_close_time_by_status ON public.executions_visibility USING btree (namespace_id, close_time DESC, run_id, status); + + +-- +-- Name: by_status_by_close_time; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX by_status_by_close_time ON public.executions_visibility USING btree (namespace_id, status, close_time DESC, run_id); + + +-- +-- Name: by_status_by_start_time; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX by_status_by_start_time ON public.executions_visibility USING btree (namespace_id, status, start_time DESC, run_id); + + +-- +-- Name: by_type_close_time; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX by_type_close_time ON public.executions_visibility USING btree (namespace_id, workflow_type_name, status, close_time DESC, run_id); + + +-- +-- Name: by_type_start_time; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX by_type_start_time ON public.executions_visibility USING btree (namespace_id, workflow_type_name, status, start_time DESC, run_id); + + +-- +-- Name: by_workflow_id_close_time; Type: INDEX; Schema: public; Owner: temporal +-- + 
+CREATE INDEX by_workflow_id_close_time ON public.executions_visibility USING btree (namespace_id, workflow_id, status, close_time DESC, run_id); + + +-- +-- Name: by_workflow_id_start_time; Type: INDEX; Schema: public; Owner: temporal +-- + +CREATE INDEX by_workflow_id_start_time ON public.executions_visibility USING btree (namespace_id, workflow_id, status, start_time DESC, run_id); + + +-- +-- PostgreSQL database dump complete +-- From 372c1f47e362b27bd7c4a0721cb52ec89657d470 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 25 Mar 2021 21:44:36 +0200 Subject: [PATCH 035/175] Pre-initialize temporal-postgresql with schema that includes default namespace --- apps/temporal-postgresql/Dockerfile | 1 + .../bin/apply_migrations.sh | 6 +- .../bin/initialize_schema.sh | 14 +- apps/temporal-postgresql/schema/README.md | 70 +++++++ .../temporal.sql => schema/mc_temporal.sql} | 192 +++++++----------- .../mc_temporal_visibility.sql} | 14 +- apps/temporal-server/Dockerfile | 14 +- apps/temporal-server/bin/temporal.sh | 22 +- 8 files changed, 156 insertions(+), 177 deletions(-) create mode 100644 apps/temporal-postgresql/schema/README.md rename apps/temporal-postgresql/{schemas/temporal.sql => schema/mc_temporal.sql} (53%) rename apps/temporal-postgresql/{schemas/temporal_visibility.sql => schema/mc_temporal_visibility.sql} (85%) diff --git a/apps/temporal-postgresql/Dockerfile b/apps/temporal-postgresql/Dockerfile index 4c175ff017..233452b161 100644 --- a/apps/temporal-postgresql/Dockerfile +++ b/apps/temporal-postgresql/Dockerfile @@ -42,6 +42,7 @@ RUN \ # Copy helper scripts COPY bin/* /opt/temporal-postgresql/bin/ +COPY schema/*.sql /opt/temporal-postgresql/schema/ USER postgres diff --git a/apps/temporal-postgresql/bin/apply_migrations.sh b/apps/temporal-postgresql/bin/apply_migrations.sh index d44cda5907..7bd525211d 100755 --- a/apps/temporal-postgresql/bin/apply_migrations.sh +++ b/apps/temporal-postgresql/bin/apply_migrations.sh @@ -22,7 +22,7 @@ 
PGCTL_START_TIMEOUT=3600 -w \ start -SCHEMAS_DIR="/opt/temporal-postgresql/schema/v96" +VENDOR_SCHEMA_DIR="/opt/temporal-postgresql/schema/v96" TSQL="temporal-sql-tool \ --plugin postgres \ --ep 127.0.0.1 \ @@ -30,10 +30,10 @@ TSQL="temporal-sql-tool \ -u temporal \ --pw temporal" -MAIN_SCHEMA_DIR="${SCHEMAS_DIR}/temporal/versioned" +MAIN_SCHEMA_DIR="${VENDOR_SCHEMA_DIR}/temporal/versioned" $TSQL --db temporal update-schema -d "${MAIN_SCHEMA_DIR}" -VISIBILITY_SCHEMA_DIR="${SCHEMAS_DIR}/visibility/versioned" +VISIBILITY_SCHEMA_DIR="${VENDOR_SCHEMA_DIR}/visibility/versioned" $TSQL --db temporal_visibility update-schema -d "${VISIBILITY_SCHEMA_DIR}" # Stop PostgreSQL diff --git a/apps/temporal-postgresql/bin/initialize_schema.sh b/apps/temporal-postgresql/bin/initialize_schema.sh index e9620d6ae8..1baa42fc12 100755 --- a/apps/temporal-postgresql/bin/initialize_schema.sh +++ b/apps/temporal-postgresql/bin/initialize_schema.sh @@ -22,7 +22,7 @@ MC_POSTGRESQL_CONF_PATH="/etc/postgresql/11/main/postgresql.conf" psql -v ON_ERROR_STOP=1 -c "CREATE USER temporal WITH PASSWORD 'temporal' SUPERUSER;" -SCHEMAS_DIR="/opt/temporal-postgresql/schema/v96" +SCHEMA_DIR="/opt/temporal-postgresql/schema" TSQL="temporal-sql-tool \ --plugin postgres \ --ep 127.0.0.1 \ @@ -31,19 +31,11 @@ TSQL="temporal-sql-tool \ --pw temporal \ " -MAIN_SCHEMA_DIR="${SCHEMAS_DIR}/temporal/versioned" $TSQL create --db temporal -$TSQL --db temporal setup-schema -v 0.0 -$TSQL --db temporal update-schema -d "${MAIN_SCHEMA_DIR}" +psql -v ON_ERROR_STOP=1 -d temporal -f "${SCHEMA_DIR}/mc_temporal.sql" -VISIBILITY_SCHEMA_DIR="${SCHEMAS_DIR}/visibility/versioned" $TSQL create --db temporal_visibility -$TSQL --db temporal_visibility setup-schema -v 0.0 -$TSQL --db temporal_visibility update-schema -d "${VISIBILITY_SCHEMA_DIR}" - -# Given that "temporal-sql-tool" returns 0 on errors too, make sure that something got created -psql -d temporal -c 'SELECT * FROM visibility_tasks' > /dev/null -psql -d 
temporal_visibility -c 'SELECT * FROM schema_version' > /dev/null +psql -v ON_ERROR_STOP=1 -d temporal_visibility -f "${SCHEMA_DIR}/mc_temporal_visibility.sql" # Stop PostgreSQL "${MC_POSTGRESQL_BIN_DIR}/pg_ctl" \ diff --git a/apps/temporal-postgresql/schema/README.md b/apps/temporal-postgresql/schema/README.md new file mode 100644 index 0000000000..591b890657 --- /dev/null +++ b/apps/temporal-postgresql/schema/README.md @@ -0,0 +1,70 @@ +# Why `temporal-postgresql` is pre-initialized using a SQL dump instead of `temporal-sql-tool`? + +In the vendor's Docker image, PostgreSQL's schema gets initialized using `temporal-sql-tool`: + +https://github.com/temporalio/temporal/blob/9fcf5e4153b29302de3c2333fdeff6343d0ca889/docker/start.sh#L71-L78 + +Later, the default namespace gets registered in the background, while `temporal-server` is getting started: + +https://github.com/temporalio/temporal/blob/9fcf5e4153b29302de3c2333fdeff6343d0ca889/docker/start.sh#L179-L187 + +There are a few issues with this approach: + +1. Running an important command in background while starting a service is not particularly "clean"; +2. Clients can't just wait for `temporal-server`'s port to open because there's no guarantee that the default namespace will already exist when they first connect; +3. Even when `tctl` manages to create the namespace, it doesn't seem to become available to clients for about 30 seconds so the clients are then forced to test for namespace existence. + +Therefore, to generate the default schema we: + +1. Pre-initialize schema using vendor's tools; +2. Start Temporal server; +3. Create default namespace; +4. Wait for a minute to let things "settle" or whatever it is that it's doing in the background; +5. `pg_dump` both databases to schema files to be later used for building the image. 
+ +This is how the initial schema was generated: + +```bash +export TSQL="temporal-sql-tool \ + --plugin postgres \ + --ep 127.0.0.1 \ + -p 12345 \ + -u temporal \ + --pw temporal" + +# Create both databases using vendor's tools +$TSQL create --db temporal +$TSQL --db temporal setup-schema -v 0.0 +$TSQL --db temporal update-schema -d "${MAIN_SCHEMA_DIR}" + +$TSQL create --db temporal_visibility +$TSQL --db temporal_visibility setup-schema -v 0.0 +$TSQL --db temporal_visibility update-schema -d "${VISIBILITY_SCHEMA_DIR}" + +# Start the server in the background +temporal-server & + +# Create the default namespace whenever the server becomes ready +until tctl --ns default namespace describe < /dev/null; do + echo "Default namespace not found. Creating..." + sleep 0.2 + + # FIXME retention period rather short + tctl \ + --ns default \ + namespace register \ + --rd 1 \ + --desc "Default namespace for Temporal Server" \ + || echo "Creating default namespace failed." + +done + +# Even after creating the default namespace, it doesn't become immediately ready +# so wait for a bit +sleep 60 + +# Dump both databases pre-initialized with default namespace to be used for +# building the image +pg_dump --inserts temporal > mc_temporal.sql +pg_dump --inserts temporal_visibility > mc_temporal_visibility.sql +``` diff --git a/apps/temporal-postgresql/schemas/temporal.sql b/apps/temporal-postgresql/schema/mc_temporal.sql similarity index 53% rename from apps/temporal-postgresql/schemas/temporal.sql rename to apps/temporal-postgresql/schema/mc_temporal.sql index d32bb8cf1e..2081d5ae6f 100644 --- a/apps/temporal-postgresql/schemas/temporal.sql +++ b/apps/temporal-postgresql/schema/mc_temporal.sql @@ -477,285 +477,230 @@ ALTER TABLE ONLY public.buffered_events ALTER COLUMN id SET DEFAULT nextval('pub -- Data for Name: activity_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.activity_info_maps (shard_id, namespace_id, workflow_id, run_id, schedule_id, data, 
data_encoding) FROM stdin; -\. -- -- Data for Name: buffered_events; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.buffered_events (shard_id, namespace_id, workflow_id, run_id, id, data, data_encoding) FROM stdin; -\. -- -- Data for Name: child_execution_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.child_execution_info_maps (shard_id, namespace_id, workflow_id, run_id, initiated_id, data, data_encoding) FROM stdin; -\. -- -- Data for Name: cluster_membership; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.cluster_membership (membership_partition, host_id, rpc_address, rpc_port, role, session_start, last_heartbeat, record_expiry) FROM stdin; -0 \\xd343f2848d9b11eba18c02420a010005 10.1.0.5 6933 1 2021-03-25 18:56:36.884362 2021-03-25 18:57:25.896779 2021-03-27 18:57:25.896779 -0 \\xd346e1178d9b11eba18c02420a010005 10.1.0.5 6935 3 2021-03-25 18:56:36.908823 2021-03-25 18:57:28.931428 2021-03-27 18:57:28.931428 -0 \\xd34bfa858d9b11eba18c02420a010005 10.1.0.5 6939 4 2021-03-25 18:56:36.936065 2021-03-25 18:57:36.950967 2021-03-27 18:57:36.950967 -0 \\xd3454ab18d9b11eba18c02420a010005 10.1.0.5 6934 2 2021-03-25 18:56:36.892785 2021-03-25 18:57:38.910999 2021-03-27 18:57:38.910999 -\. 
+INSERT INTO public.cluster_membership VALUES (0, '\x0bc631ce8d9d11eba61a02420a010005', '10.1.0.5', 6939, 4, '2021-03-25 19:05:21.191604', '2021-03-25 19:08:16.229112', '2021-03-27 19:08:16.229112'); +INSERT INTO public.cluster_membership VALUES (0, '\x0bc172578d9d11eba61a02420a010005', '10.1.0.5', 6934, 2, '2021-03-25 19:05:21.155385', '2021-03-25 19:08:21.199238', '2021-03-27 19:08:21.199238'); +INSERT INTO public.cluster_membership VALUES (0, '\x0bc024568d9d11eba61a02420a010005', '10.1.0.5', 6933, 1, '2021-03-25 19:05:21.147447', '2021-03-25 19:08:24.182213', '2021-03-27 19:08:24.182213'); +INSERT INTO public.cluster_membership VALUES (0, '\x0bc2d69a8d9d11eba61a02420a010005', '10.1.0.5', 6935, 3, '2021-03-25 19:05:21.165985', '2021-03-25 19:08:24.227229', '2021-03-27 19:08:24.227229'); -- -- Data for Name: cluster_metadata; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.cluster_metadata (metadata_partition, data, data_encoding, version) FROM stdin; -0 \\x0a0661637469766510101a2432323465333031362d636566362d343963642d393565302d663962656330306438653934 Proto3 1 -\. +INSERT INTO public.cluster_metadata VALUES (0, '\x0a0661637469766510101a2432626461333835342d303934662d343463372d393131382d306261343466663166373463', 'Proto3', 1); -- -- Data for Name: current_executions; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.current_executions (shard_id, namespace_id, workflow_id, run_id, create_request_id, state, status, start_version, last_write_version) FROM stdin; -8 \\x32049b68787240948e63d0dd59896a83 temporal-sys-tq-scanner \\x8b5622d072f44c068c5e4b04d4b3a410 766425e5-f5f5-4743-ba90-8ca7ebdcdfb7 1 10 0 -\. 
+INSERT INTO public.current_executions VALUES (8, '\x32049b68787240948e63d0dd59896a83', 'temporal-sys-tq-scanner', '\xe26b7c7d5d724fdbb82c95b0043a260b', '77710b14-d9ae-4f95-a433-6300280ac0f2', 1, 1, 0, 0); -- -- Data for Name: executions; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.executions (shard_id, namespace_id, workflow_id, run_id, next_event_id, last_write_version, data, data_encoding, state, state_encoding) FROM stdin; -8 \\x32049b68787240948e63d0dd59896a83 temporal-sys-tq-scanner \\x8b5622d072f44c068c5e4b04d4b3a410 2 0 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65724a2374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30522074656d706f72616c2d7379732d74712d7363616e6e65722d776f726b666c6f775a0062040880af1a6a02080a8801808040900101a2010c08e8b9f3820610ebfaefc803aa010c08e8b9f3820610ebfaefc803ca0100d00101fa0109656d70747955756964980201da020c30202a2f3132202a202a202ab2035412520a4c0a2438623536323264302d373266342d346330362d386335652d346230346434623361343130122438633465363161342d386136312d346263372d613234362d32373039363532303138326212020801ba032438623536323264302d373266342d346330362d386335652d346230346434623361343130c2030308f201ca030c0880f78e830610ebfaefc803 Proto3 \\x0a2437363634323565352d663566352d343734332d626139302d386361376562646364666237122438623536323264302d373266342d346330362d386335652d34623034643462336134313018012001 Proto3 -\. 
+INSERT INTO public.executions VALUES (8, '\x32049b68787240948e63d0dd59896a83', 'temporal-sys-tq-scanner', '\xe26b7c7d5d724fdbb82c95b0043a260b', 2, 0, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65724a2374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30522074656d706f72616c2d7379732d74712d7363616e6e65722d776f726b666c6f775a0062040880af1a6a02080a8801808040900101a2010b08f5bdf3820610acfea362aa010b08f5bdf3820610acfea362ca0100d00101fa0109656d70747955756964980201da020c30202a2f3132202a202a202ab2035412520a4c0a2465323662376337642d356437322d346664622d623832632d393562303034336132363062122433613039333430642d353231372d346362322d396165382d61633132356539396364326612020801ba032465323662376337642d356437322d346664622d623832632d393562303034336132363062c2030308f101ca030b0880f78e830610acfea362', 'Proto3', '\x0a2437373731306231342d643961652d346639352d613433332d363330303238306163306632122465323662376337642d356437322d346664622d623832632d39356230303433613236306218012001', 'Proto3'); -- -- Data for Name: history_node; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.history_node (shard_id, tree_id, branch_id, node_id, txn_id, data, data_encoding) FROM stdin; -8 \\x8b5622d072f44c068c5e4b04d4b3a410 \\x8c4e61a48a614bc7a24627096520182b 1 -1048577 \\x0aef010801120c08e8b9f3820610ebfaefc80318012880804032d6010a220a2074656d706f72616c2d7379732d74712d7363616e6e65722d776f726b666c6f772a270a2374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3010013a0042040880af1a4a02080a5803722438623536323264302d373266342d346330362d386335652d3462303464346233613431307a103435406561346434626234623434314082012438623536323264302d373266342d346330362d386335652d346230346434623361343130900101a2010c30202a2f3132202a202a202aaa010408988e01ca0100 Proto3 -\. 
+INSERT INTO public.history_node VALUES (8, '\xe26b7c7d5d724fdbb82c95b0043a260b', '\x3a09340d52174cb29ae8ac125e99cd2f', 1, -1048577, '\x0aee010801120b08f5bdf3820610acfea36218012880804032d6010a220a2074656d706f72616c2d7379732d74712d7363616e6e65722d776f726b666c6f772a270a2374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3010013a0042040880af1a4a02080a5803722465323662376337642d356437322d346664622d623832632d3935623030343361323630627a103435403266323636626239626333654082012465323662376337642d356437322d346664622d623832632d393562303034336132363062900101a2010c30202a2f3132202a202a202aaa0104088b8a01ca0100', 'Proto3'); -- -- Data for Name: history_tree; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.history_tree (shard_id, tree_id, branch_id, data, data_encoding) FROM stdin; -8 \\x8b5622d072f44c068c5e4b04d4b3a410 \\x8c4e61a48a614bc7a24627096520182b \\x0a4c0a2438623536323264302d373266342d346330362d386335652d346230346434623361343130122438633465363161342d386136312d346263372d613234362d323730393635323031383262120c08e8b9f3820610c89790c9031a6133323034396236382d373837322d343039342d386536332d6430646435393839366138333a74656d706f72616c2d7379732d74712d7363616e6e65723a38623536323264302d373266342d346330362d386335652d346230346434623361343130 Proto3 -\. +INSERT INTO public.history_tree VALUES (8, '\xe26b7c7d5d724fdbb82c95b0043a260b', '\x3a09340d52174cb29ae8ac125e99cd2f', '\x0a4c0a2465323662376337642d356437322d346664622d623832632d393562303034336132363062122433613039333430642d353231372d346362322d396165382d616331323565393963643266120b08f5bdf382061092f3bd621a6133323034396236382d373837322d343039342d386536332d6430646435393839366138333a74656d706f72616c2d7379732d74712d7363616e6e65723a65323662376337642d356437322d346664622d623832632d393562303034336132363062', 'Proto3'); -- -- Data for Name: namespace_metadata; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.namespace_metadata (partition_id, notification_version) FROM stdin; -54321 3 -\. 
+INSERT INTO public.namespace_metadata VALUES (54321, 3); -- -- Data for Name: namespaces; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.namespaces (partition_id, id, name, notification_version, data, data_encoding, is_global) FROM stdin; -54321 \\x32049b68787240948e63d0dd59896a83 temporal-system 1 \\x0a780a2433323034396236382d373837322d343039342d386536332d64306464353938393661383310011a0f74656d706f72616c2d73797374656d222254656d706f72616c20696e7465726e616c2073797374656d206e616d6573706163652a1974656d706f72616c2d636f72654074656d706f72616c2e696f120a0a040880f524200130011a100a06616374697665120661637469766528ffffffffffffffffff01 Proto3 f -54321 \\x26f80ced0ead4534b085ff8c0643031c default 2 \\x0a580a2432366638306365642d306561642d343533342d623038352d66663863303634333033316310011a0764656661756c74222544656661756c74206e616d65737061636520666f722054656d706f72616c2053657276657212660a040880a3051a0020022a2a66696c653a2f2f2f7661722f6c69622f74656d706f72616c2f617263686976616c2f74656d706f72616c30023a2c66696c653a2f2f2f7661722f6c69622f74656d706f72616c2f617263686976616c2f7669736962696c6974791a100a066163746976651206616374697665 Proto3 f -\. 
+INSERT INTO public.namespaces VALUES (54321, '\x32049b68787240948e63d0dd59896a83', 'temporal-system', 1, '\x0a780a2433323034396236382d373837322d343039342d386536332d64306464353938393661383310011a0f74656d706f72616c2d73797374656d222254656d706f72616c20696e7465726e616c2073797374656d206e616d6573706163652a1974656d706f72616c2d636f72654074656d706f72616c2e696f120a0a040880f524200130011a100a06616374697665120661637469766528ffffffffffffffffff01', 'Proto3', false); +INSERT INTO public.namespaces VALUES (54321, '\xf61d7f7ff624482884bcc0fedab2456c', 'default', 2, '\x0a580a2466363164376637662d663632342d343832382d383462632d63306665646162323435366310011a0764656661756c74222544656661756c74206e616d65737061636520666f722054656d706f72616c2053657276657212660a040880a3051a0020022a2a66696c653a2f2f2f7661722f6c69622f74656d706f72616c2f617263686976616c2f74656d706f72616c30023a2c66696c653a2f2f2f7661722f6c69622f74656d706f72616c2f617263686976616c2f7669736962696c6974791a100a066163746976651206616374697665', 'Proto3', false); -- -- Data for Name: queue; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.queue (queue_type, message_id, message_payload, message_encoding) FROM stdin; -\. -- -- Data for Name: queue_metadata; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.queue_metadata (queue_type, data, data_encoding) FROM stdin; -1 \\x7b7d Json --1 \\x7b7d Json -\. +INSERT INTO public.queue_metadata VALUES (1, '\x7b7d', 'Json'); +INSERT INTO public.queue_metadata VALUES (-1, '\x7b7d', 'Json'); -- -- Data for Name: replication_tasks; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.replication_tasks (shard_id, task_id, data, data_encoding) FROM stdin; -\. -- -- Data for Name: replication_tasks_dlq; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.replication_tasks_dlq (source_cluster_name, shard_id, task_id, data, data_encoding) FROM stdin; -\. 
-- -- Data for Name: request_cancel_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.request_cancel_info_maps (shard_id, namespace_id, workflow_id, run_id, initiated_id, data, data_encoding) FROM stdin; -\. -- -- Data for Name: schema_update_history; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.schema_update_history (version_partition, year, month, update_time, description, manifest_md5, new_version, old_version) FROM stdin; -0 2021 3 2021-03-21 23:09:17.543434 initial version 0.0 0 -0 2021 3 2021-03-21 23:09:17.807128 base version of schema 55b84ca114ac34d84bdc5f52c198fa33 1.0 0.0 -0 2021 3 2021-03-21 23:09:17.80979 schema update for cluster metadata 58f06841bbb187cb210db32a090c21ee 1.1 1.0 -0 2021 3 2021-03-21 23:09:17.811408 schema update for RPC replication c6bdeea21882e2625038927a84929b16 1.2 1.1 -0 2021 3 2021-03-21 23:09:17.815148 schema update for kafka deprecation 3beee7d470421674194475f94b58d89b 1.3 1.2 -0 2021 3 2021-03-21 23:09:17.816468 schema update for cluster metadata cleanup c53e2e9cea5660c8a1f3b2ac73cdb138 1.4 1.3 -\. 
+INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:17.543434', 'initial version', '', '0.0', '0'); +INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:17.807128', 'base version of schema', '55b84ca114ac34d84bdc5f52c198fa33', '1.0', '0.0'); +INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:17.80979', 'schema update for cluster metadata', '58f06841bbb187cb210db32a090c21ee', '1.1', '1.0'); +INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:17.811408', 'schema update for RPC replication', 'c6bdeea21882e2625038927a84929b16', '1.2', '1.1'); +INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:17.815148', 'schema update for kafka deprecation', '3beee7d470421674194475f94b58d89b', '1.3', '1.2'); +INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:17.816468', 'schema update for cluster metadata cleanup', 'c53e2e9cea5660c8a1f3b2ac73cdb138', '1.4', '1.3'); -- -- Data for Name: schema_version; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.schema_version (version_partition, db_name, creation_time, curr_version, min_compatible_version) FROM stdin; -0 temporal 2021-03-21 23:09:17.816194 1.4 1.0 -\. 
+INSERT INTO public.schema_version VALUES (0, 'temporal', '2021-03-21 23:09:17.816194', '1.4', '1.0'); -- -- Data for Name: shards; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.shards (shard_id, range_id, data, data_encoding) FROM stdin; -1 1 \\x080110011a0d31302e312e302e353a3732333430014802 Proto3 -3 1 \\x080310011a0d31302e312e302e353a3732333430014802 Proto3 -7 1 \\x080710011a0d31302e312e302e353a3732333430014802 Proto3 -10 1 \\x080a10011a0d31302e312e302e353a3732333430014802 Proto3 -5 1 \\x080510011a0d31302e312e302e353a3732333430014802 Proto3 -11 1 \\x080b10011a0d31302e312e302e353a3732333430014802 Proto3 -9 1 \\x080910011a0d31302e312e302e353a3732333430014802 Proto3 -6 1 \\x080610011a0d31302e312e302e353a3732333430014802 Proto3 -8 1 \\x080810011a0d31302e312e302e353a3732333430014802 Proto3 -2 1 \\x080210011a0d31302e312e302e353a3732333430014802 Proto3 -14 1 \\x080e10011a0d31302e312e302e353a3732333430014802 Proto3 -16 1 \\x081010011a0d31302e312e302e353a3732333430014802 Proto3 -4 1 \\x080410011a0d31302e312e302e353a3732333430014802 Proto3 -13 1 \\x080d10011a0d31302e312e302e353a3732333430014802 Proto3 -12 1 \\x080c10011a0d31302e312e302e353a3732333430014802 Proto3 -15 1 \\x080f10011a0d31302e312e302e353a3732333430014802 Proto3 -\. 
+INSERT INTO public.shards VALUES (7, 1, '\x080710011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (4, 1, '\x080410011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (10, 1, '\x080a10011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (1, 1, '\x080110011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (11, 1, '\x080b10011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (2, 1, '\x080210011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (5, 1, '\x080510011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (8, 1, '\x080810011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (12, 1, '\x080c10011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (15, 1, '\x080f10011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (13, 1, '\x080d10011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (6, 1, '\x080610011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (9, 1, '\x080910011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (3, 1, '\x080310011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (16, 1, '\x081010011a0d31302e312e302e353a3732333430014802', 'Proto3'); +INSERT INTO public.shards VALUES (14, 1, '\x080e10011a0d31302e312e302e353a3732333430014802', 'Proto3'); -- -- Data for Name: signal_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.signal_info_maps (shard_id, namespace_id, workflow_id, run_id, initiated_id, data, data_encoding) FROM stdin; -\. 
-- -- Data for Name: signals_requested_sets; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.signals_requested_sets (shard_id, namespace_id, workflow_id, run_id, signal_id) FROM stdin; -\. -- -- Data for Name: task_queues; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.task_queues (range_hash, task_queue_id, range_id, data, data_encoding) FROM stdin; -1502813099 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a66643265396231612d623363362d343662392d386333352d38343130363233323937613701 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a66643265396231612d623363362d343662392d386333352d38343130363233323937613718012002320b08a1ddf8820610a6a680223a0b08a1baf3820610b89f8022 Proto3 -940296977 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3101 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f31180120013a0b08a1baf3820610dc8bc822 Proto3 -653791233 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3102 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f31180220013a0b08a1baf3820610d194da22 Proto3 -1410825331 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3002 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30180220013a0b08a1baf3820610c28d8623 Proto3 -3095716534 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d626174636865722d7461736b717565756502 1 
\\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121e74656d706f72616c2d7379732d626174636865722d7461736b7175657565180220013a0b08a1baf3820610f2ec8e23 Proto3 -2063506710 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d617263686976616c2d747101 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121474656d706f72616c2d617263686976616c2d7471180120013a0b08a1baf38206109cd1ca23 Proto3 -1994493189 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a61616336633234302d393530662d343065332d396130352d38353531313136653639373301 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a61616336633234302d393530662d343065332d396130352d38353531313136653639373318012002320b08a1ddf8820610c2dffd223a0b08a1baf382061095d6fd22 Proto3 -4095103286 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3102 1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f31180220013a0b08a1baf38206109bcfeb23 Proto3 -1332228876 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a36326232346637372d393431352d343336342d616534332d31663134366537316561326601 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a36326232346637372d393431352d343336342d616534332d31663134366537316561326618012002320b08a1ddf8820610fa8293243a0b08a1baf3820610fefa9224 Proto3 -3469555445 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d626174636865722d7461736b717565756501 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121e74656d706f72616c2d7379732d626174636865722d7461736b7175657565180120013a0b08a1baf3820610dfaee824 Proto3 -1018868252 
\\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3202 1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f32180220013a0b08a1baf3820610cbab8c24 Proto3 -2528910666 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c69637901 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122a74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c696379180120013a0b08a1baf382061081b68324 Proto3 -3617866575 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3201 1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f32180120013a0b08a1baf3820610efcdba27 Proto3 -1177789365 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3302 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f33180220013a0b08a0baf3820610dad79721 Proto3 -3681167674 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3202 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f32180220013a0b08a0baf3820610b1849221 Proto3 -2662461164 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3102 1 
\\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f31180220013a0b08a0baf38206109fd4a922 Proto3 -289827042 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3102 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f31180220013a0b08a0baf3820610e9d7b922 Proto3 -430461988 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3301 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f33180120013a0b08a0baf3820610d493bd22 Proto3 -740397391 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3301 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f33180120013a0b08a0baf382061087d6b023 Proto3 -1693213168 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3302 1\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f33180220013a0b08a0baf3820610ddb1fb23 Proto3 -739332331 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3201 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f32180120013a0b08a0baf3820610dde58024 Proto3 -481579558 \\x32049b68787240948e63d0dd59896a836561346434626234623434313a30636231303935612d366365342d343133332d623631652d65366137323333313833336201 1 
\\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312316561346434626234623434313a30636231303935612d366365342d343133332d623631652d65366137323333313833336218012002320b08a1ddf8820610f89fee1d3a0b08a1baf38206108499ee1d Proto3 -70786998 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3101 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f31180120013a0b08a1baf3820610fabec31f Proto3 -2358430835 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3202 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f32180220013a0b08a1baf3820610efe1eb1f Proto3 -3662736275 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3302 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f33180220013a0b08a1baf38206108f8eef1f Proto3 -3963122975 \\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3202 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f32180220013a0b08a1baf38206108cb38521 Proto3 -4214421317 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3001 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30180120013a0b08a1baf382061080949521 Proto3 -3597285451 
\\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3101 1 \\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f31180120013a0b08a1baf3820610909c9823 Proto3 -1688886821 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c69637902 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122a74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c696379180220013a0b08a1baf382061081f6e824 Proto3 -288707420 \\x32049b68787240948e63d0dd59896a8374656d706f72616c2d617263686976616c2d747102 1 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121474656d706f72616c2d617263686976616c2d7471180220013a0b08a1baf38206108190e924 Proto3 -\. +INSERT INTO public.task_queues VALUES (1177789365, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3302', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f33180220013a0c08a2bff3820610c9bdf39d01', 'Proto3'); +INSERT INTO public.task_queues VALUES (4075587537, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3301', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f33180120013a0c08a3bff3820610f0a5a3a701', 'Proto3'); +INSERT INTO public.task_queues VALUES (4130865127, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3201', 1, 
'\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f32180120013a0c08a4bff3820610b693c39e01', 'Proto3'); +INSERT INTO public.task_queues VALUES (3662736275, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3302', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f33180220013a0c08a4bff3820610cfa590a301', 'Proto3'); +INSERT INTO public.task_queues VALUES (289827042, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3102', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f31180220013a0c08a5bff3820610e2a9f09c01', 'Proto3'); +INSERT INTO public.task_queues VALUES (288707420, '\x32049b68787240948e63d0dd59896a8374656d706f72616c2d617263686976616c2d747102', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121474656d706f72616c2d617263686976616c2d7471180220013a0c08a5bff3820610eadbdc9d01', 'Proto3'); +INSERT INTO public.task_queues VALUES (1410825331, '\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3002', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30180220013a0c08a5bff3820610edb3e99d01', 'Proto3'); +INSERT INTO public.task_queues VALUES (653791233, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3102', 1, 
'\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f31180220013a0c08a5bff38206108482ee9e01', 'Proto3'); +INSERT INTO public.task_queues VALUES (4214421317, '\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d3001', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122374656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d30180120013a0c08a5bff38206108bbbbd9f01', 'Proto3'); +INSERT INTO public.task_queues VALUES (2358430835, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3202', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f32180220013a0c08a5bff382061092f3c99f01', 'Proto3'); +INSERT INTO public.task_queues VALUES (3720966762, '\x32049b68787240948e63d0dd59896a833266323636626239626333653a63643331636664302d623061322d343663662d623232392d34663839353730386234636501', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312313266323636626239626333653a63643331636664302d623061322d343663662d623232392d34663839353730386234636518012002320c08a5e2f8820610efc4b2a0013a0c08a5bff3820610a4bdb2a001', 'Proto3'); +INSERT INTO public.task_queues VALUES (2063506710, '\x32049b68787240948e63d0dd59896a8374656d706f72616c2d617263686976616c2d747101', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121474656d706f72616c2d617263686976616c2d7471180120013a0c08a5bff3820610e7e6afa301', 'Proto3'); +INSERT INTO public.task_queues VALUES (4095103286, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3102', 1, 
'\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f31180220013a0c08a5bff382061093b198a001', 'Proto3'); +INSERT INTO public.task_queues VALUES (1688886821, '\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c69637902', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122a74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c696379180220013a0c08a5bff3820610d0bea6a301', 'Proto3'); +INSERT INTO public.task_queues VALUES (2575880815, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3101', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f31180120013a0c08a5bff3820610e7c0baa301', 'Proto3'); +INSERT INTO public.task_queues VALUES (2528910666, '\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c69637901', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122a74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c696379180120013a0c08a5bff3820610dbb7ada401', 'Proto3'); +INSERT INTO public.task_queues VALUES (3898027385, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3302', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f33180220013a0c08a2bff382061094e7e9a001', 'Proto3'); +INSERT INTO public.task_queues VALUES (740397391, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3301', 1, 
'\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f33180120013a0c08a2bff3820610f4fe9ba401', 'Proto3'); +INSERT INTO public.task_queues VALUES (1838990935, '\x32049b68787240948e63d0dd59896a833266323636626239626333653a33373030343439352d613332312d346361352d396537372d32656131363231623131646101', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312313266323636626239626333653a33373030343439352d613332312d346361352d396537372d32656131363231623131646118012002320c08a5e2f8820610a5d0dd9b013a0c08a5bff3820610bdc9dd9b01', 'Proto3'); +INSERT INTO public.task_queues VALUES (430461988, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3301', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f33180120013a0c08a5bff3820610eeb2da9b01', 'Proto3'); +INSERT INTO public.task_queues VALUES (1779306934, '\x32049b68787240948e63d0dd59896a833266323636626239626333653a31393461633537342d663165362d346264622d383465322d63336136396133303637623001', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312313266323636626239626333653a31393461633537342d663165362d346264622d383465322d63336136396133303637623018012002320c08a5e2f8820610e5c3a39f013a0c08a5bff3820610ecbba39f01', 'Proto3'); +INSERT INTO public.task_queues VALUES (3681167674, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f3202', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833122b2f5f7379732f74656d706f72616c2d7379732d74712d7363616e6e65722d7461736b71756575652d302f32180220013a0c08a5bff3820610ae94d59e01', 'Proto3'); +INSERT INTO public.task_queues VALUES (3469555445, 
'\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d626174636865722d7461736b717565756501', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121e74656d706f72616c2d7379732d626174636865722d7461736b7175657565180120013a0c08a5bff38206109ac398a001', 'Proto3'); +INSERT INTO public.task_queues VALUES (4294755352, '\x32049b68787240948e63d0dd59896a833266323636626239626333653a64303864363434642d386137302d343337302d383638392d38636266343433376333323901', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312313266323636626239626333653a64303864363434642d386137302d343337302d383638392d38636266343433376333323918012002320c08a5e2f8820610abcaa59f013a0c08a5bff382061099c2a59f01', 'Proto3'); +INSERT INTO public.task_queues VALUES (2174630977, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d617263686976616c2d74712f3201', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121c2f5f7379732f74656d706f72616c2d617263686976616c2d74712f32180120013a0c08a5bff382061083bce6a101', 'Proto3'); +INSERT INTO public.task_queues VALUES (3095716534, '\x32049b68787240948e63d0dd59896a8374656d706f72616c2d7379732d626174636865722d7461736b717565756502', 1, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121e74656d706f72616c2d7379732d626174636865722d7461736b7175657565180220013a0c08a5bff3820610c5bda5a001', 'Proto3'); +INSERT INTO public.task_queues VALUES (3597285451, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3101', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f31180120013a0c08a5bff3820610debfe4a201', 'Proto3'); +INSERT INTO public.task_queues VALUES (1018868252, 
'\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f3202', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312322f5f7379732f74656d706f72616c2d7379732d70726f636573736f722d706172656e742d636c6f73652d706f6c6963792f32180220013a0c08a5bff3820610dd97eea201', 'Proto3'); +INSERT INTO public.task_queues VALUES (3963122975, '\x32049b68787240948e63d0dd59896a832f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f3202', 1, '\x0a2433323034396236382d373837322d343039342d386536332d64306464353938393661383312262f5f7379732f74656d706f72616c2d7379732d626174636865722d7461736b71756575652f32180220013a0c08a5bff3820610b1fde7a101', 'Proto3'); -- -- Data for Name: tasks; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.tasks (range_hash, task_queue_id, task_id, data, data_encoding) FROM stdin; -\. -- -- Data for Name: timer_info_maps; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.timer_info_maps (shard_id, namespace_id, workflow_id, run_id, timer_id, data, data_encoding) FROM stdin; -\. -- -- Data for Name: timer_tasks; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.timer_tasks (shard_id, visibility_timestamp, task_id, data, data_encoding) FROM stdin; -8 2021-03-31 00:00:00.958136 1048579 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65721a2438623536323264302d373266342d346330362d386335652d346230346434623361343130200f508380405a0c0880f78e830610ebfaefc803 Proto3 -8 2021-03-26 00:00:00.958136 1048580 \\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65721a2438623536323264302d373266342d346330362d386335652d34623034643462336134313020123002508480405a0c0880c8f4820610ebfaefc803 Proto3 -\. 
+INSERT INTO public.timer_tasks VALUES (8, '2021-03-31 00:00:00.206111', 1048579, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65721a2465323662376337642d356437322d346664622d623832632d393562303034336132363062200f508380405a0b0880f78e830610acfea362', 'Proto3'); +INSERT INTO public.timer_tasks VALUES (8, '2021-03-26 00:00:00.206111', 1048580, '\x0a2433323034396236382d373837322d343039342d386536332d643064643539383936613833121774656d706f72616c2d7379732d74712d7363616e6e65721a2465323662376337642d356437322d346664622d623832632d39356230303433613236306220123002508480405a0b0880c8f4820610acfea362', 'Proto3'); -- -- Data for Name: transfer_tasks; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.transfer_tasks (shard_id, task_id, data, data_encoding) FROM stdin; -\. -- -- Data for Name: visibility_tasks; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.visibility_tasks (shard_id, task_id, data, data_encoding) FROM stdin; -\. 
-- @@ -1035,4 +980,3 @@ CREATE INDEX cm_idx_rpchost ON public.cluster_membership USING btree (rpc_addres -- -- PostgreSQL database dump complete -- - diff --git a/apps/temporal-postgresql/schemas/temporal_visibility.sql b/apps/temporal-postgresql/schema/mc_temporal_visibility.sql similarity index 85% rename from apps/temporal-postgresql/schemas/temporal_visibility.sql rename to apps/temporal-postgresql/schema/mc_temporal_visibility.sql index cca1ec456c..92463903f1 100644 --- a/apps/temporal-postgresql/schemas/temporal_visibility.sql +++ b/apps/temporal-postgresql/schema/mc_temporal_visibility.sql @@ -79,28 +79,22 @@ ALTER TABLE public.schema_version OWNER TO temporal; -- Data for Name: executions_visibility; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.executions_visibility (namespace_id, run_id, start_time, execution_time, workflow_id, workflow_type_name, status, close_time, history_length, memo, encoding, task_queue) FROM stdin; -\. -- -- Data for Name: schema_update_history; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.schema_update_history (version_partition, year, month, update_time, description, manifest_md5, new_version, old_version) FROM stdin; -0 2021 3 2021-03-21 23:09:18.317158 initial version 0.0 0 -0 2021 3 2021-03-21 23:09:18.40721 base version of visibility schema 698373883c1c0dd44607a446a62f2a79 1.0 0.0 -0 2021 3 2021-03-21 23:09:18.41287 add close time & status index e286f8af0a62e291b35189ce29d3fff3 1.1 1.0 -\. 
+INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:18.317158', 'initial version', '', '0.0', '0'); +INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:18.40721', 'base version of visibility schema', '698373883c1c0dd44607a446a62f2a79', '1.0', '0.0'); +INSERT INTO public.schema_update_history VALUES (0, 2021, 3, '2021-03-21 23:09:18.41287', 'add close time & status index', 'e286f8af0a62e291b35189ce29d3fff3', '1.1', '1.0'); -- -- Data for Name: schema_version; Type: TABLE DATA; Schema: public; Owner: temporal -- -COPY public.schema_version (version_partition, db_name, creation_time, curr_version, min_compatible_version) FROM stdin; -0 temporal_visibility 2021-03-21 23:09:18.411815 1.1 0.1 -\. +INSERT INTO public.schema_version VALUES (0, 'temporal_visibility', '2021-03-21 23:09:18.411815', '1.1', '0.1'); -- diff --git a/apps/temporal-server/Dockerfile b/apps/temporal-server/Dockerfile index 501162a99e..f5944725da 100644 --- a/apps/temporal-server/Dockerfile +++ b/apps/temporal-server/Dockerfile @@ -49,15 +49,13 @@ RUN \ mkdir -p /opt/temporal-server/config/ && \ chown temporal:temporal /opt/temporal-server/config/ && \ # - # Directories for first run shim and archival - mkdir -p /var/lib/temporal/archival/temporal/ && \ - mkdir -p /var/lib/temporal/archival/visibility/ && \ + # Directories workflow archival + mkdir -p \ + /var/lib/temporal/archival/temporal/ \ + /var/lib/temporal/archival/visibility/ \ + && \ chown -R temporal:temporal /var/lib/temporal/ && \ # - # Create a file that will denote whether it's the first run of this service - # mounted to a specific volume - touch /var/lib/temporal/first_run && \ - # true COPY bin/* /opt/temporal-server/bin/ @@ -68,7 +66,7 @@ ENV PATH="/opt/temporal-server/bin:${PATH}" \ TEMPORAL_CLI_ADDRESS="temporal-server:7233" \ TEMPORAL_CLI_NAMESPACE="default" -# Archives and first run shim +# Archives VOLUME /var/lib/temporal/ # Port descriptions: 
https://docs.temporal.io/docs/server-architecture/ diff --git a/apps/temporal-server/bin/temporal.sh b/apps/temporal-server/bin/temporal.sh index 4caa3890e9..48009185f3 100755 --- a/apps/temporal-server/bin/temporal.sh +++ b/apps/temporal-server/bin/temporal.sh @@ -31,27 +31,7 @@ while true; do fi done -register_default_namespace() { - echo "Registering default namespace" - until tctl --ns default namespace describe < /dev/null; do - echo "Default namespace not found. Creating..." - sleep 0.2 - # FIXME retention period super short - # FIXME doesn't work a few seconds after getting created - tctl --ns default namespace register --rd 1 --desc "Default namespace for Temporal Server" || echo "Creating default namespace failed." - done - echo "Default namespace registration complete." -} - -if [ -e /var/lib/temporal/first_run ]; then - echo "Registering default namespace on first run..." - # FIXME not that great to run it in the background - register_default_namespace & - rm /var/lib/temporal/first_run -fi - -# No "exec" because default namespace gets registered in the background -temporal-server \ +exec temporal-server \ --root /opt/temporal-server \ --env mediacloud \ start From 257ef134ad9d19c42a449020701c07fac3fb47ab Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 25 Mar 2021 21:49:50 +0200 Subject: [PATCH 036/175] Make munin-cron build again --- apps/munin-cron/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/munin-cron/Dockerfile b/apps/munin-cron/Dockerfile index 27e617e49f..e58e4f1e79 100644 --- a/apps/munin-cron/Dockerfile +++ b/apps/munin-cron/Dockerfile @@ -4,6 +4,9 @@ FROM gcr.io/mcback/cron-base:latest +# FIXME +RUN apt-get -y update + # Install packages RUN \ # From afe03f14e044e72a1c40f0871a5629433c33a68b Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 25 Mar 2021 22:11:09 +0200 Subject: [PATCH 037/175] Adapt Helm chart --- apps/temporal-server/bin/temporal.sh | 2 ++ apps/temporal-server/config/dynamicconfig.yaml | 6 
++++++ apps/temporal-server/config/mediacloud_template.yaml | 9 +++------ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/apps/temporal-server/bin/temporal.sh b/apps/temporal-server/bin/temporal.sh index 48009185f3..a79008f63b 100755 --- a/apps/temporal-server/bin/temporal.sh +++ b/apps/temporal-server/bin/temporal.sh @@ -31,6 +31,8 @@ while true; do fi done +# FIXME perhaps run all four services ("frontend", "history", "matching", "worker") +# as separate containers? exec temporal-server \ --root /opt/temporal-server \ --env mediacloud \ diff --git a/apps/temporal-server/config/dynamicconfig.yaml b/apps/temporal-server/config/dynamicconfig.yaml index 0576043ba0..aeda9d600c 100644 --- a/apps/temporal-server/config/dynamicconfig.yaml +++ b/apps/temporal-server/config/dynamicconfig.yaml @@ -13,6 +13,12 @@ frontend.historyMgrNumConns: frontend.throttledLogRPS: - value: 20 constraints: {} +matching.numTaskqueueReadPartitions: +- value: 5 + constraints: {} +matching.numTaskqueueWritePartitions: +- value: 5 + constraints: {} history.historyMgrNumConns: - value: 50 constraints: {} diff --git a/apps/temporal-server/config/mediacloud_template.yaml b/apps/temporal-server/config/mediacloud_template.yaml index 21358e1d99..e4295dc00c 100644 --- a/apps/temporal-server/config/mediacloud_template.yaml +++ b/apps/temporal-server/config/mediacloud_template.yaml @@ -1,11 +1,8 @@ -# FIXME adapt Helm chart configuration here - log: stdout: true level: info persistence: - # FIXME is this right? numHistoryShards: 16 defaultStore: default visibilityStore: visibility @@ -52,11 +49,11 @@ persistence: username: "" password: "" indices: - # FIXME rename to "temporal-visibility" perhaps? 
- visibility: "temporal-visibility-dev" + visibility: "temporal-visibility-mc" global: membership: + name: temporal maxJoinDuration: 30s # broadcastAddress: "0.0.0.0" tls: @@ -196,4 +193,4 @@ publicClient: dynamicConfigClient: filepath: "/opt/temporal-server/config/dynamicconfig.yaml" - pollInterval: "60s" + pollInterval: "10s" From d4e3b7764caf8c1b42b98b9f2ecce25097742888 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Mar 2021 00:26:46 +0200 Subject: [PATCH 038/175] Add extra whitespace --- apps/cliff-fetch-annotation-and-tag/.dockerignore | 1 + apps/nytlabels-fetch-annotation-and-tag/.dockerignore | 1 + 2 files changed, 2 insertions(+) diff --git a/apps/cliff-fetch-annotation-and-tag/.dockerignore b/apps/cliff-fetch-annotation-and-tag/.dockerignore index 752414ae9c..9b2c362a80 100644 --- a/apps/cliff-fetch-annotation-and-tag/.dockerignore +++ b/apps/cliff-fetch-annotation-and-tag/.dockerignore @@ -89,3 +89,4 @@ sdist Temporary Items wheels _Inline + diff --git a/apps/nytlabels-fetch-annotation-and-tag/.dockerignore b/apps/nytlabels-fetch-annotation-and-tag/.dockerignore index 752414ae9c..9b2c362a80 100644 --- a/apps/nytlabels-fetch-annotation-and-tag/.dockerignore +++ b/apps/nytlabels-fetch-annotation-and-tag/.dockerignore @@ -89,3 +89,4 @@ sdist Temporary Items wheels _Inline + From 9daa14dc3f95983ceb36feda766b640d32fe07cc Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Fri, 26 Mar 2021 01:58:25 +0200 Subject: [PATCH 039/175] Add Prometheus and Grafana --- .gitmodules | 3 + apps/temporal-grafana/.dockerignore | 92 +++++++++++++ apps/temporal-grafana/Dockerfile | 71 ++++++++++ apps/temporal-grafana/dashboards | 1 + apps/temporal-grafana/grafana.ini | 126 ++++++++++++++++++ .../provisioning/dashboards/temporal.yml | 10 ++ .../provisioning/datasources/temporal.yml | 7 + .../provisioning/notifiers/.empty_dir | 0 .../provisioning/plugins/.empty_dir | 0 apps/temporal-prometheus/.dockerignore | 92 +++++++++++++ apps/temporal-prometheus/Dockerfile | 32 
+++++ apps/temporal-prometheus/prometheus.yml | 14 ++ apps/temporal-server/Dockerfile | 7 +- .../config/mediacloud_template.yaml | 27 ++-- apps/temporal-server/docker-compose.tests.yml | 32 +++++ 15 files changed, 501 insertions(+), 13 deletions(-) create mode 100644 apps/temporal-grafana/.dockerignore create mode 100644 apps/temporal-grafana/Dockerfile create mode 160000 apps/temporal-grafana/dashboards create mode 100644 apps/temporal-grafana/grafana.ini create mode 100644 apps/temporal-grafana/provisioning/dashboards/temporal.yml create mode 100644 apps/temporal-grafana/provisioning/datasources/temporal.yml create mode 100644 apps/temporal-grafana/provisioning/notifiers/.empty_dir create mode 100644 apps/temporal-grafana/provisioning/plugins/.empty_dir create mode 100644 apps/temporal-prometheus/.dockerignore create mode 100644 apps/temporal-prometheus/Dockerfile create mode 100644 apps/temporal-prometheus/prometheus.yml diff --git a/.gitmodules b/.gitmodules index 70ec6e112b..73a4a82248 100644 --- a/.gitmodules +++ b/.gitmodules @@ -34,3 +34,6 @@ [submodule "apps/elk-journalbeat/journald-log-sample"] path = apps/elk-journalbeat/journald-log-sample url = https://github.com/mediacloud/journald-log-sample.git +[submodule "apps/temporal-grafana/dashboards"] + path = apps/temporal-grafana/dashboards + url = https://github.com/temporalio/dashboards.git diff --git a/apps/temporal-grafana/.dockerignore b/apps/temporal-grafana/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-grafana/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. 
+# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-grafana/Dockerfile b/apps/temporal-grafana/Dockerfile new file mode 100644 index 0000000000..3500dac11a --- /dev/null +++ b/apps/temporal-grafana/Dockerfile @@ -0,0 +1,71 @@ +# +# Grafana for Temporal stats +# + +FROM gcr.io/mcback/base:latest + +# FIXME +RUN apt-get -y update + +# Install dependencies +RUN \ + apt-get -y --no-install-recommends install \ + libfontconfig1 \ + && \ + true + +# Install Grafana +RUN \ + mkdir -p /opt/grafana/ && \ + /dl_to_stdout.sh "https://dl.grafana.com/oss/release/grafana-7.5.0.linux-amd64.tar.gz" | \ + tar -zx -C /opt/grafana/ --strip 1 && \ + true + +RUN \ + # + # Remove sample provisioning + rm -rf 
/opt/grafana/conf/provisioning/ && \ + # + # Add unprivileged user the service will run as + useradd -ms /bin/bash temporal && \ + mkdir -p \ + /var/lib/grafana/ \ + /var/lib/grafana/logs/ \ + /var/lib/grafana/plugins/ \ + && \ + chown temporal:temporal /var/lib/grafana/ && \ + # + # Create directory for provisioning dashboards + mkdir -p /opt/grafana/dashboards/ && \ + # + true + +COPY provisioning/ /opt/grafana/conf/provisioning/ +COPY dashboards/dashboards/* /opt/grafana/dashboards/ + +# Test if submodules were checked out +RUN \ + if [ ! -f "/opt/grafana/dashboards/temporal.json" ]; then \ + echo && \ + echo "Git submodules haven't been checked out, please run:" && \ + echo && \ + echo " git submodule update --init --recursive" && \ + echo && \ + echo "and then rebuild this image." && \ + echo && \ + exit 1; \ + fi + +WORKDIR /opt/grafana/ + +ENV PATH="/opt/grafana/bin:${PATH}" + +EXPOSE 3000 + +VOLUME /var/lib/grafana/ + +USER temporal + +COPY grafana.ini /opt/grafana/conf/ + +CMD ["grafana-server", "-config", "/opt/grafana/conf/grafana.ini"] diff --git a/apps/temporal-grafana/dashboards b/apps/temporal-grafana/dashboards new file mode 160000 index 0000000000..6094dd666f --- /dev/null +++ b/apps/temporal-grafana/dashboards @@ -0,0 +1 @@ +Subproject commit 6094dd666f386e76a3c03e0049f02521210b6883 diff --git a/apps/temporal-grafana/grafana.ini b/apps/temporal-grafana/grafana.ini new file mode 100644 index 0000000000..fcae717238 --- /dev/null +++ b/apps/temporal-grafana/grafana.ini @@ -0,0 +1,126 @@ +# possible values : production, development +app_mode = production + +# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty +instance_name = temporal-grafana + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) +data = /var/lib/grafana + +# Directory where grafana can store logs +logs = 
/var/lib/grafana/logs + +# Directory where grafana will automatically scan and look for plugins +plugins = /var/lib/grafana/plugins + +#################################### Server #################################### +[server] + +# The http port to use +http_port = 3000 + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. +reporting_enabled = false + +# Set to false to disable all checks to https://grafana.net +# for new versions (grafana itself and plugins), check is used +# in some UI views to notify that grafana or plugin update exists +# This option does not cause any auto updates, nor send any information +# only a GET request to http://grafana.com to get latest versions +check_for_updates = false + +#################################### Security #################################### +[security] +# disable creation of admin user on first start of grafana +disable_initial_admin_creation = true + +# default admin user, created on startup +admin_user = mediacloud + +# default admin password, can be changed before first start of grafana, or in profile settings +admin_password = mediacloud + +# used for signing +# (Media Cloud's Grafana is hosted behind a firewall so this can be anything really) +secret_key = wkKjdjnUL9j27QW4L2w5 + +# disable gravatar profile images +disable_gravatar = true + +# disable protection against brute force login attempts +disable_brute_force_login_protection = true + +#################################### Snapshots ########################### +[snapshots] +# snapshot sharing options +external_enabled = false + +#################################### Dashboards History ################## +[dashboards] + +# Path to the default 
home dashboard. If this value is empty, then Grafana uses StaticRootPath + "dashboards/home.json" +default_home_dashboard_path = dashboards/temporal.json + +#################################### Users ############################### +[users] +# disable user signup / registration +allow_sign_up = false + +# Allow non admin users to create organizations +allow_org_create = false + +# Background text for the user field on the login page +login_hint = mediacloud +password_hint = mediacloud + +# Default UI theme ("dark" or "light") +default_theme = light + +[auth] + +# Set to true to disable the signout link in the side menu. useful if you use auth.proxy, defaults to false +disable_signout_menu = true + +#################################### Anonymous Auth ###################### +[auth.anonymous] +# enable anonymous access +enabled = true + +# specify organization name that should be used for unauthenticated users +org_name = Media Cloud + +# specify role for unauthenticated users +org_role = Viewer + +#################################### Logging ########################## +[log] +# Either "console", "file", "syslog". Default is console and file +# Use space to separate multiple modes, e.g. "console file" +mode = console + +#################################### Alerting ############################ +[alerting] +# Disable alerting engine & UI features +enabled = false + +#################################### Annotations ######################### +[annotations] +# Configures the batch size for the annotation clean-up job. This setting is used for dashboard, API, and alert annotations. +;cleanupjob_batchsize = 100 + +#################################### Explore ############################# +[explore] +# Enable the Explore section +enabled = true + +[date_formats] + +# Default timezone for user preferences. Options are 'browser' for the browser local timezone or a timezone name from IANA Time Zone database, e.g. 'UTC' or 'Europe/Amsterdam' etc. 
+default_timezone = 'America/New_York' diff --git a/apps/temporal-grafana/provisioning/dashboards/temporal.yml b/apps/temporal-grafana/provisioning/dashboards/temporal.yml new file mode 100644 index 0000000000..5fbf68b99b --- /dev/null +++ b/apps/temporal-grafana/provisioning/dashboards/temporal.yml @@ -0,0 +1,10 @@ +apiVersion: 1 +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: true + editable: false + options: + path: /opt/grafana/dashboards/ diff --git a/apps/temporal-grafana/provisioning/datasources/temporal.yml b/apps/temporal-grafana/provisioning/datasources/temporal.yml new file mode 100644 index 0000000000..9722c5904c --- /dev/null +++ b/apps/temporal-grafana/provisioning/datasources/temporal.yml @@ -0,0 +1,7 @@ +apiVersion: 1 +datasources: + - name: TemporalMetrics + type: prometheus + url: http://temporal-prometheus:9090 + access: proxy + isDefault: true diff --git a/apps/temporal-grafana/provisioning/notifiers/.empty_dir b/apps/temporal-grafana/provisioning/notifiers/.empty_dir new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/temporal-grafana/provisioning/plugins/.empty_dir b/apps/temporal-grafana/provisioning/plugins/.empty_dir new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/temporal-prometheus/.dockerignore b/apps/temporal-prometheus/.dockerignore new file mode 100644 index 0000000000..9b2c362a80 --- /dev/null +++ b/apps/temporal-prometheus/.dockerignore @@ -0,0 +1,92 @@ +# +# Files from the build context to be ignored by "docker build". +# +# You might want to add as many of constantly changing files here as possible +# to prevent container's image from getting rebuilt every full moon. +# +# Unfortunately, we can't just symlink this file to every app's directory: +# +# https://github.com/moby/moby/issues/12886 +# +# so for the time being you have to manually copy this file to every app +# subdirectory: +# +# cd apps/ +# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; +# + +*$py.class +*.cover +*.DS_Store +*.egg +*.egg-info/ +*.log +*.manifest +*.mo +*.pot +*.py[cod] +*.sage.py +*.so +*.spec +*.swp +*/*.py[cod] +*/*.swp +*/*/*.py[cod] +*/*/*.swp +*/*/*/*.py[cod] +*/*/*/*.swp +*/*/*/__pycache__/ +*/*/__pycache__/ +*/__pycache__/ +._* +.apdisk +.AppleDB +.AppleDesktop +.AppleDouble +.cache +.com.apple.timemachine.donotpresent +.coverage +.coverage.* +.dockerignore +.DocumentRevisions-V100 +.DS_Store +.eggs +.env +.fseventsd +.git +.gitignore +.hypothesis +.idea +.installed.cfg +.ipynb_checkpoints +.LSOverride +.mypy_cache +.pytest_cache +.Python +.python-version +.ropeproject +.scrapy +.Spotlight-V100 +.spyderproject +.spyproject +.TemporaryItems +.tox +.Trashes +.venv +.VolumeIcon.icns +.webassets-cache +__pycache__ +celerybeat-schedule +coverage.xml +Icon +local_settings.py +Network Trash Folder +nosetests.xml +parts +pip-delete-this-directory.txt +pip-log.txt +sdist +Temporary Items +wheels +_Inline + diff --git a/apps/temporal-prometheus/Dockerfile b/apps/temporal-prometheus/Dockerfile new file mode 100644 index 0000000000..46a2fee5b0 --- /dev/null +++ b/apps/temporal-prometheus/Dockerfile @@ -0,0 +1,32 @@ +# +# Prometheus for Temporal stats +# + +FROM gcr.io/mcback/base:latest + +RUN \ + mkdir -p /opt/prometheus/ && \ + /dl_to_stdout.sh "https://github.com/prometheus/prometheus/releases/download/v2.25.2/prometheus-2.25.2.linux-amd64.tar.gz" | \ + tar -zx -C /opt/prometheus/ --strip 1 && \ + true + +COPY prometheus.yml /opt/prometheus/ + +# Add unprivileged user the service will run as +RUN \ + useradd -ms /bin/bash temporal && \ + mkdir -p /opt/prometheus/data/ && \ + chown temporal:temporal /opt/prometheus/data/ && \ + true + +WORKDIR /opt/prometheus/ + +ENV PATH="/opt/prometheus:${PATH}" + +EXPOSE 9090 + +USER temporal + +VOLUME /opt/prometheus/data/ + +CMD ["prometheus"] diff --git a/apps/temporal-prometheus/prometheus.yml 
b/apps/temporal-prometheus/prometheus.yml new file mode 100644 index 0000000000..1ca6e96f22 --- /dev/null +++ b/apps/temporal-prometheus/prometheus.yml @@ -0,0 +1,14 @@ +global: + scrape_interval: 5s + external_labels: + monitor: 'temporal-monitor' + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: + - 'localhost:9090' + - 'temporal-server:8000' + - 'temporal-server:8001' + - 'temporal-server:8002' + - 'temporal-server:8003' diff --git a/apps/temporal-server/Dockerfile b/apps/temporal-server/Dockerfile index f5944725da..448d7ff5da 100644 --- a/apps/temporal-server/Dockerfile +++ b/apps/temporal-server/Dockerfile @@ -69,8 +69,11 @@ ENV PATH="/opt/temporal-server/bin:${PATH}" \ # Archives VOLUME /var/lib/temporal/ -# Port descriptions: https://docs.temporal.io/docs/server-architecture/ -EXPOSE 6933 6934 6935 6939 7233 7234 7235 7239 +EXPOSE \ + # Port descriptions: https://docs.temporal.io/docs/server-architecture/ + 6933 6934 6935 6939 7233 7234 7235 7239 \ + # Prometheus endpoints + 8000 8001 8002 8003 USER temporal diff --git a/apps/temporal-server/config/mediacloud_template.yaml b/apps/temporal-server/config/mediacloud_template.yaml index e4295dc00c..2f36bd1621 100644 --- a/apps/temporal-server/config/mediacloud_template.yaml +++ b/apps/temporal-server/config/mediacloud_template.yaml @@ -110,41 +110,46 @@ global: rootCaData: - "" - # FIXME collect statistics with either statsd or prometheus: - # metrics: - # statsd: - # hostPort: "temporal-statsd:8125" - # prefix: "temporal" - - # metrics: - # prometheus: - # timerType: "histogram" - # listenAddress: "temporal-prometheus:9090" - services: frontend: rpc: grpcPort: 7233 membershipPort: 6933 bindOnIP: "${MC_TEMPORAL_HOST_IP}" + metrics: + prometheus: + timerType: "histogram" + listenAddress: "temporal-prometheus:8000" matching: rpc: grpcPort: 7235 membershipPort: 6935 bindOnIP: "${MC_TEMPORAL_HOST_IP}" + metrics: + prometheus: + timerType: "histogram" + listenAddress: 
"temporal-prometheus:8001" history: rpc: grpcPort: 7234 membershipPort: 6934 bindOnIP: "${MC_TEMPORAL_HOST_IP}" + metrics: + prometheus: + timerType: "histogram" + listenAddress: "temporal-prometheus:8002" worker: rpc: grpcPort: 7239 membershipPort: 6939 bindOnIP: "${MC_TEMPORAL_HOST_IP}" + metrics: + prometheus: + timerType: "histogram" + listenAddress: "temporal-prometheus:8003" clusterMetadata: enableGlobalNamespace: false diff --git a/apps/temporal-server/docker-compose.tests.yml b/apps/temporal-server/docker-compose.tests.yml index 1ab02ce88f..3704a92302 100644 --- a/apps/temporal-server/docker-compose.tests.yml +++ b/apps/temporal-server/docker-compose.tests.yml @@ -27,6 +27,7 @@ services: depends_on: - temporal-postgresql - temporal-elasticsearch + - temporal-prometheus networks: default: aliases: @@ -40,6 +41,10 @@ services: - 7234 - 7235 - 7239 + - 8000 + - 8001 + - 8002 + - 8003 ports: # Expose to host for debugging - "6933:6933" @@ -50,6 +55,10 @@ services: - "7234:7234" - "7235:7235" - "7239:7239" + - "8000:8000" + - "8001:8001" + - "8002:8002" + - "8003:8003" volumes: - type: bind source: ./bin/ @@ -104,6 +113,29 @@ services: limits: cpus: "2" memory: "2G" + + temporal-prometheus: + image: gcr.io/mcback/temporal-prometheus:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - "9090" + ports: + # Expose to host for debugging + - "9090:9090" + volumes: + - type: bind + source: ./../temporal-prometheus/prometheus.yml + target: /opt/prometheus/prometheus.yml + # Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "2" + memory: "2G" + temporal-webapp: image: gcr.io/mcback/temporal-webapp:latest init: true From 0ea558346971c52f16d8bc86216fba61ce3bb555 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 31 Mar 2021 19:26:21 +0000 Subject: [PATCH 040/175] Bump lxml from 4.6.2 to 4.6.3 in /apps/common/src Bumps 
[lxml](https://github.com/lxml/lxml) from 4.6.2 to 4.6.3. - [Release notes](https://github.com/lxml/lxml/releases) - [Changelog](https://github.com/lxml/lxml/blob/master/CHANGES.txt) - [Commits](https://github.com/lxml/lxml/compare/lxml-4.6.2...lxml-4.6.3) Signed-off-by: dependabot[bot] --- apps/common/src/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/common/src/requirements.txt b/apps/common/src/requirements.txt index 96c3385b19..03aa36f4ad 100644 --- a/apps/common/src/requirements.txt +++ b/apps/common/src/requirements.txt @@ -51,7 +51,7 @@ Jinja2==2.11.2 kombu==4.6.11 # XML manipulations, HTML parsing -lxml==4.6.2 +lxml==4.6.3 # Japanese language tokenizer, stemmer, etc. mecab-python3==1.0.3 From ea27d3e3d9c13909cfd9ebd5939fd146e4879771 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 1 Apr 2021 15:02:56 -0400 Subject: [PATCH 041/175] update stopwords lists --- .../mediawords/languages/ca/ca_stop_words.txt | 202 +- .../mediawords/languages/da/da_stop_words.txt | 90 +- .../mediawords/languages/de/de_stop_words.txt | 386 +- .../mediawords/languages/en/en_stop_words.txt | 4293 ++--------------- .../mediawords/languages/es/es_stop_words.txt | 464 +- .../mediawords/languages/fi/fi_stop_words.txt | 570 ++- .../mediawords/languages/fr/fr_stop_words.txt | 540 ++- .../mediawords/languages/ha/ha_stop_words.txt | 1 + .../mediawords/languages/hi/hi_stop_words.txt | 343 +- .../mediawords/languages/hu/hu_stop_words.txt | 670 ++- .../mediawords/languages/it/it_stop_words.txt | 361 +- .../mediawords/languages/ja/ja_stop_words.txt | 71 +- .../mediawords/languages/lt/lt_stop_words.txt | 419 +- .../mediawords/languages/nl/nl_stop_words.txt | 319 +- .../mediawords/languages/no/no_stop_words.txt | 79 +- .../mediawords/languages/pt/pt_stop_words.txt | 3248 +------------ .../mediawords/languages/ro/ro_stop_words.txt | 1087 +++-- .../mediawords/languages/ru/ru_stop_words.txt | 1380 +++--- .../mediawords/languages/sv/sv_stop_words.txt | 281 +- 
.../mediawords/languages/tr/tr_stop_words.txt | 352 +- .../mediawords/languages/zh/zh_stop_words.txt | 339 +- 21 files changed, 6525 insertions(+), 8970 deletions(-) mode change 100755 => 100644 apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt mode change 100755 => 100644 apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt mode change 100755 => 100644 apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt diff --git a/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt b/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt index eaf6168385..8aa8988fd8 100644 --- a/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt @@ -1,10 +1,8 @@ -# -# This is a stop word list for the Catalan language. -# -# Sources: -# https://raw.githubusercontent.com/stopwords-iso/stopwords-ca/master/stopwords-ca.txt +# # Sources: # http://latel.upf.edu/morgana/altres/pub/ca_stop.htm +# https://raw.githubusercontent.com/stopwords-iso/stopwords-ca/master/stopwords-ca.txt # https://www.ranks.nl/stopwords/catalan +# (Lightly edited to remove words in the original lists that are actually meaningful) # a @@ -12,10 +10,12 @@ abans abans-d'ahir abintestat ací -adesiara +açò adàgio adés +adesiara adéu +àdhuc ah ahir ai @@ -28,13 +28,15 @@ aixà així això al +alça aleshores +algú algun alguna algunes alguns -algú alhora +àlies allà allèn allí @@ -48,13 +50,12 @@ altres altresí altri al·legro -alça amargament amb -ambdues ambdós -amunt +ambdues amén +amunt anar anc andante @@ -73,11 +74,11 @@ aquell aquella aquelles aquells +aquèn aquest aquesta aquestes aquests -aquèn aquí ara arran @@ -92,7 +93,6 @@ avall avant aviat avui -açò bah baix baldament @@ -100,17 +100,18 @@ ballmanetes banzim-banzam bastant bastants +bé ben bis bitllo-bitllo bo -bé ca +ça cada +cadascú cadascuna cadascunes cadascuns -cadascú cal cap car @@ -126,10 +127,11 @@ certes certs cinc 
cinquanta +cinquè cinquena cinquenes cinquens -cinquè +ço com comsevulla consegueixo @@ -147,10 +149,10 @@ d'un d'una d'unes d'uns -daixonses daixò -dallonses +daixonses dallò +dallonses dalt daltabaix damunt @@ -160,6 +162,7 @@ davall davant de debades +deçà dedins defora dejorn @@ -167,12 +170,13 @@ dejús del dellà dels -dementre -dempeus demà +dementre demés +dempeus des des de +desè desena desenes desens @@ -180,11 +184,9 @@ després dessobre dessota dessús -desè deu devers devora -deçà diferents dinou dins @@ -217,6 +219,7 @@ emperò en enans enant +ençà encara encontinent endalt @@ -224,9 +227,9 @@ endarrera endarrere endavant endebades -endemig endemà endemés +endemig endins endintre enfora @@ -235,8 +238,8 @@ enguany enguanyasses enjús enlaire -enlloc enllà +enlloc enrera enrere ens @@ -250,65 +253,69 @@ entretant entrò envers envides -environs enviró -ençà +environs ep era erem +érem eren eres +éreu ergo es +és escar essent +éssent esser +ésser est esta +està estada estades estan estant estar +estarà estaran +estaràs +estaré estarem estareu estaria +estaríem estarien estaries -estarà -estaràs -estaré -estaríem estaríeu +estàs estat estats estava +estàvem estaven estaves +estàveu estem estes esteu estic +estigué estiguem +estiguérem estigueren estigueres +estiguéreu estigues +estigués estiguessis estigueu estigui +estiguí estiguin estiguis -estigué -estiguérem -estiguéreu -estigués -estiguí estos -està -estàs -estàvem -estàveu et etc etcètera @@ -325,20 +332,20 @@ feu fi fins fora +fóra +força +fórem foren fores -força +fóreu fos +fóssim fossin fossis +fóssiu fou fra fui -fóra -fórem -fóreu -fóssim -fóssiu gaire gairebé gaires @@ -347,45 +354,47 @@ girientorn gratis ha hagi +hàgim hagin hagis +hàgiu haguda hagudes -hagueren -hagueres -haguessin -haguessis -hagut -haguts hagué haguérem +hagueren +hagueres haguéreu hagués haguéssim +haguessin +haguessis haguéssiu haguí +hagut +haguts hala han has +haurà hauran +hauràs +hauré haurem haureu hauria +hauríem 
haurien hauries -haurà -hauràs -hauré -hauríem hauríeu havem havent haver haveu havia +havíem havien havies -havíem havíeu he hem @@ -394,13 +403,12 @@ hi ho hom hui -hàgim -hàgiu i +ídem igual iguals -inclusive inclòs +inclusive ja jamai jo @@ -444,6 +452,7 @@ mentre mentrestant menys mes +més meu meua meues @@ -466,7 +475,6 @@ molts mon mons mos -més n n'he n'hi @@ -479,35 +487,37 @@ nogensmenys només noranta nos +nós +nòs nosaltres nostra nostre nostres nou +novè novena novenes novens -novè ns -nòs -nós o oh oi oidà +òlim on onsevulga onsevulla onze pas +pàssim pel pels pengim-penjam per per que +però perquè pertot -però piano pla poc @@ -532,7 +542,6 @@ prou puc puix pus -pàssim qual quals qualsevol @@ -552,6 +561,7 @@ quarts quasi quatre que +què quelcom qui quin @@ -560,7 +570,6 @@ quines quins quinze quisvulla -què ran re rebé @@ -596,25 +605,25 @@ sengles sens sense ser +serà seran +seràs +seré serem sereu seria +seríem serien series -serà -seràs -seré -seríem seríeu ses set setanta +setè setena setenes setens setze -setè seu seua seues @@ -622,6 +631,7 @@ seus seva seves si +sí sia siau sic @@ -633,13 +643,14 @@ siguin siguis sinó sis +sisè sisena sisenes sisens -sisè sobre sobretot soc +sóc sol sola solament @@ -647,6 +658,7 @@ soles sols som son +són sons sos sota @@ -654,9 +666,6 @@ sots sou sovint suara -sí -sóc -són t t'ha t'han @@ -712,14 +721,23 @@ u uf ui uix +últim +última +últimes +últims ultra un una unes +únic +única +únics +úniques uns up upa us +ús va vagi vagin @@ -728,54 +746,34 @@ vaig vair vam van +vàreig +vàrem vares +vàreu vas vau vem verbigràcia vers +vés vet veu vint vora vos +vós vosaltres +vostè +vostès vostra vostre vostres -vostè -vostès vuit vuitanta +vuitè vuitena vuitenes vuitens -vuitè -vàreig -vàrem -vàreu -vés -vós xano-xano xau-xau xec -àdhuc -àlies -ça -ço -érem -éreu -és -éssent -ésser -ídem -òlim -últim -última -últimes -últims -únic -única -únics -úniques -ús diff --git 
a/apps/common/src/python/mediawords/languages/da/da_stop_words.txt b/apps/common/src/python/mediawords/languages/da/da_stop_words.txt index 220a35602a..adea1f4031 100644 --- a/apps/common/src/python/mediawords/languages/da/da_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/da/da_stop_words.txt @@ -1,16 +1,21 @@ -# -# This is a stop word list for the Danish language. -# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-da/blob/master/stopwords-da.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) # ad af +aldrig alle alt anden +andet +andre at +bare +begge blev blive bliver @@ -25,77 +30,148 @@ det dette dig din +dine disse +dit dog du efter +ej eller en end +ene +eneste +enhver er et +få +far +får +fem +fik +fire +flere +fleste for +før +fordi +forrige fra +god +godt ham han hans har havde have +hej +helt hende hendes her hos hun hvad +hvem +hver +hvilken hvis hvor +hvordan +hvorfor +hvornår i ikke ind +ingen +intet +ja jeg jer +jeres jo +kan +kom +komme +kommer +kun kunne +lad +lav +lidt +lige +lille +må man +mand mange med meget men +mens +mere mig min mine mit mod +når +nær +næste +næsten ned +nej +ni +nogen noget nogle nu -når +ny +nyt og også +okay om op os +otte over på +så +sådan +se +seks selv +ser +ses sig +sige sin sine sit skal skulle som -sådan +stor +store +syv +tag +tage thi +ti til +to +tre ud under var +være +været +ved vi vil ville vor -være -været +vores \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/de/de_stop_words.txt b/apps/common/src/python/mediawords/languages/de/de_stop_words.txt index aad240c48c..ec3f32fd30 100644 --- a/apps/common/src/python/mediawords/languages/de/de_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/de/de_stop_words.txt @@ -1,16 +1,28 @@ -# -# This is a stop word list for the German language. 
-# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-de/blob/master/stopwords-de.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) # +a +ab aber +ach +acht +achte +achten +achter +achtes +ag alle +allein allem allen aller +allerdings alles +allgemeinen als also am @@ -25,19 +37,65 @@ anderm andern anderr anders +au auch auf aus +ausser +ausserdem +außer +außerdem +b +bald bei +beide +beiden +beim +beispiel +bekannt +bereits +besonders +besser +besten bin bis +bisher bist +c +d +d.h da +dabei +dadurch +dafür +dagegen +daher +dahin +dahinter +damals damit +danach +daneben +dank dann +daran +darauf +daraus +darf +darfst +darin +darüber +darum +darunter das +dasein +daselbst +dass dasselbe +davon +davor dazu +dazwischen daß dein deine @@ -46,19 +104,34 @@ deinen deiner deines dem +dementsprechend +demgegenüber +demgemäss +demgemäß demselben +demzufolge den +denen denn denselben der +deren derer +derjenige +derjenigen +dermassen +dermaßen derselbe derselben des +deshalb desselben dessen +deswegen dich die +diejenige +diejenigen dies diese dieselbe @@ -70,9 +143,32 @@ dieses dir doch dort +drei +drin +dritte +dritten +dritter +drittes du durch +durchaus +dürfen +dürft +durfte +durften +e +eben +ebenso +ehrlich +ei +ei, +eigen +eigene +eigenen +eigener +eigenes ein +einander eine einem einen @@ -85,8 +181,21 @@ einigen einiger einiges einmal +eins +elf +en +ende +endlich +entweder er +ernst +erst +erste +ersten +erster +erstes es +etwa etwas euch euer @@ -95,18 +204,75 @@ eurem euren eurer eures +f +folgende +früher +fünf +fünfte +fünften +fünfter +fünftes für +g +gab +ganz +ganze +ganzen +ganzer +ganzes +gar +gedurft gegen +gegenüber +gehabt +gehen +geht +gekannt +gekonnt +gemacht +gemocht +gemusst +genug +gerade +gern +gesagt +geschweige gewesen +gewollt +geworden +gibt +ging +gross +grosse +grosser +grosses +große +großer +großes +gut +guter +gutes +h hab habe 
haben +habt +hast hat hatte +hätte hatten +hätten +hattest +hattet +heisst +her +heute hier hin hinter +hoch +i ich ihm ihn @@ -118,78 +284,219 @@ ihren ihrer ihres im +immer in indem +infolgedessen ins +irgend ist +j +ja +jahr +jahre +jahren +je jede jedem jeden jeder +jedermann +jedermanns jedes +jedoch +jemand +jemandem +jemanden jene jenem jenen jener jenes jetzt +k +kam kann +kannst +kaum kein keine keinem keinen keiner keines +kleine +kleinen +kleiner +kleines +kommen +kommt können +könnt +konnte könnte +konnten +kurz +l +lang +lange +leicht +leide +lieber +los +m machen +macht +machte +mag +magst +mahn +mal man manche manchem manchen mancher manches +mann +mehr mein meine meinem meinen meiner meines +mensch +menschen mich mir mit +mittel +mochte +möchte +mochten +mögen +möglich +mögt +morgen muss +müssen +musst +müsst musste +mussten +muß +mußt +müßt +n +na nach +nachdem +nahm +natürlich +neben +nein +neue +neuen +neun +neunte +neunten +neunter +neuntes nicht nichts +nie +niemand +niemandem +niemanden noch nun nur +o ob +oben oder +offen +oft ohne +p +q +r +recht +rechte +rechten +rechter +rechtes +s +sa +sache +sagt +sagte +sah +schlecht +schluss +sechs +sechste +sechsten +sechster +sechstes sehr +sei +seid +seien sein seine seinem seinen seiner seines +seit +seitdem selbst sich sie +sieben +siebente +siebenten +siebenter +siebentes sind so +solang solche solchem solchen solcher solches soll +sollen +sollst +sollt sollte +sollten sondern sonst +soweit +sowie +später +startseite +statt +steht +suche +t +tag +tage +tagen +tat +teil +tel +tritt +trotzdem +tun +u +über +überhaupt +übrigens um und uns @@ -197,42 +504,111 @@ unse unsem unsen unser +unsere +unserer unses unter +v +vergangenen viel +viele +vielem +vielen +vielleicht +vier +vierte +vierten +vierter +viertes vom von vor +w +wahr +während +währenddem +währenddessen +wann war +wäre waren warst +wart +warum was weg +wegen weil +weit weiter +weitere +weiteren +weiteres welche welchem welchen welcher 
welches +wem +wen +wenig +wenige +weniger +weniges +wenigstens wenn +wer werde werden +werdet +weshalb +wessen wie wieder +wieso will +willst wir wird +wirklich wirst wo +woher +wohin +wohl wollen +wollt wollte -während +wollten +worden +wurde würde +wurden würden +x +y +z +z.b +zehn +zehnte +zehnten +zehnter +zehntes +zeit zu +zuerst +zugleich zum +zunächst zur +zurück +zusammen +zwanzig zwar +zwei +zweite +zweiten +zweiter +zweites zwischen -über +zwölf diff --git a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt index eec3311701..742cb22dbd 100644 --- a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt @@ -1,141 +1,172 @@ -# # This is a "long" stop word list for the English language. # # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# http://www.lextek.com/manuals/onix/stopwords1.html +# http://xpo6.com/list-of-english-stop-words/ +# https://countwordsfree.com/stopwords +# https://gist.github.com/sebleier/554280 (NLTK stop words) # https://github.com/arc12/Text-Mining-Weak-Signals/wiki/Standard-set-of-english-stopwords +# https://github.com/berkmancenter/mediacloud-sentence-splitter/blob/develop/sentence_splitter/non_breaking_prefixes/en.txt +# https://github.com/stopwords-iso/stopwords-en/blob/master/stopwords-en.txt # https://www.link-assistant.com/seo-stop-words.html -# some English non-breaking prefixes +# https://www.ranks.nl/stopwords # +# (Lightly edited to remove words in the original lists that are actually meaningful) +'ll +'tis +'twas +'ve +10 +39 +A +Adj +Adm +Adv +Apr +Art +Asst +Aug +B +Bart +Bldg +Brig +Bros +C +Capt +Cmdr +Co +Col +Comdr +Con +Corp +Cpl +D +DR +Dec +Dr +Drs +E +Ens +F +Feb +Fig +G +Gen +Gov +H +Hon +Hosp +Hr +I +I'd +I'll +I'm +I've +Inc +Insp +J +Jan +Jr +Jul +Jun +K +L +Lt +M +MM +MR +MRS +MS +Maj +Mar +Messrs +Mlle +Mme +Mr +Mrs +Ms 
+Msgr +N +No +Nos +Nov +Nr +O +Oct +Okt +Op +Ord +P +Pfc +Ph +Ph.D +PhD +Prof +Pvt +Q +R +Rep +Reps +Res +Rev +Rt +S +Sen +Sens +Sep +Sept +Sfc +Sgt +Sr +St +Supt +Surg +T +U +V +W +X +Y +Z a a's -abandoned -abbr -ability able -aboard +ableabout about above -abroad -absence -absent -absolute -absolutely -absorbed -abstract -academic -accept -acceptable -acceptance -accepted -accepting -access -accident -accompanied -accomplish -accomplished +abst accordance according accordingly -account -accounts -accuracy -accurate -accurately -accused -achieve -achieved -achievement -achievements -acquire -acquired across act -acting -action -actions -active -activities -activity -actor -acts -actual actually ad -add added -adding -addition -additional -address -addressed -addresses -adequate adj -adjusted -adjustment -adjustments -adm -administration -admission -admit -admitted adopted -adult -adults -adv -advance -advanced -advantage -advantages -advertising -advice -advised -aesthetic -affair -affairs -affect +ae +af affected +affecting affects -afford -afraid after -afternoon afterwards +ag again against -age -agencies -agency -agent -agents -ages ago -agree -agreed +ah ahead -aid -aids -aim -aimed +ai ain't -air +aint al -alert -alienation -align -alike -alive all -alliance -allied -allies -allotment allow -allowances -allowed -allowing allows almost alone @@ -143,1257 +174,257 @@ along alongside already also -altered -alternative although -altogether always am -amazing -ambiguous -amendment amid amidst among amongst +amoungst amount -amounts an -analysis -ancient and -anger -angle -angry -animal -animals -anniversary -announced -announcement -annual -anode +announce another -answer -answered -answers -anti-Semitism -anti-trust -anticipated -anticipation -anxiety -anxious any anybody anyhow +anymore anyone anything anyway anyways anywhere +ao apart -apartment -apparatus -apparent apparently -appeal appear -appearance -appeared -appears -apple -application -applications 
-applied -apply -applying -appointed -appointment -appreciate -appreciation -approach -approached -approaches -approaching -appropriate -approval -approved approximately -apr -april -arbitrary -arc -architect +aq +ar are area areas aren aren't -argue -argued -argument +arent arise -arm around -aroused -arrange -arranged -arrangement -arrangements -arrest -arrested -arrival -arrive -arrived -art -artery -article -articles -artist -artistic -artists -arts +arpa as aside ask asked asking asks -asleep -aspect -aspects -assembled -assessment -assessors -assigned -assignment -assist -assistance -assistant -associate associated -association -asst -assume -assumed -assumption -assumptions -assure -assured -astronomy at -atmosphere -atom -atomic -atoms -attached -attack -attacked -attacks -attain -attempt -attempted -attempting -attempts -attend -attended -attending -attention -attitude -attitudes -attorney -attract -attracted -attractive -audience -aug -august -aunt -authentic -author -authorities -authority -authorized -authors -auto -automatic -automatically -automobile -automobiles -autumn -availability +au +auth available -average -avoid -avoided -awake -award -aware -awareness +aw away awfully -axis +az b -baby +ba back backed -background backing backs backward backwards -bad -badly -bag -balance -balanced -ball -band -bank -banks -bar -bare -barely -barn -barrel -bars -bart -base -baseball -based -basement -bases -basic -basically -basis -bat -bath -battle -bay +bb +bd be -beach -bear -beard -bearing -beat -beautiful -beauty became because become becomes becoming -bed -bedroom -beef been -beer before beforehand began begin beginning +beginnings begins -begun -behalf -behavior behind being beings -belief -beliefs believe -believed -believes -bell -belly -belong -belongs below -bench -beneath -benefit -benefits -bent beside besides best -bet better between beyond -bgcolor -bid +bf +bg +bh +bi big -bigger -biggest bill billion -bills -binding -binomial -biological -bird 
-birds -birth -bit -bitter -black -blame -blanket -bldg -blind -block -blockquote -blocks -blog -blonde -blood -blow -blue -blues -board -boards -boat -boating -boats -bod -bodies -body -bold -bomb -bombs -bond -bonds -bone -bones -book -books -border -bore -born -boss +biol +bj +bm +bn +bo both -bother -bottle bottom -bought -bound -box -boy -boys br -branch -branches -brave -bread -break -breakfast -breaking -breath -breathing -brick -bride -bridge -bridges brief briefly -brig -bright -brilliant -bring -bringing -brings -broad -broke -broken -bronchial -bros -brother -brought -brown -browser -brush -brushed -budget -build -builder -building -buildings -built -bullet -bullets -bundle -burden -bureau -burn -burned -burning -burns -burst -bus -business -businesses -busy +bs +bt but -butter buy -buying +bv +bw by +bz c c'mon c's -cafe -calculated -calendar +ca call -called -calls -calm came -camera -camp can can't -candidate -candidates cannot cant -cap -capabilities -capable -capacity -capital -capt -captain caption -car -carbon -card -care -career -careful -carefully -carried -carries -carry -carrying -cars case cases -cash -cast -casual -cat -catch -categories -category -cattle -caught cause -caused causes -cdt -ceiling -cell -cellar -cells -cent -center -centers -central -cents -centuries -century +cc +cd certain certainly -certainty cf -chain -chair -chairman -chairs -challenge -champion -chance -chances -change -changed +cg +ch changes -changing -channels -chapel -char -character -characteristic -characteristics -characterized -characters -charge -charged -charges -charm -charming -charoff -chart -charter -cheap -check -checked -cheek -chemical -chest -chick -chicken -chief -chiefly -chlorine -choice -cholesterol -choose -chord -chose -chosen -church -churches -cigarette -circle -circles -circular -circumstances -cite -cited -cities -citizen -citizens -city -civic -civil -civilian -civilization -claim -claimed -claims -clarity -class -classes -classic 
-classical -classification -clean -cleaning +ci +ck +cl clear -cleared clearly -clerk click -climb -climbed -clinical -clock -close -closed -closely -closer -closing -cloth -clothe -clothes -clothing -cloud -clouds -club -cmdr +cm +cmon +cn co co. -coach -coast -coat -coating -cocktail -code -coffee -col -cold -colleagues -collect -collected -collection -collective -colonel -colonial -colony -color -colored -colorful -colors -column -columns com -combat -combination -combined -comdr come -comedy comes -comfort -comfortable -coming -command -commander -comment -comments -commerce -commercial -commissioner -committed -committee -commodities -common -commonly -communication -communications -communism -communities -community -companies -companion -company -comparable -compare -compared -comparison -compete -competition -competitive -complained -complement -complete -completed -completely -completion -complex -complicated -component -components -composed -composer -composition -compromise -computed +computer con -conceived -concentrated -concentration -concept -conception -concepts -concern -concerned concerning -concerns -concert -concerts -concluded -conclusion -conclusions -concrete -condemned -condition -conditioned -conditions -conduct -conducted -conductor -conference -conferences -confidence -confirmed -conflict -confronted -confused -confusion -congressional -connect -connected -connection -conscience -conscious -consciousness -consequence -consequences consequently consider -considerable -considerably -consideration -considerations -considered considering -consisted -consistent -consistently -consisting -consists -consonant -conspiracy -constant -constantly -constitute -constitutional -constructed -construction -consumer -contact -contacts contain -contained containing contains -contemporary -content -contest -context -continent -continually -continue -continued -continues -continuing -continuity -continuous -continuously -contract -contracts -contrary 
-contrast -contribute -contributed -contributions -control -controlled -controlling -controls -controversy -convenience -convenient -conventional -conversation -conversion -converted -conviction -convictions -convinced -cook -cooking -cool -cooling -cooperative -cope copy -core -corn -corner -corp -correct -correspondence corresponding -cost -costs -cottage -cotton could +could've +couldn couldn't -council -count -counter -counties -countries -country -county -couple -courage +couldnt course -courses -court -courts -cousin -cover -coverage -covered -covering -covers -cow -cpl -crack -craft -crash -crawled -crazy -cream -crease -create -created -creating -creation -creative -creatures -credit -crew -critic -critical -criticism -critics -crop -cross -crossed -crossing -crowd -crowded -crown -crucial +cr cry -cst -cultural -culture -cure -curiosity -curious -current +cs +cu currently -curt -curve -customer -customers -cut -cuts -cutting -cycle +cv +cx +cy +cz d -D -dad -daily -damage -damn -dance -dancer -dancers -dances -dancing -danger -dangerous dare daren't -dark -darkness -data +darent date -dates -datetime -daughter -dawn -day -days -dead -deal -dealer -dealers -dealing -dealt +de dear -death -debate -dec -decade -decades -december -decent -decide -decided -decimal -decision -decisions -deck -declaration -declared -decline -dedicated -dedication -deep -deeper -deeply -defeat -defend -defense -define -defined -definite definitely -definition -degree -degrees -del -delay -delayed -deliberately -delicate -delight -delightful -delivered -delivery -demand -demanded -demanding -demands -democracy -demonstrate -demonstrated -demonstration -denied -density -deny -department -departments -depend -dependent -depending -depends -depression -depth -derived describe described -describes -description -desegregation -desert -design -designed -designs -desirable -desire -desired -desires -desk -despair -desperate -desperately despite -destiny -destroy -destroyed -destruction 
-destructive detail -detailed -details -detergent -determination -determine -determined -determining -develop -developed -developing -development -developments -device -devices -devil -devoted -diameter -dictionary did didn didn't -die -died -diet +didnt differ -difference -differences different differently -difficult -difficulties -difficulty -diffusion -dignity -dilemma -dimensions -dining -dinner -diplomatic -dir -direct -directed -direction -directions directly -director -directors -dirt -dirty -disappeared -disaster -discharge -discipline -discuss -discussed -discussion -discussions -disease -dishes -disk -displacement -display -displayed -displays -disposal -dispute -distance -distant -distinct -distinction -distinctive -distinguished -distributed -distribution -districts -disturbed -div -dive -divide -divided -divine -division -divorce +dj +dk +dm do -doctor -doctors -doctrine -documents does doesn doesn't -dog -dogs +doesnt doing -dollar -dollars -domestic -dominant -dominated don don't done -door -doors -double -doubt +dont doubtful down downed downing downs -downtown downwards -dozen -dr -draft -drama -dramatic -drank -draw -drawing -drawings -drawn -dream -dreamed -dreams -dress -dressed -dressing -drew -dried -drill -drink -drinking -drinks -drive -driven -driver -drivers -driving -drop -dropped -drove -drs -drug -drugs -drunk -dry -drying -duck due -dull during -dust -duties -duty -dying -dynamic +dz e +e.g each -eager -ear -earlier -earliest early -earnings -ears -earth -ease -easier -easily -east -easy -eat -eating -ecumenical -edge -edges -edition -editor -editorial -edt +ec +ed edu +ee effect -effective -effectively -effectiveness -effects -efficiency -efficient -effort -efforts eg -egg -eggs +eh eight -eighteenth -eighth eighty either -elaborate -elected -electric -electrical -electricity -electron -electronic -electronics -element -elements eleven -eliminate -eliminated else elsewhere -em -email -emerged -emergency -emission -emphasis -emphasize 
-empirical -employed -employee -employees -employment empty -enable -encounter -encountered -encourage -encouraged -encouraging end ended ending -endless ends -enemies -enemy -energy -enforced -enforcement -engaged -engagement -engine -engineer -engineering -engineers -enjoy -enjoyed -enjoyment -enormous enough -ens -enter -entered -entering -enterprise -entertainment -enthusiasm -enthusiastic -entire entirely -entitled -entrance -entries -entry -envelope -environment -equal -equally -equate -equation -equipment -equipped -equivalent -era -error -errors -escape +er +es esp especially -essential -essentially -est -establish -established -establishing -establishment -estate -estimate -estimated -estimates et +et-al etc -etc. -eternal -ethical -ethics -evaluation -eve even -evening evenly -event -events -eventually ever evermore every @@ -1401,2775 +432,679 @@ everybody everyone everything everywhere -evidence -evident -evidently -evil ex -exact exactly -examination -examine -examined example -examples -excellent except -exception -exceptions -excess -excessive -exchange -excite -excited -excitement -exciting -exclusive -exclusively -excuse -executive -exercise -exercises -exhibit -exhibition -exist -existed -existence -existing -exists -expanded -expanding -expansion -expect -expectations -expected -expects -expenditures -expense -expenses -expensive -experience -experienced -experiences -experiment -experimental -experiments -expert -experts -explain -explained -explains -explanation -explicit -exploration -exposed -exposure -express -expressed -expressing -expression -extend -extended -extending -extension -extensive -extent -extra -extraordinary -extreme -extremely -eye -eyes f -fabrics face -faced faces -facilities -facing -fact -factor -factories -factors -factory -facts -faculty -fail -failed -failure -faint -fair fairly -faith -fall -fallen -falling -fallout -familiar -family -famous -fans -fantastic far -farm -farmer -farmers farther -fascinating -fashion 
-fast -fat -fate -father -fathers -fault -favor -favorable -favorite -fear -fears -feature -features -feb -february -fed -federal -feed -feel -feeling -feelings -feels -fees -feet -fell -fellow felt -female -fence -festival few fewer -fiber -fibers -fiction -field -fields +ff +fi fifteen fifth fifty -fig -fight -fighting -figure -figured -figures -file -filed -filing -fill -filled -filling -film -films -final -finally -finance -financial -financing +fify find -finding -findings finds -fine -finger -fingers -finish -finished -fire -fired -firing -firm -firmly -firms first -fiscal -fish -fishing -fist -fit -fitted five -fixed -flagicon -flash -flat -fled -flesh -flew -flexible -flight -floor -flow -flower -flowers -fluid -flux -fly -flying -foam -foams -focus -fog -folk -folks -follow +fix +fj +fk +fm +fo followed following follows -font -food -foods -fool -foot -football for -force -forced -forces -foreign -forest -forests forever -forget -forgive -form -formal -formation -formed former formerly -forming -forms -formula -formulas -fort forth -fortune -forum +forty forward -fought found -founded four -fourteen -fourth -fraction -fractions -frame -frames +fr free -freight -frequencies -frequency -frequent -frequently -fresh -friday -friend -friendly -friends -friendship -frightened from front -frozen -fruit -ft. 
full -full-time fully -fun -function -functional -functions -fund -funds -furnish -furnished -furniture further furthered furthering furthermore furthers -future +fx g -gain -gained -gains -game -games -gang -garage -garden -gardens -gas -gather -gathered -gathering +ga gave -gay -gear -gen +gb +gd +ge general generally -generation -generations -generous -genius -gentle -gentleman -gentlemen -gently -genuine -gesture get gets getting -giant -gift -gin -girl -girls +gf +gg +gh +gi give given gives giving -glad -glance -glanced -glass -glasses -glory +gl +gm +gmt +gn go -goal -goals goes going -gold -golden -golf gone good goods -gorton got gotten gov -govern -governing -government -governmental -governments -governor -grabbed -grade -grades -gradually -graduate -grain -grains -grand -grant -granted -grants -grass -grateful -grave -gray -great -greater -greatest -greatly -green -greeted -greetings -grew -grinned -grip -gross -ground -grounds +gp +gq +gr group grouped grouping groups -grow -growing -grown -grows -growth -guards -guess -guest -guests -guidance -guide -guided -guilt -guilty -gun -guns -guy -guys -gyro +gs +gt +gu +gw +gy h -habit -habits had hadn't -hair +hadnt half -halign -hall -ham -hand -handed -handle -handled -handling -hands -hang -hanging -happen -happened -happening happens -happily -happiness -happy -hard -harder hardly -harm -harmony has hasn hasn't -hat -hate -hated -hatred +hasnt have +haven haven't +havent having he he'd he'll he's -head -headed -heading -headquarters -heads -health -healthy -hear -heard -hearing -heart -hearts -heat -heaven -heavily -heavy -heels -height -heights -held +hed hell hello help -helped -helpful -helping -helpless -helps -hen hence her -herd here here's hereafter hereby herein +heres hereupon -heritage -hero -heroic hers +herse +herse" herself -hesitated +herse” +hes hi -hidden -hide +hid high higher highest -highly -hill him +himse +himse" himself -hired +himse” his -historian -historians -historic -historical 
-history -hit hither -hits -hold -holder -holding -holds -hole -holes -hollywood +hk +hm +hn home -homes -hon -honest -honey -honor -honored -hope -hoped +homepage hopefully -hopes -hoping -horizon -horse -horses -hosp -host -hot -hotels -hour -hours -house -household -houses -housing how +how'd +how'll how's howbeit however hr -href +ht +htm html http -huge -human -humanity +hu hundred -hundreds -hung -hungry -hunt -hunting -hurried -hurry -husband -hydrogen -hypothalamic i -I i'd i'll i'm i've -ice -idea -ideal -ideas -identical -identification -identified -identify -identity -ideological +i.e +i.e. +id ie if ignored -illness -illusion -illustration -image -images -imagination -imagine -imagined -imitation +ii +il +ill +im immediate immediately -impact -implications importance important -imposed -impossible -impressed -impression -impressions -impressive -improve -improved -improvement -improvements -impulse in -inadequate inasmuch inc inc. -inch -inches -incident -inclined -include -included -includes -including -income -increase -increased -increases -increasing -increasingly -incredible indeed index indicate indicated indicates -indication -indirect -individual -individuals -industrial -industry -inevitable -inevitably -influence -information -informed -inherent -initial -initiative -injured -injury inner -innocence -innocent -input -insect -insects inside -insight -insist -insisted insofar -insp -inspection -inspired -installed -instance -instances -instant instead -institution -institutions -instruction -instructions -instrument -instruments -insurance -insure -integration -intended -intense -intensity -intention -intentions -interest -interested -interesting -interests -interference -interior -internal -international -interpretation -interpreted -intervals -intervention -interview -interviews -intimate +int into -introduced -introduction -invariably -invent -invention -inventory -investigation -investigations -investment -invited -involve -involved 
-involves -involving -inward -iron +io +iq +ir is -island isn isn't -isolated -issue -issued -issues +isnt it it'd it'll it's -item -items +itd +itll its +itse" itself +itse” +ive j -jacket -jail -jan -january -jazz -jet -job -jobs +je +jm +jo join -joined -joint -joke -journey -joy -jr -judges -judgment -judgments -jul -july -jump -jumped -jun -june -jungle -junior -juniors +jp just -justice -justified -justify k +ke keep -keeping keeps kept -key -keys -kid -kids -kill -killed -killer -killing +kg +kh +ki kind -kinds -king -kingdom -kitchen -knee -knees +km +kn knew -knife -knocked know -knowing -knowledge known knows +kp +kr +kw +ky +kz l la -label -laboratory -labour -lack -lacked -lacking -ladder -ladies -lady -laid -lake -land -landing -lands -landscape -lang -language -languages large largely -larger -largest last -late lately later latest latter latterly -laugh -laughed -laughing -laughter -launched -law -laws -lawyer -lawyers -lay -lb. -lead -leaders -leadership -leading -leads -lean -leaned -leaped -learn -learned -learning +lb +lc least -leather -leave -leaves -leaving -led -left -leg -legal -legend -legislation -legislative -legislators -legs length -lengths less -lesson lest let let's lets -letter -letters -letting -level -levels -liberal -liberty -libraries -library -license -lid -lie -lies -lieutenant -life -lift -lifted -light -lighted -lighting -lightly -lights +li like liked likely -likes likewise -limit -limitations -limited -limits line -linear -lines -link -link-en -lips -liquid -liquor -list -listed -listen -listened -listeners -listening -lists -literally -literary -literature little -live -lived -lively -lives -livestock -living -lo -load -loaded -loan -loans -lobby -local -locate -located -location -lock -locked -locking -log -logical -lone -lonely +lk +ll long -long-range -long-term longer longest look -looked looking looks -loop -loose -lose -losing -loss -losses -lost -lot -lots -loud -love low lower -lowered -loyalty +lr +ls lt ltd -luck 
-lucky -lumber -lunch -luncheon -lungs -luxury -lying +lu +lv +ly m ma -machine -machinery -machines -mad made -magazine -magazines -magic -magnet -magnetic -magnificent -magnitude -maid -mail -main mainly -maintain -maintained -maintaining -maintenance -maj -major -majority make -makers makes making -male -males man -manage -managed -management -manager -managers -mankind -manner -manufacturer -manufacturers -manufacturing many -map -mar -marble -march -marginal -mark -marked -market -marketing -markets -marks -marriage -marriages -married -marshall -mass -masses -massive -master -match -matching -mate -material -materials -mathematical -mathematics -matter -matters -mature -maturity -maximum may maybe mayn't -mdt +maynt +mc +md me -meal -meals mean -meaning -meaningful -meanings means -meant meantime meanwhile -measure -measured -measurement -measures -measuring -meat -mechanical -mechanism -medical -medicine -medium -meet -meeting -meetings -meets -melody -melting member members -membership -memory men -mental -mention -mentioned -merchant -merchants -mere merely -merger -merit -mess -message -messrs -met -meta -metal -method -methods mg -middle -middle-class -midnight +mh might +might've mightn't -mighty -mile -miles -military -milk -milligrams +mightnt +mil +mill million -millions -mind -minds mine -mines -minimal -minimum -minor -minority minus -minute -minutes -mirror miss -missed -missile -missing -mission -mistake -mix -mixed -mixture -mlle +mk +ml mm -mme -mobile -mode -model -moderate -modern -modest -mold -molecule -moment -monday -money -month -monthly -months -monument -mood -moon -moral -morality +mn +mo more moreover -morning most mostly -mother -mothers -motion -motive -motives -motor -mount -mountain -mounted -mouth move -moved -movement -movements -moves -movie -movies -moving +mp +mq mr mrs ms -msgr -mss -mst +msie +mt +mu much -mud -multiple -multiply -municipal -murder -muscle -muscles -music -musical -musician -musicians +mug must -mustard 
+must've mustn't -mutual +mustnt +mv +mw +mx my +myse" myself -mysterious -mystery -myth +myse” +mz n -naked +na name -named namely -names -narrative -narrow -nation -national -natural -naturally -nature -naval +nay +nc nd +ne near -nearby -nearest nearly -neat necessarily necessary -necessity -neck need needed needing needn't +neednt needs -negative -negotiations -neighbor -neighborhood -neighboring -neighbors neither -nerves -nervous -nest net -network -neutral +netscape never neverf neverless nevertheless -new -newer -newest -newly -news -newspaper -newspapers next -nice -night -nights +nf +ng +ni nine -nineteenth ninety -ninth +nl no no-one -noble nobody -nodded -noise non none nonetheless -noon noone nor -normal normally -norms -north -nose +nos not -notable -note noted -notes nothing -notice -noticed -notion notwithstanding -noun -nov -novel -novels -november now nowhere -nude +np +nr +nu +null number numbers -numeral -numerous -nuts +nz o -object -objective -objectives -objects -obligations -obliged -observation -observations -observe -observed -observers obtain obtained -obvious obviously -occasion -occasional -occasionally -occasions -occupation -occupied -occur -occurred -occurrence -occurring -occurs -ocean -oct -october -odd of off -offer -offered -offering -offers -office -officer -officers -offices -official -officials often oh -oil ok okay -okt -old -older -oldest +om on once one one's ones only -onset onto -op open opened opening -openly opens -operate -operated -operating -operation -operational -operations -operator -opportunities -opportunity -opposed -opposite -opposition -optimal or -oral -orchestra ord -order -ordered -ordering -orderly -orders -organ -organic -organization -organizations -organized -origin -original -originally +org other others otherwise ought oughtn't +oughtnt our ours +ours +ourselves ourselves out -outcome -outdoor -outlook -output outside -outstanding over -over-all overall -overcome -overseas -overwhelming +owing own 
-owned -owner -owners -ownership -oxidation -oxygen p -pace -pack -package -packed -page -pages -paid -pain -painful -paint -painted -painter -painting -paintings -pair -pale -panel -panels -panic -paper -papers -parade -paragraph -parallel -parent -parents -parked -parking -parks +pa part -part-time parted -partially -participate -participation -particle -particles particular particularly -parties parting -partisan -partly -partner parts -party -pass -passage -passages -passed -passenger -passengers -passes -passing -passion past -patent -path -pathology -patience -patient -patients -patrol -pattern -patterns -pause -paused -pay -paying -payment -payments -pdt -peace -peaceful -peas -peculiar -peered -pencil -penny -people -peoples +pe per -percent -percentage -perception -perfect -perfectly -perform -performance -performances -performed perhaps -period -periods -permanent -permission -permit -permits -permitted -person -personal -personality -personally -personnel -persons -perspective -persuaded -pertinent -petitioner -pfc +pf +pg ph -ph.d -phase -phases -phd -phenomena -phenomenon -philosophical -philosophy -phone -phrase -physical -physically -physics -piano -pick -picked -picture -pictures -piece -pieces -pile -pilot -pink -pioneer -pipe -pistol -pitch +pk +pl place placed places -placing -plain -plan -plane -planes -planet -planetary -planets -planned -planning -plans -plant -plants -plaster -plastic -plastics -plate -plates -platform -play -played -player -players -playing -plays -pleasant please -pleased -pleasure -plenty -plot -plug -plural plus pm -pocket -poem -poems -poet -poetic -poetry -poets +pmid +pn point pointed pointing points -police -policeman -policies -policy -political -politicians -politics -polynomial -pond -pool -poor -popular -populate -population -porch -port -portion -pose -position -positions -positive -possessed -possession -possibilities -possibility +poorly possible possibly -post -posted -posts -pot -potential -pound -pounds 
-poured -poverty -powder -power -powerful -powers +potentially pp -practical -practically -practice -practices -preceding -precious -precise -precisely -precision -prefer -preferred -preliminary -preparation -prepare -prepared -preparing -presence +pr +predominantly present -presentation presented presenting presents -preserve -president -press -pressed -pressing -pressure -pressures -prestige presumably -pretty -prevent -prevented -prevention -previous previously -price -prices -pride primarily -primary -prime -primitive -principal -principle -principles -print -printed -prior -prison -prisoners -private -prize -probabilities -probability -probable probably -problem -problems -procedure -procedures -proceeded -process -processes -processing -procurement -produce -produced -producing -product -production -productive -products -prof -profession -professional -profit -profits -profound -program -programs -progress -project -projects -prominent -promise -promised -promises -promising -promote -promotion promptly -proof -propaganda -proper -properly -properties -property -proportion -proposal -proposals -proposed -prospect -prospective -prospects -protect -protected -protection -protein -protest -proud -prove -proved -provide provided provides -providing -provision -provisions -pst -psychological -public -publication -publicity -publicly -published -publisher -pull -pulled -pulling -pulmonary -punishment -pupil -pupils -purchase -purchased -pure -purely -purpose -purposes -pursuant -pursue -push -pushed +pt put puts -putting -pvt +pw +py q -qualified -qualities -quality -quantity -quarrel -quart -quarter -quarters +qa que -question -questioned -questioning -questionnaire -questions -quick -quickly -quiet -quietly quite -quoted -quotient qv r -race -races -racial -racing -radar -radiation -radio -rail -railroad -rain -raise -raised -raising ran -ranch -rang -range -ranging -rank -ranks -rapid -rapidly -rare -rarely -rate -rates rather -ratio -rational -raw rd re -reach 
-reached -reaches -reaching -reaction -reactionary -reactions -read -reader -readers readily -reading -ready -real -realism -realistic -reality -realization -realize -realized really -rear -reason -reasonable reasonably -reasons -recall -recalled -receive -received -receives -receiving recent recently -reception -recognize -recognized -recommend -recommendation -recommendations -recommended -record -recorded -recording -records -recovery -recreation -rector -red -reduce -reduced -reducing -reduction ref -refer -reference -referred -reflect -reflected -reflection -reflects -reform -refrigerator -refund -refused -regard -regarded +refs regarding regardless regards -regime -regiment -region -regional -regions -register -registered -registration -regular -regularly -regulations -rehabilitation -rejected related -relating -relation -relations -relationship -relationships -relative relatively -relatives -release -released -relevant -reliable -relief -relieved -religion -religious -remain -remainder -remained -remaining -remains -remark -remarkable -remarked -remarks -remember -remembered -reminded -remote -removal -remove -removed -rendered -rent -reorganization -rep -repair -repeat -repeated -replace -replaced -replacement -replied -reply -report -reported -reporter -reporters -reports -represent -representative -representatives -represented -representing -represents -reprint -reps -republic -reputation -request -require -required -requirement -requirements -requires -res -research -reserve -reserved -residence -residential -residents -resist -resistance -resolution -resolved -resources -respect -respectable -respective respectively -respects -respond -responded -response -responses -responsibilities -responsibility -responsible -rest -restaurant -restrictions -result -resulted -resulting -results -resumed -retained -retired -retirement -return -returned -returning -returns rev -reveal -revealed -reveals -revenues -review -revolution -revolutionary -rhythm -rich -rid 
-ride -riding -rifle -rifles right -rights -rigid ring -rise -rises -rising -risk -ritual -river -road -roads -rock -rocks -rode -role -roles -roll -rolled -romantic -roof -room -rooms -root -roots -rope -rose -rough -roughly -round -route -routine -row -rt -rub -rule -ruled -rules -ruling +ro +ru run -running -runs -rural -rush -rushed +rw s -sacred -sacrifice -sad -saddle -safe -safety +sa said -sail -sailing -sake -salary -sale -sales -saline -salt same -sample -sampling -sand -sang -sat -satisfaction -satisfactory -satisfied -saturday -sauce -save -saved -saving -savings saw say saying says -scale -scarcely -scared -scattered -scene -scenes -schedule -scheduled -scheme -scholars -scholarship -school -schools -science -scope -score -screen -sea -search -searching -season -seat -seated +sb +sc +sd +se +sec second -secondary secondly seconds -secret -secretary -secrets section -sections -secure -security see -seed -seeds seeing -seek -seeking seem seemed seeming seems seen sees -segment -seized -seldom -select -selected -selection self -sell -selling selves -sen -senator -send -sending -senior -sens -sense -sensible -sensitive -sensitivity sent -sentence -sentiment -sep -separate -separated -sept -september -sequence -sergeant -series -serious -seriously -servants -serve -served -serves -service -services -serving -session -sessions -set -sets -setting -settle -settled -settlement seven -seventh +seventy several -severe -sewage -sex -sexual -sfc -sgt -shade -shadows -shaking +sg +sh shall -shame shan't -shape -shapes -share -shared -shares -sharing -sharp -sharply +shant she she'd she'll she's -shear -sheep -sheet +shed shell -shelter -shelters -shift -shine -shining -ship -shipping -ships -shirt -shock -shoe -shoes -shook -shoot -shooting -shop -shopping -shore -short -shortly -shorts -shot -shots +shes should -shoulder -shoulders +should've shouldn shouldn't -shout -shouted -shouting +shouldnt show showed showing shown +showns shows -shut -sick +si side sides 
-sidewalk -sighed -sight -sign -signal -signals -signed -significance significant -signs -silence -silent -silver +significantly similar similarly -simple -simply -sin since -sing -singing -single -sink -sister -sit +sincere site -sitter -sitting -situation -situations six -sixteen -sixties sixty -size -skill -skilled -skills -skin -skirt -sky -skywave -slave -sleep -slender -slept -slid -slide -slight +sj +sk +sl slightly -slim -slip -slipped -slow -slowly -small -smaller -smallest -smart -smell -smile -smiled -smoke -smooth -snake -snakes -snapped -snow +sm +sn so -so-called -soap -social -socialism -societies -society -soft -softly -soil -sold -soldier -solely -solid -solution -solve -solved some somebody someday somehow someone +somethan something sometime sometimes somewhat somewhere -son -song -songs soon -sophisticated -sorry -sort -sought -soul -souls -sound -sounded -sounds -source -sources -south -sovereign -sovereignty -space -span -spare -speak -speaker -speaking -special -specialists -species -specific specifically specified specify specifying -specimen -spectacular -speech -speeches -speed -spell -spend -spending -spent -sphere -spirit -spirits -spiritual -spite -splendid -spoke -spoken -sponsor -sponsored -spot -spots -spread -spring -square sr st -stable -staff -stage -stages -staining -stairs -stake -stand -standard -standards -standing -stands -star -stared -staring -stars -start -started -starting -startled -starts state -stated -statement -statements states -station -stations -statistics -status -stay -stayed -stead -steadily -steady -steam -steel -stem -stems -step -stepped -steps -stick -sticks -stiff still -stock -stockholders -stomach -stone -stood stop -stopped -storage -store -stored -stores -stories -storm -story -straight -strain -strange -stranger -strategic -strategy -stream -street -streets -stress -stressed -stresses -stretch -stretched -strictly -strike -strikes -striking -string -strip -stroke -strong -stronger -strongest strongly 
-struck -structural -structure -structures -struggle -struggling -stuck -student -studio -study -stuff -stumbled -stupid -style -styles +su sub -subject -subjected -subjects -submitted -substance -substances -substantial substantially -substitute -substrate -subtle -subtract -suburban -succeeded -success -successes -successful successfully -succession such -sudden -suddenly -suffer -suffered -suffering -sufficient sufficiently -suffix -sugar suggest -suggested -suggestion -suggestions -suggests -suit -suitable -suitcase -suite -suited -suits -sum -summary -summer -sun -sunday sup -supervision -supper -supplement -supplied -supplies -supply -support -supported -supporting -suppose -supposed -supt sure -surely -surface -surfaces -surg -surplus -surprise -surprised -surprising -surrender -surrounded -survey -survival -survive -suspect -suspected -suspended -suspicion -sweat -sweet -swept -swift -swim -swimming -swing -switch -switches -swung -syllable -symbol -symbolic -symbols -sympathetic -sympathy -system -systems +sv +sy +sz t t's -table -tables -tactics -tag -tagged -tags -tail take taken -takes taking -tale -talent -talents -talk -talked -talking -tall -tangent -tangible -tape -target -task -tasks -taste -taught -teach -team -teams -tears -technical -technique -techniques -technology -teeth -telephone -television +tc +td tell -telling -tells -temperature -temperatures -temporarily -temporary ten -tend -tended -tendency tends -tension -tent -term -terms -terrible test -tested -testimony -testing -tests text -textile +tf +tg th than -thank -thanks -thanx that that'll that's that've +thatll thats +thatve the -theater their theirs them -theme themselves then thence -theological -theoretical -theories -theory there there'd there'll @@ -4178,332 +1113,130 @@ there's there've thereafter thereby +thered therefore therein +therell +thereof +therere theres +thereto thereupon -thermal +thereve these they they'd they'll they're they've +theyd +theyll +theyre +theyve thick 
-thickness thin thing things -think -thinking -thinks third thirty this thorough thoroughly those +thou though -thought -thoughts +thoughh thousand -thousands -threat -threatened -threatening three -threw -throat +throug through throughout -throw -thrown thru -thrust -thursday thus -thyroid -tie -tied -tight +til till -time -times -tiny tip -tire -tired -tissue -title +tis +tj +tk +tm +tn to -toast today -toes together -told -tomorrow -tone -tones -tongue -tonight -tons too took -tool -tools -tooth top -torn -tossed -total -totally -touch -touched -tough -tour -tournament toward towards -town -towns -trace -track -tractor -trade -traders -trading -tradition -traditional -traditions -traffic -tragedy -tragic -train -trained -training -transfer -transferred -transformation -transformed -transition -transportation -trap -travel -traveled -treat -treated -treatment -tree -trees -trembling -tremendous -trend -trends -trial -trials -triangle -tribute +tp +tr tried tries -trim -trip -trips -triumph -troops -trouble -troubled -troubles -truck -trucks -true -truly -trust -truth +trillion try trying -tsunami -tube -tubes -tuesday -turn -turned -turning -turns +ts +tt +tv +tw +twas +twelve twenty -twenty-five twice two -type -types -typical +tz u -ugly -ultimate +ua +ug +uk +um un -unable -uncertain -uncle -unconscious under -underground -underlying underneath -understand -understanding -understood undoing -undoubtedly -uneasy -unexpected -unfortunate -unfortunately -unhappy -uniform -union -unions -unique -unit -units -unity -universal -universe -universities -unknown unless unlike -unlikely until unto -unusual up -update upon -upper -upstairs -upward -upwards -urban -urge -urged -urgent -url +ups us use used -useful -user -username uses using -usual usually -utc -utility -utopian -utterly +uucp +uy +uz v -vacation -vacuum -valid -valign -valley -valuable +va value -variable -variables -variation -variations -varied -variety various -vary -varying -vast -vehicle -vehicles 
-vein -velocity -venture -verb -verbal -verse -version +vc +ve versus very -veteran +vg +vi via -vice -video -view -viewed -views -vigorous -village -virtually -virtue -visible -vision -visit -visited -visiting -visitors -visual -vital -vivid viz -vocational -voice -voices +vn vol -volume -volumes -voluntary -volunteers -vote -voted -voters -votes -voting -vowel +vols vs +vu w -wage -wages -wagon -wait -waited -waiting -wake -walk -walked -walking -wall -walls want wanted wanting wants -war -ward -warfare -warm -warmth -warned -warning -warrant was -wash -washed -washing +wasn wasn't -waste -watch -watched -watching -water -waters -wave -waves +wasnt way ways we @@ -4511,44 +1244,39 @@ we'd we'll we're we've -weak -weakness -wear -wearing -weather -web -wedding -wednesday -week -weekend -weekly -weeks -weight -welcome well wells went were +weren weren't -west -wet +werent +weve +wf what +what'd what'll what's what've whatever -wheel -wheels +whatll +whats +whatve when +when'd +when'll when's whence whenever where +where'd +where'll where's whereafter whereas whereby wherein +wheres whereupon wherever whether @@ -4556,128 +1284,69 @@ which whichever while whilst -whip -whisky -whispered -white +whim whither who who'd who'll who's +whod whoever whole -wholly +wholl whom whomever +whos whose why +why'd +why'll why's -wide widely -widespread -widow width -wife -wild -wildlife -wildly will willing -win -wind -window -windows -winds -wine -wines -wing -wings -winning -winter -wiped -wire -wisdom wish -wished -wishes -wit with within without -witness -witnesses -wives -woman -women won won't -wonder -wondered -wonderful -wondering -wood -wooden -woods -word -words -wore -work -worked -worker -workers -working -works -workshop -world -worn -worried -worries -worry -worse -worst -worth -worthy +wont would +would've wouldn wouldn't -wound -wounded -write -writers -writes -writing -written -wrong -wrote +wouldnt +ws www x -xml y -yard -yards +ye year -year-old years -yelled 
-yellow yes -yesterday yet -yield -york you you'd you'll you're you've +youd +youll young younger youngest -youngsters your +youre yours yourself yourselves +youve +yt +yu z +za zero +zm +zr \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/es/es_stop_words.txt b/apps/common/src/python/mediawords/languages/es/es_stop_words.txt index 4f08f76cb8..38af73f9af 100644 --- a/apps/common/src/python/mediawords/languages/es/es_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/es/es_stop_words.txt @@ -1,69 +1,200 @@ -# -# This is a stop word list for the Spanish language. -# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-es/blob/master/stopwords-es.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) # a acerca +actualmente +adelante +ademas además adónde +afirmó +agregó +ahi +ahí +ahora al algo +algún +alguna algunas +alguno algunos +alli +allí +alrededor +ambos +ampleamos +añadió +antano +antaño ante +anterior antes +apenas +aproximadamente aquel +aquél aquella +aquélla aquellas +aquéllas +aquello aquellos +aquéllos +aqui +aquí +arriba +arribaabajo +aseguró aseveró +asi así +atras +aun +aún aunque +ayer +b +bajo +bastante +bien +breve +buen +buena +buenas +bueno +buenos +c cada +casi +cerca +cierta +ciertos +cinco +claro +comentó como +cómo con +conmigo +conocer +conseguimos +conseguir +considera +consideró +consigo +consigue +consiguen +consigues +contigo contra +cosas +creo cual +cuál cuales +cuáles +cualquier cualquiera cuando -cuál -cuáles +cuándo +cuanta +cuánta +cuantas +cuántas +cuanto cuánto +cuantos +cuántos +cuatro +cuenta +d +da +dado +dan +dar de +debajo debe +deben +debido +decir +dejó del +delante +demás +demasiado +dentro desde +despues después destacó +detras +detrás +dia +día +dias +días +dice +dicen dicho +dieron +diferente +diferentes +dijeron dijo +dio donde +dónde +dos durante e +ejemplo el +él 
ella ellas +ello ellos +empleais +emplean +emplear +empleas +empleo en +encima +encuentra +enfrente +entonces entre era erais +eramos +éramos eran eras eres es esa +ésa esas +ésas ese +ése eso esos +ésos esta +está +ésta estaba estabais +estábamos estaban estabas estad @@ -71,154 +202,233 @@ estada estadas estado estados +estais +estáis estamos +estan +están estando estar -estaremos estará estarán estarás estaré estaréis +estaremos estaría estaríais estaríamos estarían estarías estas +estás +éstas este +esté +éste +estéis estemos +estén +estés esto estos +éstos estoy estuve estuviera estuvierais +estuviéramos estuvieran estuvieras estuvieron estuviese estuvieseis +estuviésemos estuviesen estuvieses estuvimos estuviste estuvisteis -estuviéramos -estuviésemos estuvo -está -estábamos -estáis -están -estás -esté -estéis -estén -estés +ex excepto +existe +existen +explicó expresó +f fue fuera fuerais +fuéramos fueran fueras fueron fuese fueseis +fuésemos fuesen fueses fui fuimos fuiste fuisteis -fuéramos -fuésemos +g +general +gran +grandes +gueno +h ha +habéis +haber +habia +había +habíais +habíamos +habían +habías habida habidas habido habidos habiendo -habremos +habla +hablan habrá habrán habrás habré habréis +habremos habría habríais habríamos habrían habrías -habéis -había -habíais -habíamos -habían -habías hace +haceis +hacemos +hacen hacer +hacerlo +haces hacia hacía +haciendo +hago han has hasta hay haya +hayáis hayamos hayan hayas -hayáis he +hecho hemos hicieron hicimos +hizo +horas +hoy hube hubiera hubierais +hubiéramos hubieran hubieras hubieron hubiese hubieseis +hubiésemos hubiesen hubieses hubimos hubiste hubisteis -hubiéramos -hubiésemos hubo +i +igual +incluso indicó +informo informó +intenta +intentais +intentamos +intentan +intentar +intentas +intento +ir +j +junto +k +l la lado lados +largo las le +lejos les +llegó lleva +llevar lo los luego +lugar +m +mal +manera +manifestó +mas +más +mayor me mediante +medio +mejor +mencionó +menos +menudo mi +mí 
+mia +mía +mias +mías +mientras +mio +mío +mios +míos mis misma +mismas mismo +mismos +modo +momento +mucha +muchas mucho muchos muy -más -mí -mía -mías -mío -míos +n nada +nadie ni +ningún +ninguna +ningunas +ninguno +ningunos no nos nosotras @@ -227,147 +437,291 @@ nuestra nuestras nuestro nuestros +nueva +nuevas +nuevo +nuevos +nunca o obstante +ocho os otra otras otro otros +p +pais +paìs para +parece parte +partir +pasada +pasado +peor pero +pesar +poca +pocas poco +pocos +podeis +podemos +poder +podrá +podrán +podria +podría +podriais +podriamos +podrian +podrían +podrias +poner por +por qué porque porqué +posible +primer +primera +primero +primeros +principalmente +pronto +propia +propias +propio +propios +proximo +próximo +próximos pudieron pudiese pudimos +pudo +pueda puede +pueden +puedo +pues +q +qeu que +qué +quedó +queremos quien +quién quienes -qué +quiénes +quiere +quiza +quizá +quizas +quizás +r +s +sabe +sabeis +sabemos +saben +saber +sabes +sal +salvo se +sé sea +seáis seamos sean seas +segun según -seremos +segunda +segundo +seis +señaló +ser +sera será serán serás seré seréis +seremos sería seríais seríamos serían serías -seáis -señaló si +sí sido +siempre siendo +siete +sigue +siguiente sin +sino sobre sois -solo +sola +solamente +solas solía +solo +sólo +solos somos son soy +soyos su suele +supuesto sus suya suyas suyo suyos -sí -sólo +t +tal +tambien también +tampoco +tan tanto +tarde te -tendremos +temprano tendrá tendrán tendrás tendré tendréis +tendremos tendría tendríais tendríamos tendrían tendrías tened +teneis +tenéis tenemos +tener tenga +tengáis tengamos tengan tengas tengo -tengáis -tenida -tenidas -tenido -tenidos -teniendo -tenéis tenía teníais teníamos tenían tenías +tenida +tenidas +tenido +tenidos +teniendo +tercera ti +tiempo tiene tienen tienes toda todas +todavia +todavía todo todos +total +trabaja +trabajais +trabajamos +trabajan +trabajar +trabajas +trabajo tras +trata través +tres tu +tú tus tuve tuviera tuvierais 
+tuviéramos tuvieran tuvieras tuvieron tuviese tuvieseis +tuviésemos tuviesen tuvieses tuvimos tuviste tuvisteis -tuviéramos -tuviésemos tuvo tuya tuyas tuyo tuyos -tú +u +última +últimas +ultimo +último +últimos un una unas uno unos +usa +usais +usamos +usan +usar +usas +uso +usted +ustedes +v +va +vais +valor +vamos +van +varias +varios +vaya +veces +ver +verdad +verdadera +verdadero vez vosotras vosotros +voy vuestra vuestras vuestro vuestros +w +x y ya yo -él -éramos +z diff --git a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt index aa2cb4cdf7..961a260907 100644 --- a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt @@ -1,39 +1,210 @@ -# -# This is a stop word list for the Finnish language. -# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt +# +# (Lightly edited to remove words in the original lists that are actually meaningful) # +aiemmin +aika +aikaa +aikaan +aikaisemmin +aikaisin +aikajen +aikana +aikoina +aikoo +aikovat +aina +ainakaan +ainakin +ainoa +ainoat +aiomme +aion +aiotte +aist +aivan +ajan +alas +alemmas +alkuisin +alkuun +alla +alle +aloitamme +aloitan +aloitat +aloitatte +aloitattivat +aloitettava +aloitettevaksi +aloitettu +aloitimme +aloitin +aloitit +aloititte +aloittaa +aloittamatta +aloitti +aloittivat +alta +aluksi +alussa +alusta +annettavaksi +annetteva +annettu +ansiosta +antaa +antamatta +antoi +aoua +apu +asia +asiaa +asian +asiasta +asiat +asioiden +asioihin +asioita +asti +avuksi +avulla +avun +avutta +edellä +edelle +edelleen +edeltä +edemmäs +edes +edessä +edestä +ehkä ei +eikä +eilen eivät +eli +ellei +elleivät +ellemme +ellen +ellet +ellette emme en +enää +enemmän +eniten +ennen +ensi +ensimmäinen +ensimmäiseksi +ensimmäisen +ensimmäisenä +ensimmäiset 
+ensimmäisiä +ensimmäisiksi +ensimmäisinä +ensimmäistä +ensin +entinen +entisen +entisiä +entistä +entisten +eräät +eräiden +eräs +eri +erittäin +erityisesti +esi +esiin +esillä +esimerkiksi et -ette +eteen +etenkin +etessa että -he -heidän -heidät -heihin -heille -heillä -heiltä -heissä -heistä -heitä +ette +ettei +haikki +halua +haluaa +haluamatta +haluamme +haluan +haluat +haluatte +haluavat +halunnut +halusi +halusimme +halusin +halusit +halusitte +halusivat +halutessa +haluton hän häneen -hänelle hänellä +hänelle häneltä hänen hänessä hänestä hänet häntä +he +hei +heidän +heidät +heihin +heillä +heille +heiltä +heissä +heistä +heitä +helposti +heti +hetkellä +hieman +hitaasti +hoikein +huolimatta +huomenna +hyvä +hyvää +hyvät +hyviä +hyvien +hyviin +hyviksi +hyville +hyviltä +hyvin +hyvinä +hyvissä +hyvistä +ihan +ilmeisesti itse +itseään +itsensä ja +jää +jälkeen +jälleen +jo johon joiden joihin @@ -46,63 +217,211 @@ joissa joista joita joka +jokainen +jokin +joko joksi +joku jolla jolle +jolloin jolta +jompikumpi jona jonka +jonkin +jonne +joo +jopa jos +joskus jossa josta jota +jotain +joten +jotenkin +jotenkuten jotka +jotta +jouduimme +jouduin +jouduit +jouduitte +joudumme +joudun +joudutte +joukkoon +joukossa +joukosta +joutua +joutui +joutuivat +joutumaan +joutuu +joutuvat +juuri +kahdeksan +kahdeksannen +kahdella +kahdelle +kahdelta +kahden +kahdessa +kahdesta +kahta +kahteen +kai +kaiken +kaikille +kaikilta +kaikkea +kaikki +kaikkia +kaikkiaan +kaikkialla +kaikkialle +kaikkialta +kaikkien +kaikkin +kaksi +kannalta +kannattaa kanssa +kanssaan +kanssamme +kanssani +kanssanne +kanssasi +kauan +kauemmas +kaukana +kautta +kehen keiden keihin keiksi -keille keillä +keille keiltä keinä keissä keistä keitä +keitten keneen keneksi -kenelle kenellä +kenelle keneltä kenen kenenä kenessä kenestä kenet -ketkä -ketkä +kenettä +kennessästä +kenties +kerran +kerta +kertaa +keskellä +kesken ketä +ketkä +kiitos +kohti +koko +kokonaan +kolmas +kolme +kolmen +kolmesti 
koska +koskaan +kovin kuin +kuinka +kuinkan +kuitenkaan +kuitenkin kuka +kukaan +kukin +kukka +kumpainen +kumpainenkaan +kumpi +kumpikaan +kumpikin kun +kuten +kuuden +kuusi +kuutta +kyllä +kylliksi +kymmenen +kyse +lähekkäin +lähellä +lähelle +läheltä +lähemmäs +lähes +lähinnä +lähtien +läpi +liian +liki +lisää +lisäksi +lla +luo +luona +mahdollisimman +mahdollista me meidän meidät meihin -meille meillä +meille meiltä meissä meistä meitä +melkein +melko +menee +meneet +menemme +menen +menet +menette +menevät +meni +menimme +menin +menit +menivät +mennessä +mennyt +menossa mihin -miksi mikä -mille +mikään +mikäli +mikin +miksi millä +mille +milloin +milloinkan miltä +minä minkä -minkä +minne minua minulla minulle @@ -112,23 +431,71 @@ minussa minusta minut minuun -minä -minä missä mistä -mitkä mitä +mitään +miten +mitkä +moi +molemmat +mones +monesti +monet +moni +monta +muassa +muiden +muita +muka mukaan +mukaansa +mukana mutta +muu +muualla +muualle +muualta +muuanne +muulloin +muun +muut +muuta +muutama +muutaman +muuten +myöhemmin +myös +myöskään +myöskin +myötä +näiden +näihin +näiksi +näillä +näille +näiltä +näin +näinä +näissä +näissähin +näissälle +näissältä +näissästä +näistä +näitä +nämä ne +neljä +neljää +neljän niiden niihin niiksi -niille niillä +niille niiltä niin -niin niinä niissä niistä @@ -144,24 +511,21 @@ noina noissa noista noita +nro nuo nyt -näiden -näihin -näiksi -näille -näillä -näiltä -näinä -näissä -näistä -näitä -nämä +ohi +oikea +oikealla +oikein ole olemme olen olet olette +oleva +olevan +olevat oli olimme olin @@ -176,21 +540,85 @@ olitte olivat olla olleet +olli ollut +oma +omaa +omaan +omaksi +omalle +omalta +oman +omassa +omat +omien +omiin +omiksi +omille +omilta +omissa +omista on +onkin +onko ovat +päälle +paikoittain +paitsi +pakosti +paljon +paremmin +parempi +parhaillaan +parhaiten +peräti +perusteella +pian +pieneen +pieneksi +pienellä +pienelle +pieneltä +pienempi +pienestä +pienin poikki +puolesta +puolestaan +saakka 
+sadam +sama +samaa +samaan +samalla +samallalta +samallassa +samallasta +saman +samat +samoin +sata +satojen se +seitsemän sekä sen +seuraavat +siellä +sieltä siihen siinä +siis siitä +sijaan siksi -sille -sillä sillä +sille +silloin siltä +silti +sinä +sinne sinua sinulla sinulle @@ -200,21 +628,65 @@ sinussa sinusta sinut sinuun -sinä -sinä sitä +siten +sitten +ssa +sta +suoraan +suuntaan +suuret +suuri +suuria +suurin +suurten +taa +taas +taemmas +tähän +tahansa tai +takaa +takaisin +takana +takia +täksi tallä +tälle +tältä +tämä +tämän +tänä +tapauksessa +tarpeeksi +tässä +tästä +tätä +tavalla +tavoitteena te teidän teidät teihin -teille teillä +teille teiltä teissä teistä teitä +tietysti +todella +toinen +tois +toisaalla +toisaalle +toisaalta +toiseen +toiseksi +toisella +toiselle +toiselta +toisemme tuo tuohon tuoksi @@ -226,16 +698,6 @@ tuona tuossa tuosta tuotä -tähän -täksi -tälle -tältä -tämä -tämän -tänä -tässä -tästä -tätä vaan vai vaikka diff --git a/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt b/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt index 2f7ed427ca..c9ec02754b 100644 --- a/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt @@ -1,18 +1,46 @@ -# -# This is a stop word list for the French language. 
-# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-fr/blob/master/stopwords-fr.txt +# +# (Lightly edited to remove words in the original lists that are actually meaningful) # +a +à +â a-t-on +abord +absolument +afin +ah ai -aie aient aies +ailleurs +ainsi ait +allaient +allo +allô +allons +alors +anterieur +anterieure +anterieures +apres +après as +assez +attendu au +aucun +aucune +aucuns +aujourd +aujourd'hui +aupres +auquel aura aurai auraient @@ -24,41 +52,190 @@ auriez aurions aurons auront +aussi +autant +autre +autrement +autres +autrui aux +auxquelles +auxquels avaient avais avait +avant avec avez aviez -avions +avoir avons ayant ayante ayantes -ayants ayez ayons +b +bah +bas +basee +beaucoup +bien +bigre +bon +boum +brrr c +ça +car ce +ceci +cela +celà celle +celle-ci +celle-là +celles +celles-ci +celles-là +celui +celui-ci +celui-là +cent +cependant +certain +certaine +certaines +certains +certes ces +cet +cette +ceux +ceux-ci +ceux-là +chacun +chacune +chaque +chère +chères +chers +chez +ci +cinq +cinquantaine +cinquante +cinquantième +cinquième +clac +clic +combien +comme +comment +comparable +comparables +compris +concernant +contre d d'une +da dans de +debout +début +dedans +dehors +deja +delà +depuis +dernier +derniere +derriere +derrière des +dès +desormais +désormais +desquelles +desquels +dessous +dessus +deux +deuxième +deuxièmement +devant +devers +devra +devrait +different +différent +différente +differentes +différentes +differents +différents +dire +dit +dite +dits +dix +dix-huit +dix-neuf +dix-sept +dixième +doit +doivent donc dont +dos +douze +douzième +dring +droite du +duquel +durant +e +effet +egalement +egales +eh elle +elle-même +elles +elles-mêmes en +encore +enfin +entre +envers +environ es +ès +essai est et +étaient +étais +était +etant +étant +étante +étantes +étants +état +etc +êtes +étiez +étions +etre +être eu eue eues +euh +eûmes eurent eus eusse @@ -67,10 
+244,27 @@ eusses eussiez eussions eut -eux -eûmes eût eûtes +eux +eux-mêmes +exactement +excepté +extenso +exterieur +f +façon +fais +faisaient +faisant +fait +faites +feront +fi +flac +fois +font +fûmes furent fus fusse @@ -79,56 +273,270 @@ fusses fussiez fussions fut -fûmes fût fûtes +g +gens +h +ha +haut +hé +hein +hélas +hem +hep +hi +ho +holà +hop +hormis +hors +hou +houp +hue +hui +huit +huitième +hum +i +ici il ils +importe j je +jusqu +jusque +juste +k l l' la +là +laisser +laquelle +las le +lequel les +lès +lesquelles +lesquels leur leurs +longtemps +lors +lorsque lui +lui-meme +lui-même m ma +maint +maintenant mais +malgre +malgré me +meme +même +memes +mêmes +merci mes +mien +mienne +miennes +miens +mille +mince +mine +minimale moi +moi-meme +moi-même +moindres +moins mon -même +mot +moyennant +multiple +multiples n n'a n'est +na +naturelles ne +neanmoins +néanmoins +necessaire +necessairement +neuf +neuvième ni +nombreuses +nommés +non nos +notamment notre +nôtre +nôtres nous +nous-mêmes +nouveau +nouveaux +nul +o +ô +oh +ohé +olé +ollé on ont +onze +onzième +ore ou où +ouf +ouias +oust +ouste +ouvert +ouverte +ouverts +o| +p +paf +pan par +parce +parfois +parle +parlent +parler +parmi +parole +parseme +partant +particulier +particulière +particulièrement pas +passé +pendant +pense +permet +personne +personnes +peu +peut +peuvent +peux +pff +pfft +pfut +pièce +pif +pire +plein +plupart +plus +plusieurs +plutôt +possessif +possessifs +possible +possibles +pouah pour +pourquoi +pourrais +pourrait +pouvait +prealable +precisement +premier +première +premièrement +pres +près +probable +probante +procedant +proche +psitt +pu +puis +puisque +pur +pure +q qu qu'elle qu'il qu'on qu'une quand +quant +quant-à-soi +quanta +quarante +quatorze +quatre +quatre-vingt +quatrième +quatrièmement que +quel +quelconque +quelle +quelles +quelqu'un +quelque +quelques +quels qui +quiconque +quinze +quoi +quoique +r +rare +rarement +relative +relativement +rend +rendre 
+restant +reste +restent +retour +revoici +revoilà +rien s s'est sa +sacrebleu +sait +sans +sapristi +sauf se +sein +seize +selon +semblable +semblaient +semble +semblent +sent +sept +septième sera serai seraient @@ -141,45 +549,125 @@ serions serons seront ses +seul +seule +seulement si +sien +sienne +siennes +siens +sinon +six +sixième +soi +soi-même soient sois soit +soixante sommes son sont +sous +souvent soyez soyons +stop +strictement +suffit suis +suit +suivant +suivante +suivantes +suivants +suivre +sujet sur +surtout t ta +tac +tandis +tant +tardive te +té +telle +tellement +telles +tels +tenant +tend +tenir +tente tes +tic +tien +tienne +tiennes +tiens +toc toi +toi-même ton +touchant +toujours +tous +tout +toute +toutefois +toutes +treize +trente +tres +très +trois +troisième +troisièmement +trop +tsoin +tsouin tu +u un une +unes +uniformement +unique +uniques +uns +v va vais +valeur +vas +vé +vers +via +vif +vifs +vingt +vivat +vive +vives +vlan +voici +voie +voient +voilà +voire +vont vos votre +vôtre +vôtres vous +vous-mêmes +vu +w +x y -à -étaient -étais -était -étant -étante -étantes -étants -étiez -étions -été -étée -étées -étés -êtes -être +z +zut diff --git a/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt b/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt index 07c7723d36..9aebb2f755 100644 --- a/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt @@ -3,6 +3,7 @@ # # Sources: # https://github.com/stopwords-iso/stopwords-ha/blob/master/raw/gh-stopwords-json-ha.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) # a diff --git a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt index 27440bfb15..63f9448b7a 100644 --- a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt +++ 
b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt @@ -1,228 +1,273 @@ -# -# This is a stop word list for the Hindi language. -# # Sources: -# http://www.ranks.nl/stopwords/hindi +# # http://members.unine.ch/jacques.savoy/clef/hindiST.txt -# https://sites.google.com/site/kevinbouge/stopwords-lists # http://resgtholpadi.blogspot.com/2012/07/hindi-stop-words-list.html +# http://www.ranks.nl/stopwords/hindi +# https://github.com/stopwords-iso/stopwords-hi/blob/master/stopwords-hi.txt +# https://sites.google.com/site/kevinbouge/stopwords-lists +# +# (Lightly edited to remove words in the original lists that are actually meaningful) # -अंदर +न +व अत +अप +अब +आज +आप +इन +इस +उन +उस +एक +एस +ओर +और +कइ +कई +कम +कल +का +कि +की +के +को +गई +गए +जब +जा +जो +तक +तब +तो +था +थि +थी +थे +दो +ना +ने +पर +पे +भि +भी +मे +मै +यह +या +ये +वह +वे +से +सो +हि +ही +हे +है +हो +अदि +अभि +अभी +आदि +इसि +इसी +इसे +उसि +उसी +उसे +ऊपर +एवं +एसे +ऐसा +ऐसे +कभी +कहा +किए +कुछ +कुल +कोइ +कोई +कोन +कौन +गया +गयी +गये +जिन +जिस +तथा +तरह +तिन +तिस +तुम +दूर +फिर +बनि +बनी +बहि +बाद +बीच +मगर +में +यदि +यहि +यही +यिह +रहा +रहे +लिए +वुह +संग +सभि +सभी +समय +साथ +हुअ +हुआ +हुइ +हुई +हुए +हें +हैं +के +अंदर अथवा अन्य अपना +अपनि अपनी अपने -अब -अभी -आज -आदि -आप -इत्यादि -इन -इन इनका इनके -इन्हीं -इन्हें -इन्हों -इस इसका +इसकि इसकी इसके -इसमें -इसी -इसे उच्च -उत्तर -उन उनका +उनकि उनकी उनके उनको -उन्हीं -उन्हें -उन्हें -उन्हों -उस उसकी उसके -उसी -उसे -ऊपर -एक -एवं -एस -ऐसा -ऐसे -और -कई -कभी -कम -कर करता करते करना करने करें -कल कहते -कहा -का +काफि काफ़ी -कि -किए -कितना -किन्हें -किन्हों किया -किर -किस किसी किसे -की -कुछ -कुल -के -को -कोई -कौन -कौनसा -गई -गए -गया -गयी -गये -घर -जब जहाँ -जा +जहां जाता जाती जाते जाने -जितना -जिन -जिन्हें -जिन्हों -जिस -जिसमें -जिससे +जिधर जिसे जीधर +जेसा +जेसे जैसा जैसे -जो -तक -तथा -तब -तरह -तिन -तिन्हें -तिन्हों -तिस तिसे -तुम -तो -था -थी -थे -दबारा दिया -दुसरा -दूर -दूसरे -दो -दोनों -द्वारा -न +नहिं नहीं -ना -निहायत +निचे नीचे -ने -पर -पर -परंतु पहले 
+पुरा पूरा पूरे -पे -प्रति -फिर बड़ा बड़े -बनी -बही बहुत -बाद बाला बाहर -बिलकुल -बीच -भी +भितर भीतर -मगर मध्य मानो -मे -में -मै -यदि -यह यहाँ यहां -यही -या -यिह -ये रखें रहती -रहा -रहे -ऱ्वासा -लिए लिया लिये लेकर -लेकिन -व वर्ग -वह -वह वहाँ +वहां +वहिं वहीं वाले -वुह -वे -वग़ैरह -संग सकता सकती सकते सबसे -सभी -समय -साथ -साबुत -साभ सारा -से -सो -स्थान -ही -हुआ -हुई -हुए हुये -है -हैं -हो होता +होति होती होते होना होने -के +इंहिं +इंहें +इंहों +इसमें +उंहिं +उंहें +उंहों +उत्तर +कितना +कोनसा +कौनसा +जितना +जिससे +दबारा +दवारा +दुसरा +दुसरे +दूसरे +दोनों +परंतु +प्रति +रवासा +लेकिन +वगेरह +वग़ैरह +साबुत +स्थान +इतयादि +इन्हीं +इन्हें +इन्हों +उन्हीं +उन्हें +उन्हों +किंहें +किंहों +जिंहें +जिंहों +जिसमें +तिंहें +तिंहों +द्वारा +निहायत +बिलकुल +ऱ्वासा +इत्यादि +किन्हें +किन्हों +जिन्हें +जिन्हों +तिन्हें +तिन्हों diff --git a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt index 13c70d9d6f..b6f4db9c6c 100644 --- a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt @@ -1,206 +1,794 @@ -# -# This is a stop word list for the Hungarian language. 
-# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-hu/blob/master/stopwords-hu.txt +# +# (Lightly edited to remove words in the original lists that are actually meaningful) # a +abba abban +abból +addig ahhoz ahogy ahol +akár aki akik akkor +alá +alád +alájuk +alám +alánk +alapján +alátok alatt +alatta +alattad +alattam +alattatok +alattuk +alattunk +alól +alóla +alólad +alólam +alólatok +alóluk +alólunk +által +általában +ám amely +amelybol amelyek amelyekben amelyeket amelyet +amelyik amelynek ami +amíg amikor amit amolyan -amíg +amott annak +annál arra arról +át +attól az +azért +aznap azok +azokat +azokba +azokban +azokból +azokért +azokhoz +azokig +azokká +azokkal +azoknak +azoknál +azokon +azokra +azokról +azoktól azon azonban +azonnal azt aztán azután +azzá azzal -azért +bal +balra +ban +bár +bárcsak +bármilyen be +belé +beléd +beléjük +belém +belénk +belétek belül +belőle +belőled +belőlem +belőletek +belőlük +belőlünk +ben benne -bár +benned +bennem +bennetek +bennük +bennünk +búcsú cikk cikkek cikkeket csak +csakhogy +csupán de +dehogy e +ebbe ebben +ebből eddig +egész +egészen egy +egyéb +egyebek +egyebet +egyedül +egyelőre egyes +egyet egyetlen egyik +egymás egyre -egyéb -egész +egyszerre +együtt ehhez ekkor el -ellen -elsõ +elé +eléd elég +eleinte +eléjük +elém +elénk +elétek +éljen +ellen +ellenére +ellenes +elleni +elmondta elõ elõször elõtt +elsõ +első +elsők +elsősorban +elsőt +elő +előbb +elől +előle +előled +előlem +előletek +előlük +előlünk +először +előtt +előtte +előtted +előttem +előttetek +előttük +előttünk +előző emilyen +én +engem ennek +ennél +ennyi +enyém +éppen erre +erről +érte +érted +értem +értetek +értük +értünk +és +esetben +ettől +év +évben +éve +évek +éves +évi +évvel ez ezek +ezekbe +ezekben +ezekből +ezeken +ezekért +ezeket +ezekhez +ezekig +ezekké +ezekkel +ezeknek +ezeknél +ezekre +ezekről +ezektől ezen +ezentúl +ezer +ezért +ezret ezt +ezután +ezzé ezzel 
-ezért fel +fél +fele felé +felek +felet +felett +fent +fenti +fölé +gyakran +ha +halló +hamar hanem +hány +hányszor +harmadik +harmadikat +hármat +harminc +három +hat +hát +hátha +hatodik +hatodikat +hatot +hátulsó +hatvan +helyett +hét +hetedik +hetediket +hetet +hetven +hiába +hirtelen hiszen hogy hogyan +hol +holnap +holnapot +honnan +hova +hozzá +hozzád +hozzájuk +hozzám +hozzánk +hozzátok +hurrá +húsz +huszadik +ide +ide-оda +idén +igazán igen +így ill ill. illetve ilyen ilyenkor +immár +inkább +is ismét ison itt -jobban +jelenleg jó +jobban +jobbra jól +jólesik +jóval +jövőre kell +kellene kellett +kelljen +képest +kérem keressünk keresztül +kérlek +kész +késő +később +későn +két +kétszer +ketten +kettő +kettőt +kevés ki +kiben +kiből +kicsit +kicsoda +kié +kiért +kihez +kik +kikbe +kikben +kikből +kiken +kikért +kiket +kikhez +kikké +kikkel +kiknek +kiknél +kikre +kikről +kiktől +kilenc +kilencedik +kilencediket +kilencet +kilencven +kin +kinek +kinél +kire +kiről +kit +kitől +kivé +kivel kívül +korábban +körül +köszönhetően +köszönöm +közben +közé +közel +közepén +közepesen között közül +külön +különben +különböző +különbözőbb +különbözőek +lassan +le legalább legyen lehet +lehetetlen lehetett +lehetőleg +lehetőség lenne +lennék +lennének lenni lesz +leszek +lesznek +leszünk lett +lettek +lettem +lettünk +lévő +ma maga +magad +magam magát +magatokat +magukat +magunkat +mai majd -majd +majdnem +manapság +már +más +másik +másikat +másnap +második +másodszor +mások +másokat +mást meg +még +megcsinál +megcsinálnak +megint +mégis +megvan +mellé +melléd +melléjük +mellém +mellénk +mellétek mellett +mellette +melletted +mellettem +mellettetek +mellettük +mellettünk +mellől +mellőle +mellőled +mellőlem +mellőletek +mellőlük +mellőlünk mely melyek +melyik +mennyi mert mi +miatt +miatta +miattad +miattam +miattatok +miattuk +miattunk +mibe +miben +miből +miért +míg +mihez +mik +mikbe +mikben +mikből +miken +mikért +miket +mikhez +mikké +mikkel +miknek +miknél mikor 
+mikre +mikről +miktől milyen +min +mind +mindegyik +mindegyiket minden +mindenesetre mindenki mindent +mindenütt mindig +mindketten +minek +minél mint mintha +mire +miről mit +mitől +mivé mivel -miért +mögé +mögéd +mögéjük +mögém +mögénk +mögétek +mögött +mögötte +mögötted +mögöttem +mögöttetek +mögöttük +mögöttünk +mögüle +mögüled +mögülem +mögületek +mögülük +mögülünk +mondta most -már -más -másik -még -míg +mostanáig +múltkor +múlva +na nagy nagyobb nagyon +nála +nálad +nálam +nálatok +náluk +nálunk +naponta +napot ne +négy +negyedik +negyediket +négyet +negyven +néha +néhány +neked nekem neki +nekik +nektek +nekünk +nélkül nem +nemcsak +nemrég nincs -néha -néhány -nélkül +nyolc +nyolcadik +nyolcadikat +nyolcat +nyolcvan +o +õ +oda +ok +õk +õket olyan +ön +önbe +önben +önből +önért +önhöz +onnan +önnek +önnel +önnél +önök +önökbe +önökben +önökből +önökért +önöket +önökhöz +önökkel +önöknek +önöknél +önökön +önökre +önökről +önöktől +önön +önre +önről +önt +öntől +össze +öt +óta +ötödik +ötödiket +ötöt ott +ötven +pár pedig +például persze rá +rád +rajta +rajtad +rajtam +rajtatok +rajtuk +rajtunk +rájuk +rám +ránk +rátok +régen +régóta +rendben +részére +rögtön +róla +rólad +rólam +rólatok +róluk +rólunk +rosszul s saját +se sem semmi +semmilyen +semmiség +senki +soha sok +sokáig +sokan sokat sokkal +sokszor +során +stb. 
+számára +száz +századik +százat szemben +szépen szerint +szerinte +szerinted +szerintem +szerintetek +szerintük +szerintünk +szervusz +szét szinte -számára +szíves +szívesen +szíveskedjék +sőt talán +tavaly +távol +te +téged +tegnap +tegnapelőtt tehát +tele teljes +tényleg +tessék +ti +tied +titeket +tíz +tizedik +tizediket +tizenegy +tizenegyedik +tizenhárom +tizenhat +tizenhét +tizenkét +tizenkettedik +tizenkettő +tizenkilenc +tizennégy +tizennyolc +tizenöt +tizet +több +többi +többször tovább továbbá -több -ugyanis -utolsó +további +túl +tőle +tőled +tőlem +tőletek +tőlük +tőlünk +úgy +ugyanakkor +ugyanez +ugyani +ugye +úgyis +úgynevezett +új +újabb +újra +úr +urak +uram +urat után utána +utoljára +utolsó vagy vagyis vagyok +vagytok +vagyunk +vajon +valahol valaki +valakit +valamelyik valami valamint való van vannak +végén +végre +végül vele +veled +velem +veletek +velük +velünk vissza +viszlát viszont +viszontlátásra volna +volnának +volnék volt voltak voltam voltunk -által -általában -át -én -éppen -és -így -õ -õk -õket -össze -úgy -új -újabb -újra +ő +ők +őket +őt diff --git a/apps/common/src/python/mediawords/languages/it/it_stop_words.txt b/apps/common/src/python/mediawords/languages/it/it_stop_words.txt index 4448e81c70..b0d8b6a12d 100644 --- a/apps/common/src/python/mediawords/languages/it/it_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/it/it_stop_words.txt @@ -1,27 +1,57 @@ -# -# This is a stop word list for the Italian language. 
-# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-it/blob/master/stopwords-it.txt +# +# (Lightly edited to remove words in the original lists that are actually meaningful) # a +abbastanza abbia abbiamo abbiano abbiate +accidenti ad +adesso +affinché agl agli +ahime +ahimè ai al +alcuna +alcuni +alcuno all alla alle allo +allora +altre +altri +altrimenti +altro +altrove +altrui anche +ancora +anni +anno +ansa +anticipo +assai +attesa +attraverso +avanti avemmo avendo +avente +aver +avere +averlo avesse avessero avessi @@ -35,6 +65,7 @@ avevano avevate avevi avevo +avrà avrai avranno avrebbe @@ -45,22 +76,67 @@ avremo avreste avresti avrete -avrà avrò avuta avute avuti avuto +basta +ben +bene +benissimo +brava +bravo +buono c +caso +cento +certa +certe +certi +certo che chi +chicchessia +chiunque ci +ciascuna +ciascuno +cima +cinque +cio +ciò +cioe +cioè +circa +citta +città +co +codesta +codesti +codesto coi col +colei +coll +coloro +colui come +cominci +comprare +comunque con +concernente +conclusione +consecutivi +consecutivo contro +cos +cosa +cosi +così cui +d da dagl dagli @@ -70,6 +146,8 @@ dall dalla dalle dallo +dappertutto +davanti degl degli dei @@ -78,25 +156,58 @@ dell della delle dello +dentro +detto +deve +devo di +dice +dietro +dire +dirimpetto +diventa +diventare +diventato +dopo +doppio dov dove +dovra +dovrà +dovunque +due +dunque +durante e +è ebbe ebbero ebbi +ecc +ecco ed +effettivamente +egli +ella +entrambi +eppure era erano eravamo eravate eri ero +esempio +esse essendo +esser +essere +essi +ex +fa faccia facciamo facciano -facciate faccio facemmo facendo @@ -114,8 +225,10 @@ facevi facevo fai fanno +farà farai faranno +fare farebbe farebbero farei @@ -124,44 +237,112 @@ faremo fareste faresti farete -farà -farò +fatto +favore fece fecero feci +fin +finalmente +finche +fine +fino +forse +forza fosse fossero fossi fossimo foste fosti +fra +frattempo fu fui fummo +fuori 
furono +futuro +generale +gente +gia +già +giorni +giorno +giu gli +gliela +gliele +glieli +glielo +gliene +grande +grazie +gruppo ha +haha hai hanno ho i +ie +ieri il in +inc +indietro +infatti +inoltre +insieme +intanto +intorno +invece io l la +là +lasciato +lato le lei li lo +lontano loro lui +lungo +luogo ma +macche +magari +mai +male +malgrado +malissimo +me +medesimo +mediante +meglio +meno +mentre +mesi +mezzo mi mia mie miei +mila +miliardi +milioni +minimi mio +modo +molta +molti +moltissimo +molto +momento +mondo ne negl negli @@ -171,29 +352,127 @@ nell nella nelle nello +nemmeno +neppure +nessun +nessuna +nessuno +niente +no noi +nome non +nondimeno +nonostante +nonsia nostra nostre nostri nostro +novanta +nove +nulla +nuovi +nuovo o +od +oggi +ogni +ognuna +ognuno +oltre +oppure +ora +ore +osi +ossia +ottanta +otto +paese +parecchi +parecchie +parecchio +parte +partendo +peccato +peggio per +perche perché +perchè +percio +perciò +perfino +pero +però +persino +persone +piedi +pieno +piglia +piu più +piuttosto +po +pochissimo +poco +poi +poiche +possa +possedere +posteriore +posto +potrebbe +preferibilmente +presa +press +prima +primo +principalmente +probabilmente +promesso +proprio +puo +può +pure +purtroppo +qua +qualche +qualcosa +qualcuna +qualcuno quale +quali +qualunque +quando quanta quante quanti quanto +quantunque +quarto +quasi +quattro +quel quella quelle quelli quello +quest questa queste questi questo +qui +quindi +quinto +realmente +recente +recentemente +riecco +salvo +sara +sarà sarai saranno sarebbe @@ -204,21 +483,41 @@ saremo sareste saresti sarete -sarà sarò +scorso se +secondo +seguente +seguito sei +sembra +sembrare +sembrato +sembrava +sembri +sempre +senza +sette si sia siamo siano siate siete +sig +solito +solo +soltanto sono +sopra +soprattutto +sotto +spesso sta stai stando stanno +starà starai staranno starebbe @@ -229,8 +528,11 @@ staremo stareste staresti starete -starà starò +stata +state +stati +stato stava stavamo 
stavano @@ -238,10 +540,12 @@ stavate stavi stavo stemmo +stessa stesse stessero stessi stessimo +stesso steste stesti stette @@ -254,6 +558,9 @@ stiate sto su sua +subito +successivamente +successivo sue sugl sugli @@ -265,22 +572,56 @@ sulle sullo suo suoi +tale +tali +talvolta +tanto +te +tempo +terzo +th ti +titolo tra +tranne +tre +trenta +triplo +troppo +trovato tu tua tue tuo tuoi +tutta +tuttavia +tutte tutti tutto +uguali +ulteriore +ultimo un una uno +uomo +va +vai +vale +vari +varia +varie +vario +verso vi +vicino +visto +vita voi +volta +volte vostra vostre vostri vostro -è diff --git a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt index bfff6d32ff..1f5e21f4f3 100755 --- a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt @@ -1,6 +1,7 @@ # # This is a stop word list for the Japanese language. # +# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: # https://github.com/stopwords/japanese-stopwords/blob/master/data/japanese-stopwords.txt # Lucene's stopwords_ja.txt @@ -50,7 +51,6 @@ url いつ いま います -いや いる いろいろ う @@ -92,13 +92,11 @@ url これから これら ご -ごっちゃ ごと ごろ さ さま さまざま -さらい さらに される さん @@ -252,7 +250,6 @@ url ひと ひとつ ふく -ぶり へ への へん @@ -360,7 +357,6 @@ url 作 作ら 例 -係 俺 個 億 @@ -400,38 +396,15 @@ url 向け 向こう 和 -哀 -品 -員 -喜 -器 四 回 -国 -土 在 -地 -報じ -場 -場合 -境 -士 -夏 -外 多く 大 女 奴 婦 子 -字 -安 -官 -室 -家 -対 -小 -屋 巡る 左 市 @@ -442,79 +415,49 @@ url 店 府 度 -式 形 役 彼 彼女 後 -怒 -思わ -性 -情 -感 -感じ 我々 所 手 手段 -扱い 数 文 新た -新着 方 方法 日 -春 時 時点 時間 -更新 -書 月 期 -木 未満 -末 -本 本当 -村 -束 -枚 -校 -楽 様 様々 次 歳 -歴 段 毎 毎日 -気 -水 求め -法 -派 -火 点 -版 特に 玉 用 男 町 -界 略 百 -的 目 相 県 確か 示し -社 私 私達 秋 @@ -530,7 +473,6 @@ url 結局 続き 線 -署 考え 者 自体 @@ -544,34 +486,23 @@ url 計 話 話し -誌 語っ 読む 誰 課 -調べ -論 貴方 貴方方 輪 近く 述べ -通 -速報 -連 週 道 達 違い 選 -部 -都 -金 -銭 開か 間 関 -関係 関連 際 集 diff --git 
a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt old mode 100755 new mode 100644 index 69707d4e8c..766f17d86f --- a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt @@ -1,19 +1,78 @@ -# -# This is a stop word list for the Lithuanian language. -# # Sources: +# # http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html +# https://github.com/stopwords-iso/stopwords-lt/blob/master/stopwords-lt.txt # auto-generated sources # +# a +á +abi +abidvi +abiejø +abiejose +abiejuose +abiem +abigaliai +abipus +abu +abudu +ai +ákypai +ana +anaiptol +anaisiais +anàja +anàjá +anàjà +anajai +anajam +anajame +anapus +anas +anasai +anàsias +anasis +anei +aniedvi +anieji +aniesiems +anoji +anojo +anøjø +anojoje +anokia +anoks +anosiomis +anosioms +anosios +anosiose +anot +ant +antai +anuodu +anuoju +anuosiuose +anuosius apie +aplink ar arba +argi +arti +ástriþai +aukðèiau +að aš be bei +beje +bemaþ +bent bet +betgi +beveik bus buvo būti @@ -22,54 +81,178 @@ d dabar dar darbo +dargi daryti daug daugiau daugiausia +daugmaþ dažnai +deja +dëka +dël +dëlei +dëlto dieną dėl +ech +et +gal +galbût +galgi gali +gan +gana gauna gauti +gi +greta +idant iki ir +irgi +it +itin +ið +iðilgai +iðvis iš +jaisiais +jájá +jàja +jàjà +jajai +jajam +jajame jam +jàsias jau jei +jeigu ji jie +jiedu +jiedvi +jieji +jiesiems +jinai jis +jisai jo jog +joji +jojo +jøjø +jojoje jos +josiomis +josioms +josios +josiose +judu +judvi +juk +jumis +jums +jumyse +juodu +juoju +juosiuose +juosius +jus +jûs +jûsiðkë +jûsiðkis +jûsø jį jų kad kada +kadangi kai kaip +kaipgi kam -kartą kas +katra +katras +katriedvi +kaþin +kaþkas +kaþkatra +kaþkokia +kaþkuri +kaþkuris +kiaurai +kiek kiekvienas +kieno +kita kitas +kitokia +kitoks klausimas klausti +kodël +kokia +koks +kol +kolei kovo +kuomet kur +kuri kurie +kuriedvi 
kurios kuris +kuriuodu kurių labai +lai lietuva lietuvoje lietuvos +lig +ligi +link +lyg m man +manaisiais +manàja +manàjá +manàjà +manajai +manajam +manajame +manas +manæs +manasai +manàsias +manasis +mane +manieji +maniesiems +manim +manimi +maniðkis mano +manoji +manojo +manøjø +manojoje +manosiomis +manosioms +manosios +manosiose +manuoju +manuosiuose +manuosius +manyje +mat +maþdaug +maþne mažai mažas mažiau @@ -78,56 +261,280 @@ metais metu metus metų +mudu +mudvi +mumis +mums +mumyse +mus +mûsiðkë +mûsiðkis +mûsø mūsų +na +nagi ne +në +nebe +nebent negali +negi negu nei +nejau nes net +netgi +netoli +neva niekada niekas nors nuo nėra o +ogi +oi +paèiais +paèiam +paèiame +paeiliui +paèiø +paèiu +paèiuose +paèius pagal +pakeliui +palaipsniui +palei +pas pasak pasakė +paskos +paskui +paskum pat +patá +pati +patiems +paties +pats +patys per +pernelyg +pirm +pirma +pirmiau po prašau prie +prieð +prieðais prieš +pro +pusiau r +rasi reikia +rodos sakyti sakė +sau +savaisiais +savàja +savàjá +savàjà +savajai +savajam +savajame +savas +savæs +savasai +savàsias +savasis +save +savieji +saviesiems +savimi savo +savoji +savojo +savøjø +savojoje +savosiomis +savosioms +savosios +savosiose +savuoju +savuosiuose +savuosius +savyje +skersai +skradþiai +staèiai su +sulig +ta +tad +taèiau tai +taigi taip taip pat +taipogi +tàja +tàjá +tàjà +tajai +tajam +tajame +tamsta tarp +tarsi +tartum +tarytum tas +tasai +tàsias +tau +tavaisiais +tavàja +tavàjà +tavajai +tavajam +tavajame +tavas +tavæs +tavàsias +tavasis +tave +tavieji +taviesiems +tavimi +taviðkë +taviðkis tavo +tavoji +tavojo +tavøjø +tavojoje +tavosioms +tavosios +tavosiose +tavuoju +tavuosiuose +tavuosius +tavyje tačiau +te +tegu +tegul +tiedvi +tieji +ties +tiesiems +tiesiog tik tikrai +tikriausiai +tiktai to todėl +toji +tojo +tøjø +tojoje +tokia +toks +tol +tolei +toliau +tosiomis +tosioms +tosios +tosiose +tu +tûlas tuo +tuodu +tuoju +tuosiuose +tuosius +turbût turi turėjo +uþ +uþtat +uþvis už +va +vai val +vël 
+vëlgi +viduj +vidury +vien vienas +vienokia +vienoks +vietoj +virð +virðuj +virðum +vis +vis dëlto +visa +visas +visgi visi +visokia +visoks +vos +ypaè yra -čia +ðájá +ðalia +ðe +ði +ðiaisiais +ðiàja +ðiàjà +ðiajai +ðiajam +ðiajame +ðiapus +ðiàsias +ðiedvi +ðieji +ðiesiems +ðioji +ðiojo +ðiøjø +ðiojoje +ðiokia +ðioks +ðiosiomis +ðiosioms +ðiosios +ðiosiose +ðis +ðisai +ðit +ðita +ðitas +ðitiedvi +ðitokia +ðitoks +ðituodu +ðiuodu +ðiuoju +ðiuosiuose +ðiuosius +ðtai +þemiau į šalia -šalies +čia šios žmonių diff --git a/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt b/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt index 1ee9a2887d..97e6741d4f 100644 --- a/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt @@ -1,108 +1,417 @@ -# -# This is a stop word list for the Dutch language. -# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-nl/blob/master/stopwords-nl.txt +# +# (Lightly edited to remove words in the original lists that are actually meaningful) # aan +aangaande +aangezien +achte +achter +achterna +af +afgelopen al +aldaar +aldus +alhoewel +alias +alle +allebei +alleen alles als +alsnog altijd +altoos +ander andere +anders +anderszins +beetje +behalve +behoudens +beide +beiden ben +beneden +bent +bepaald +betreffende bij +bijna +bijv +binnen +binnenin +blijkbaar +blijken +boven +bovenal +bovendien +bovengenoemd +bovenstaand +bovenvermeld +buiten +bv daar +daardoor +daarheen +daarin +daarna +daarnet +daarom +daarop +daaruit +daarvanlangs dan dat de +deden +deed der +derde +derhalve +dertig deze +dhr die +dikwijls dit doch +doe doen +doet door +doorgaand +drie +duizend dus +echter een eens +eer +eerdat +eerder +eerlang +eerst +eerste +eigen +eigenlijk +elk +elke en +enig +enige +enigszins +enkel er +erdoor +erg +ergens +etc +etcetera +even +eveneens +evenwel +gauw ge +gedurende 
geen +gehad +gekund +geleden +gelijk +gemoeten +gemogen +genoeg geweest +gewoon +gewoonweg haar +haarzelf had +hadden +hare heb hebben +hebt +hedden heeft +heel hem +hemzelf +hen het +hetzelfde hier +hierbeneden +hierboven +hierin +hierna +hierom hij +hijzelf hoe +hoewel +honderd hun +hunne +ieder +iedere +iedereen iemand iets ik +ikzelf in +inderdaad +inmiddels +intussen +inzake is ja je +jezelf +jij +jijzelf +jou +jouw +jouwe +juist +jullie kan +klaar kon +konden +krachtens +kun kunnen +kunt +laatst +later +liever +lijken +lijkt +maak +maakt +maakte +maakten maar +mag +maken me meer +meest +meestal men met +mevr +mezelf mij mijn +mijnent +mijner +mijzelf +minder +miss +misschien +missen +mits +mocht +mochten +moest +moesten moet +moeten +mogen +mr +mrs +mw na naar +nadat +nam +namelijk +nee +neem +negen +nemen +nergens +net +niemand niet niets +niks +noch +nochtans nog +nogal +nooit nu +nv of +ofschoon om omdat +omhoog +omlaag +omstreeks +omtrent +omver +ondanks onder -ons +ondertussen +ongeveer +onszelf +onze +onzeker +ooit ook op +opnieuw +opzij over +overal +overeind +overige +overigens +paar +pas +per +precies +recent reeds +rond +rondom +samen +sedert +sinds +sindsdien +slechts +sommige +spoedig +steeds +tamelijk te -tegen +tenzij +terwijl +thans +tien +tiende +tijdens +tja toch +toe toen +toenmaals +toenmalig tot +totdat +tussen +twee +tweede u uit +uitgezonderd uw +vaak +vaakwat van +vanaf +vandaan +vanuit +vanwege veel +veeleer +veertig +verder +verscheidene +verschillende +vervolgens +via +vier +vierde +vijf +vijfde +vijftig +vol +volgend +volgens voor +vooraf +vooral +vooralsnog +voorbij +voordat +voordezen +voordien +voorheen +voorop +voorts +vooruit +vrij +vroeg +waar +waarom +waarschijnlijk +wanneer want waren was wat +we +wederom +weer +weg +wegens +weinig +wel +weldra +welk +welke werd +werden +werder wezen +whatever wie +wiens +wier +wij +wijzelf wil +wilden +willen +word worden wordt zal ze +zei +zeker zelf +zelfde +zelfs +zes +zeven zich 
+zichzelf zij zijn +zijne +zijzelf zo +zoals +zodat +zodra zonder zou +zouden +zowat +zulk +zulke +zullen +zult diff --git a/apps/common/src/python/mediawords/languages/no/no_stop_words.txt b/apps/common/src/python/mediawords/languages/no/no_stop_words.txt index 2fd8a00993..f29adcd41e 100644 --- a/apps/common/src/python/mediawords/languages/no/no_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/no/no_stop_words.txt @@ -1,13 +1,19 @@ -# -# This is a stop word list for the Norwegian language. -# # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-no/blob/master/stopwords-no.txt +# +# (Lightly edited to remove words in the original lists that are actually meaningful) # +å alle +andre +arbeid at av +både +båe bare begge ble @@ -15,9 +21,10 @@ blei bli blir blitt -både -båe +bort +bruke da +då de deg dei @@ -39,7 +46,6 @@ ditt du dykk dykkar -då eg ein eit @@ -47,15 +53,26 @@ eitt eller elles en +ene +eneste +enhver enn er et ett etter +få +folk for +før fordi +forsûke fra -før +fûr +gå +gjorde +gjûre +god ha hadde han @@ -84,11 +101,11 @@ hvorfor i ikke ikkje -ikkje ingen ingi inkje inn +innen inni ja jeg @@ -104,8 +121,15 @@ kvarhelst kven kvi kvifor +lage +lang +lik +like +må +makt man mange +måte me med medan @@ -113,13 +137,21 @@ meg meget mellom men +mens +mer +mest mi min mine mitt mot +mye mykje +nå +når +navn ned +nei no noe noen @@ -128,8 +160,7 @@ noko nokon nokor nokre -nå -når +ny og også om @@ -137,35 +168,47 @@ opp oss over på +part +punkt +så samme +sånn +sant seg selv si -si sia sidan siden sin sine +sist sitt sjøl skal skulle slik +slutt so som -som somme somt -så -sånn +start +stille +tid til +tilbake +tilstand um +under upp ut uten var +vår +være vart +vært varte ved vere @@ -173,11 +216,9 @@ verte vi vil ville +vite vore +vöre vors vort -vår -være -være -vært -å +vört diff --git a/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt 
b/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt index d49861eea5..c518b2c5eb 100644 --- a/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt @@ -1,11 +1,13 @@ -# -# This is a "long" stop word list for the Portuguese language. -# +# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: +# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-pt/blob/master/stopwords-pt.txt +# # a +à a meta abaixo abastecimento @@ -22,18 +24,17 @@ abrir abriu absoluta absolutamente -absurdo -abuso acaba acabam acabar acabaram acabou -academia +ação acaso aceita aceitar aceitou +acerca acertar acertou acesso @@ -43,8 +44,6 @@ achar achei acho achou -acidente -acidentes acima acompanha acompanhada @@ -60,144 +59,52 @@ acontecerá aconteceu acontecido acontecimentos -acordo acredita acreditam acreditar acredito acrescenta acrescentou -acumulado -acusado -acusados -acusação -acusações -adequada -adequado -adesão +adeus adianta adiante adiantou -administrador -administrar -administrativa -administrativo -administração -admite -admitiu -adolescente -adolescentes -adotar -adoção -adquirir -adultos -adversário -adversários -advogada -advogado -advogados -aeroporto afastado afastar afinal -afirma -afirmam afirmando afirmar afirmou -agenda -agente -agentes agir agora agosto agradecer -agressão -agricultores -agricultura -agrícola aguarda aguardar -agência -agências +águas ah +aí ainda -ajuda -ajudam -ajudar -ajudou ala alcançar alega alegou -alegre -alegria -alemão +alem +além alerta +algmas algo +alguém algum alguma algumas alguns -alguém ali -aliado -aliados -aliança -alimentar -alimentação -alimento -alimentos aliás -alma -almoço -alta -altas -alteração -alterações -alternativa -alternativas -alto -altos -altura -aluguel -aluno -alunos alves -alvinegro -alvo -além -ama amanhã -amarelo ambas 
-ambientais -ambiental -ambiente ambos -ameaça -ameaças -americano -americanos -amiga -amigo -amigos -amizade -amor -ampla -ampliar -ampliação -amplo -analisa -analisar -analistas -anda -andamento -andar -animais -animal -animação -aniversário ano anos ante @@ -212,151 +119,63 @@ anual anuncia anunciado anunciar -anunciou -análise anúncio +anunciou ao +aonde aos aparece aparecem aparecer apareceu aparecida -aparelho -aparelhos -apartamento -apelo apenas apesar +aplicação aplicada aplicado aplicar -aplicação apoia apoiar apoio aponta apontam +apontar apontou -aposentado -aposentadoria -aposentados -aposta -apreensão -aprender -aprendizado -apresenta -apresentada -apresentadas -apresentado -apresentados -apresentam -apresentar -apresentaram -apresentação -apresentações -apresentou -aprovada -aprovado -aprovados -aprovar -aprovação -aproveitar -aproveitou +apos +após aproximadamente apuração -após aquela aquelas aquele aqueles aqui aquilo -aquisição ar -areia -arena -argumento -argumentos -arma -armado -armas -arrecadação -arroz -arruda -art -arte -artes -artigo -artigos -artilheiro -artista -artistas +área +áreas as -asfalto -aspecto -aspectos -assaltantes -assalto -assassinato -assembleia -assessor -assessores -assessoria +às assim -assinado -assinar -assinatura -assinou -assistente -assistir -assistência -associados -associação -associações assume assumir assumiu assunto assuntos at -atacante -atacar -ataque -ataques +até atende atendendo atender atendidas -atendido -atendidos -atendimento -atendimentos -atento -atenção -atinge -atingido -atingir -atingiu -atitude -atitudes -ativa -atividade -atividades -atleta -atletas ato -ator -atores atos atrair +atrás atraso através -atração -atrações -atriz -atrás -atua +atuação atuais atual atualizado @@ -364,102 +183,27 @@ atualmente atuam atuando atuar -atuação atuou -até -auditório -audiência -aula -aulas aumenta aumentando aumentar aumento aumentou -ausência -automóveis -automóvel -autonomia -autor -autores -autoria 
-autoridade -autoridades -autorização -autos -auxiliar -auxílio -avalia -avaliar -avaliação avaliou -avançar -avanço -avanços -avançou -avenida -avisa -avião -avó -azul -ação -ações -aérea -aí -baiano -baile -bairro -bairros -baixa -baixo -baixos -balanço -bancada -banco -bancos -banda -bandas -bandeira -bandido -bandidos -banheiro -banho -bar -barato -barco -bares -barra -barreiras -barros -barulho base baseado bases +básica +básicas +básico basta bastante -bastidores -batalha -batalhão -bate -bater -bateria -bateu -beber -bebida -bebidas -bebê -beira -bela -beleza -belo bem beneficiar benefício benefícios bens bernardo -biblioteca -bicicleta bilhão bilhões bloco @@ -467,175 +211,28 @@ blocos blog boa boas -boca -bola -boletim -bolsa -bolsas -bolso bom -bomba -bombeiros -bonita -bonito bons -branca -branco -brancos -brasileiras -brasileiro -brasileiros -braço -braços -breve -briga -brilhante -brincadeira -brincar -brinquedos -bruto -buraco -buracos -busca -buscam -buscando -buscar -básica -básicas -básico +cá cabe cabeceou -cabelo -cabelos -cabeça -cabo -cachorro cada -cadastro -cadeia -cadeira -cadeiras cadê -cai -cair -caiu -caixa -caixas -calendário -calma -calor -calçada -cama -caminhada -caminho -caminhos -caminhão -caminhões -camisa -campanha -campanhas -campeonato -campeão -campeões -campo -campos -campus -cana -canal -candidata -candidato -candidatos -candidatura -candidaturas -canto -cantor -cantora -caos -capa -capacidade -capacitação -capaz -capazes -capitais -capital -capitão -capixaba -capítulo -cara -característica -características -caras -carga -cargo -cargos -carinho -carioca -carne -caro -carreira -carro -carros -carta -cartas -carteira -cartão -cartório -cartões -caruaru -caráter -casa -casado -casal -casamento -casar -casas -caso -casos -cassado -cassação -castelo catarinense categoria categorias -causa -causar -causas -causou -caíram -cd -cedo -celular -cem -cemitério -cena +cenário cenas centenas cento centrais central -centro -centros 
-cenário -cerca -cerimônia -certa -certamente -certas -certeza -certo -certos -cerveja -chama chamada chamado chamados chamar chamou -chance -chances -chapa -chave -chefe +chão chega chegada chegam @@ -645,204 +242,59 @@ chegar chegaram chegou cheguei -cheia -cheio -cheiro -cheque -choque -chute -chutou -chuva -chuvas -chão -ciclo -cidadania -cidade -cidades -cidadão -cidadãos -cientistas cima cinco -cinema -circo -circuito -circulação -cirurgia cita citado citar citou -civil -civis -ciência -ciências -classe -classes -classificação -cliente -clientes -clima -clique -clube -clubes -clássico -clínica -cobertura -cobra -cobrança -cobrar -cobrou -cofres coisa coisas colaboradores -colega -colegas -coleta -coletiva -coletivo -coleção -coligação coloca +colocação colocada colocado colocados colocando colocar -colocação colocou coloque -colorado -coluna -colunista -colégio com -comandante -comando -combate -combater -combustível -comecei -comemora -comemorar -comemoração -comemorou -comenta -comentar -comentou -comentário -comentários -comer -comerciais -comercial -comercialização -comerciante -comerciantes começa começam começando começar começaram +comecei começo começou -comida +comenta +comentar +comentário +comentários +comentou comigo -comissão -comitê como -companheiro -companheiros -companhia -companhias -comparação -competente -competição -competições -competência complementar -completa completamente -completar -completo -completou -complexo -complicado -compor -comportamento -composição -composta -composto -compra -comprar -compras -compreensão -compromisso -compromissos -comprou -computador -computadores -comum -comunicado -comunicação -comunidade -comunidades -comuns -comércio -conceito -conceitos -concentração -concessão -conclui -concluir -concluiu -conclusão -concorda concordo -concorrentes -concorrer -concorrência -concreto -concurso -concursos -condenado -condenação -condição -condições -conduta -confederação conferir -conferência confiança confira 
confirma confirmado confirmar confirmou -conflito -conflitos conforme -conforto -confronto -confusão conhece conhecem conhecer conheceu conhecida conhecido -conhecidos -conhecimento -conhecimentos conheço conjunto -conquista -conquistar -conquistas -conquistou -consciente -conscientização -consciência consegue conseguem consegui @@ -851,45 +303,21 @@ conseguimos conseguir conseguiram conseguiu -conselheiro -conselho -conselhos -consenso -conservação considera +consideração considerada considerado considerados considerando considerar -consideração considerou consigo consta -constante -constantes -constitucional -construir -construção -construída -construído -consulta -consultas -consultoria -consumidor -consumidores -consumo -consórcio -conta -contam -contando contar contará -contas contato contatos conter -contexto conteúdo continua continuam @@ -900,492 +328,147 @@ continuidade continuou contou contra -contrapartida -contratado -contratados -contratar -contratação -contratações -contrato -contratos -contribuinte -contribuir -contribuição -controlar -controle contrário contudo -convencer -convenção -conversa -conversar -conversas -convidado -convidados -convite -conviver -convivência -convênio -cooperativa -coordenador -coordenadora -coordenação -cor -coragem -coração -cores -coronel -corpo -corpos -corre -correndo -corrente -correr -correta -correto -correção -corrida -cortar -corte -cortes costas costuma costumam cotidiano -cozinha -credibilidade -creio -cresce -crescendo -crescente -crescer -cresceu -crescimento -cria -criada criado criando -crianças criar -criatividade -criação -crime -crimes -criminal -criminalidade -criminosos criou -crise -criticar -criticou -critério -critérios -cruzamento -cruzes -cruzou -crédito -créditos -crítica -críticas -crítico -cuidado -cuidados -cuidar cuja +cujas cujo -culpa -cultura -culturais -cultural -cumprimento -cumprir -cumpriu -cunha -currículo -curso -cursos -curto -custa -custo -custos -cá -cães -cérebro -céu -código 
-cúpula -da -dada -dado +cujos dados +daí +dança dando -danos dantas -dança daquela +daquelas daquele daqueles daqui -dar -daria -dará -das -data -dava -daí de -de deus -debaixo -debate -debates -decide -decidir -decidiu -decisão -decisões -declaração -declarações -declarou -decoração -decreto -dedicação -defende -defender -defendeu -defensor -defesa -deficiência -define -definida -definido -definir -definitivamente -definiu -definição -deixa deixado -deixam -deixando -deixar -deixaram -deixe -deixou -dela -delas -dele -delegacia -delegado -deles -demais -demanda -demandas -demissão -democrático -demonstra -demonstrou -demora -demorou dentre dentro -denunciar -denúncia -denúncias -departamento -depende dependendo depender -depoimento -depoimentos depois -deputada -deputado -deputados -der -deram -derrota -derrotado -desafio -desafios -descoberta -descobre -descobrir -descobriu -desconto -desculpas desde -deseja -desejar -desejo -desembargador -desempenho -desemprego -desenvolver -desenvolvido -desenvolvimento -desespero -desfile -despesas +desligado dessa dessas desse desses desta -destaca -destacar -destacou -destaque -destaques destas deste destes -destinado -destinados -destino -desvio -desviou -detalhe -detalhes -determina -determinado -determinação -determinou deu deve deve-se devem devemos dever -deveria -deveriam deverá deverão +deveria +deveriam devia devido dez +dezanove +dezasseis +dezassete dezembro dezenas +dezoito dia dia-a-dia -diagnóstico diante +diária diariamente +diárias +diário +diária +diariamente +diárias +diário +diária +diariamente +diárias +diário dias dica dicas -diferente -diferentes diferença diferenças +diferente +diferentes +difíceis +difícil dificilmente dificuldade dificuldades -difíceis -difícil diga digital dignidade digo diminuir -diminuição -dinheiro -diploma +direção +direita direito direta diretamente -direto -diretor -diretora -diretores -diretoria -diretório -direção diria -dirigente -dirigentes -dirigir disciplina -disco 
-discurso -discussão -discussões -discutir +dispõe +dispoem disponíveis disponível -disposição disposto -disputa -disputar -dispõe disse disseram disso -distante -distribuição -distribuídos distrito dito -diversas diversos -divisão -divulgada -divulgado -divulgados -divulgar -divulgação -divulgou diz dizem dizendo dizer dizia -diálogo -diária -diárias -diário do -doação -doações -doce -documentação -documento -documentos -doente -doença -doenças dois domingo -domínio -dona -dono -donos -dor -dores -dormir dos -dose -doutor +doze duas -duelo -dupla -dura -durante duração -duro +durante durou dutra -dvd -dão -década -décadas -déficit -dívida -dívidas -dólar -dólares -dúvida -dúvidas e -e-mail -economista -econômica -econômicas -econômico -econômicos -edital -edição -edições -educacional +é efeito efeitos -efetivamente -efetivo -eficiente -eficiência eis ela -elaboração elas ele -eleger -elegeu -eleita -eleito -eleitor -eleitorado -eleitorais -eleitores -eleitos -eleição -eleições -elementos -elenco eles -eletrônica -eletrônico -elevado -elevação -elite -elogios -elétrica em embora -emenda -emendas -emergência -emissora -emissoras -emissão -emocional -emoção -emoções -empate -empatou -empenho -empreendimento -empreendimentos -empregados -emprego -empregos -empresa -empresarial -empresas -empresário -empresários -empréstimo -empréstimos -encaminhado -encaminhados -encarar -encerramento -encerrou -enchentes -encontra -encontrada -encontrado -encontrados -encontram -encontrar -encontraram -encontro -encontros -encontrou -endereço -energia -enfatizou enfim -enfrenta -enfrentar -engenharia -engenheiro -enorme enquanto -ensinar -ensino entanto -entende -entender -entendeu -entendimento +entao +então entendo entidade entidades @@ -1395,117 +478,41 @@ entram entrar entraram entre -entrega -entregar -entregou -entregue -entregues -entretanto -entrevista -entrevistados -entrevistas -entrou -então -enviado -enviar -enviou -envolve -envolvendo -envolvido -envolvidos -envolvimento 
-episódio -equilíbrio -equipamento -equipamentos -equipe -equipes -equivalente era eram -errado -erro -erros -escala -escanteio -esclarecer -escola -escolar -escolas -escolha -escolher -escolheu -escolhido -esconder -escrever -escreveu -escrita -escrito -escritor -escritório -esforço -esforços -espanhol -espaço -espaços -especiais -especial -especialista -especialistas -especializada +éramos +és especialmente específica específico espera esperado esperamos -esperando esperança +esperando esperar esperava espero -espetáculo -espiritual -esportiva -esportivo -esposa -espécie -espécies -espírito -esquecer -esquema esquerdo esquina essa essas esse -essencial esses esta -estabelece -estabelecer -estabelecimento -estabelecimentos -estabilidade -estacionamento -estado -estados -estaduais -estadual +está estamos +estão estar -estaria -estariam estará estarão +estaria +estariam estas -estatal -estatuto -estatística -estatísticas +estás estava estavam -estação +estávamos este esteja estejam @@ -1514,176 +521,62 @@ estes esteve estilo estimativa -estimular estive estivemos estiver estivera estiveram +estivéramos estiverem estivermos estivesse estivessem -estivéramos estivéssemos +estiveste +estivestes estou -estrada -estradas -estrangeiros -estranho -estratégia -estratégias -estreia -estrela -estrelas -estrutura -estudar -estudo -estudos -está -estádio -estágio -estávamos -estão -etapa -etapas etc etc. 
eu -evento -eventos -eventual -evidente -evitar -evolução -ex-deputado -ex-governador -ex-prefeito -ex-presidente -exame -exames exatamente -excelente -excelência -excesso -exceção exclusivamente -executiva -executivo -execução -exemplo -exemplos -exercer -exercício -exercícios -exige -exigir -exigência -exigências -existe -existem -existentes -existia -existir -existência -expandir -expansão -expectativa -expectativas -experiência -experiências explica -explicar explicação explicações +explicar explicou -exploração -exportações -exposição -expressão -expulso -extensão -exterior -externa extra extremamente -exército +façam face -facilidade +fácil facilitar facilmente -faculdade +faço faixa -faixas -fala -falam -falando -falar falei -falha -falhas falou -falta -faltam -faltando -faltou -fama -familiar -familiares -famoso -famílias -faria -farmácia -farroupilha fará -farão -fase -fato -fator -fatores -fatos -faturamento -favor -favorável +faria faz +fazeis fazem fazemos fazenda fazendo fazer +fazes fazia -faça -façam -faço -febre fechada fechado fechados fechamento fechar fechou -federais -federal -federação -feira feita feitas feito feitos -felicidade -feliz -felizes -feminina -feminino -fenômeno -feriado -feridos -ferramenta -ferramentas -ferro -festa -festas -festival fevereiro fez fica @@ -1691,852 +584,143 @@ ficam ficamos ficando ficar -ficaram -ficaria ficará +ficaram ficarão +ficaria ficava -ficha fico ficou -fiel -figura -fila -filha -filhas -filho -filhos -filme -filmes -filosofia -fim -finais final -finalidade finalizou finalmente -financeira -financeiras -financeiro -financeiros -financiamento -finanças fins fique fiquei -firme -fiscais -fiscal -fiscalizar -fiscalização fiz fizemos fizeram -fiéis -flagrante -flor -flores -floresta -fluxo foco -fogo foi -folha fomos -fonte -fontes for -fora foram +fôramos +forças forem -forma -formada -formado -formar -formas -formato -formação formos -forró -fortalecer -forte -força -forças fosse fossem -foto -fotos 
-fraco -francês -frase -fraude -freitas -frente -frio -frisou -fronteira -frota -frutas -fruto -frutos -fuga -fugir -fugiu -fui -funciona -funcionamento -funcionando -funcionar -funcionário -funcionários -fundamentais -fundamental -fundação -fundo -fundos -função -funções -furto -futebol -futsal -futuro -futuros -fábrica -fácil -fãs -fé -férias -física -físicas -físico -fórmula -fórum -fôramos fôssemos -gabinete -gado -galeria -ganha -ganham -ganhando -ganhar -ganho -ganhos -ganhou -garante -garantia -garantir -garantiu -garota -garoto -gastar -gasto -gastos -gaúcha -gaúchos -general -gente -geografia -gera -gerais -geral -geralmente -gerando -gerar -geração -gerente -gerou -gestor -gestores -gestão -ginásio -global -gol -goleiro -golpe -gols -gosta -gostam +foste +fostes +fui gostaria -gostei -gosto -gostou -governador -governadora -governadores -governantes -governar -governo -governos -gramado -grande -grandes -gratuita -gratuito -grau -grave -graves -graças -grossa -grosso -grupo -grupos -grãos -guarda -guia -gás -gênero -habitantes -habitação +ha +há haja hajam hajamos +hão harmonia havemos haver -haveria haverá +haveria havia haviam -hectares -hei -helena -hipótese -história -histórias -histórica -histórico hoje -homem -homenagem -homens -homicídio -homicídios -honra hora -horas -horizonte -horário -horários -hospital -hotel -hotéis houve houvemos houver houvera +houverá houveram +houvéramos +houverão houverei houverem houveremos houveria houveriam -houvermos -houverá -houverão houveríamos +houvermos houvesse houvessem -houvéramos houvéssemos -hugo -humana -humanidade -humano -humanos -humor -há -hábito -hão ia ibope ida -idade -ideal -identidade -identificado -identificar -identificação -idosos -idéia -idéias -iguais -igual -igualdade -ilegal -ilha -iluminação -imagem -imagens -imaginar -imediata -imediatamente -imediato -impacto -impede -impedir -implantar -implantação -impor -importa -importante -importantes -impossível -imposto -impostos -imprensa 
-impressão -imóveis -imóvel -inauguração -incentivar -incentivo -inclui -incluindo -inclusive -inclusão -incrível -incêndio -indenização -independente -independentemente -independência -indica -indicado -indicar -indicação -individuais -individual -indivíduo -indivíduos indo -industrial -indícios -indígena -indígenas -indústria -indústrias -infantil -infelizmente -inferior -influência -informa -informado -informar -informação -informações -informou -informática -infra-estrutura -infraestrutura -inglês -ingresso -ingressos -inicia -iniciada -inicial -inicialmente -iniciar -iniciativa -iniciativas -iniciou -inquérito -inscritos -inscrição -inscrições -instalada -instalar -instalação -instalações -institucional -instituição -instituições -instituto -instrumento -instrumentos -integra -integral -integrante -integrantes -integrar -integração -inteira -inteiro -inteligente -inteligência -intensa -intenso -intenção -intenções -inter -interessa -interessados -interessante -interesse -interesses -interior -interna -internacionais -internacional -internado -internet -interno -interpretação -intervalo -intervenção -intuito -invadiu -inverno -investidores -investigar -investigação -investigações -investimento -investimentos -investir invés -início -inúmeras -inúmeros ir -iria -irmã -irmão -irmãos -irregular -irregularidades irá irão +iria isso +ista +iste isto -italiano item itens +já jamais janeiro -janela -jantar -jardim -jc -jeito -joga -jogada -jogadas -jogador -jogadores -jogando -jogar -jogo -jogos -jogou -jornada -jornais -jornal -jornalismo -jornalistas -judicial -judiciário -juiz -julgamento -julgar julho junho -juntamente -junto -juntos -juros -jurídica -jurídico -justa -justamente -justifica -justificar -justificativa -justiça -justo -juventude -juíza -juízes -juízo -já km -laboratório -lado -lados -ladrões -lago -lamentável -lance -lança -lançado -lançamento -lançar -lançou -lar -larga -lateral -latina -lazer -leal -legais -legal -legenda -legislativa -legislativo 
-legislação -lei -leia -leilão -leis -leite -leitor -leitores -leitura -lembra -lembrando -lembrar -lembro -lembrou -ler -leste -lesão -letra -letras -leva -levada -levado -levados -levam -levando -levantamento -levantar levar levaram -leve levou lhe lhes -li -liberado -liberação -liberdade -licença -licitação -lidar -liderança -lideranças -liga -ligada -ligadas -ligado -ligados -ligar -ligação -ligações -liminar -limite -limites -limpa -limpeza -linda -linguagem -linha -linhas -lista -literatura -litoral -litros -livre -livres -livro -livros -lixo -lição -locais -local -localidade -localizada -localizado logo -loja -lojas -longa -longe -longo -lua -lucro -lucros -lugar -lugares -luta -lutar -luxo -luz -lá -lê -líder -líderes -língua -líquido -lógica -madeira -madrugada maio -maior -maiores -maioria -mais -mal -manda -mandado -mandar -mandato -mandatos -mandou -maneira -manhã -manifestação -mano -manter -manteve -mantido -mantém -manutenção -mar -marca -marcada -marcado -marcador -marcar -marcas -marcação -marcou -margem -margens -marido -marinho -marketing -março mas -masculino -massa -mata -matar -matemática -materiais -material -mato -matou -matriz -matéria -matérias -mau -mauro -maus me -medalha mediante -medida -medidas -medo -meia meio -meio-campo -meios -melhor -melhora -melhorar -melhores -melhoria -melhorias -melhorou -membro -membros -memória -menina -meninas -menino -meninos -menor -menores menos -mensagem -mensagens -mensais -mensal -mental -mente -mentira -mercado -mercadorias -mercados -merece -merecem -mesa +mês meses mesma mesmas mesmo mesmos -mestre -meta -metade -metas -metropolitana metros meu meus mil -milhares -milho milhão -milhões -militar -militares +milhares mim -mineiro -minha minhas -ministra -ministros -minuto -minutos -mirim -missão -mistura -mobilização -moda -modalidade -modelo -modelos -moderna -moderno modo -moeda momento momentos -montagem montante -montar -monte -mora -moradia -morador -moradora -moradores -morais -moral -moram 
-morar -morava -moro -morre -morrer -morreram -morreu -morro -morte -mortes -morto -mortos mostra mostram mostrando mostrar mostrou -motivo -motivos -moto -motor -motoristas -motos -movimentação -movimento -movimentos -moça -muda -mudança -mudanças -mudar -mudou muita muitas muito muitos -mulher -mulheres -multa -multas -mundial -mundo -municipais -municipal município municípios -muro -museu -musical -má -máquina -máquinas -máxima -máximo -mãe -mães -mão -mãos -média -médica -médio -mérito -mês -mídia -mínima -mínimo -mínimos -móveis -móvel -música -músicas -músicos na -nacionais -nacional -nada -namorada -namorado +nao +não naquela +naquelas naquele +naqueles nas nasceu nascido -nascimento -naturais -natural -naturalmente -natureza -nação -nações -necessidade -necessidades -necessita -necessária -necessárias -necessário -necessários -nega -negar -negativa -negativo -negociar -negociação -negociações -negou -negra -negro -negros -negócio -negócios nela nele nem @@ -2547,259 +731,83 @@ nessas nesse nesses nesta +nestas neste nestes -neto -news ninguém nisso no -nobre -noite -noites nome nomes -norma -normal -normalmente -normas norte nos +nós nossa nossas nosso nossos -nota -notas -notícia -notícias -nova novamente -novas nove novembro -novidade -novidades -novo -novos num numa -nunca -não -né -níveis -nível -nós -núcleo -número -números +numas +nuns o -objetivo -objetivos -objeto -objetos -obra -obras -obrigado -obrigados -obrigação -observa -observar -observou obter obteve ocasião ocorre ocorrem +ocorrência +ocorrências ocorrer ocorreram ocorreu ocorrido -ocorrência -ocorrências -ocupa -ocupar -ocupação -oeste -oferece -oferecem -oferecer -oferecido -oferecidos -oferta -oficiais -oficial -oficialmente -oficina -oficinas -ofício +oitavo oito -olha -olhando -olhar -olho -olhos -oliveira -olímpico onda onde -ong -online ontem -operação -operações -opinião -opiniões -oportunidade -oportunidades -optar -opção -opções +onze ora -ordem -organismo -organizada -organizado 
-organizar -organização -organizações -orientação -origem -original -orçamento os ou -ouro outra outras outro outros outubro -ouvi -ouvido -ouvidos -ouvir -ouviu -paciente -pacientes -paciência -pacote -padrão -padrões -paga -pagam -pagamento -pagamentos -pagando -pagar -pago -pagos -pagou -pai -paixão -palanque -palavra -palavras -palco -palestra -palestras -palácio -papai -papel -papéis -par para -parabéns -parada -parado -paralisação -paranaense -parar -parceiro -parceiros -parcela -parceria -parcerias parece parecem parecer parecia -paredes -parentes -parlamentar -parlamentares -parlamento -parou -parque parte partes -participa -participam -participantes -participar -participaram -participação -participou -particular -particulares -partida -partidas -partido -partidos -partidária -partir partiu passa passada -passado -passageiros -passagem -passagens passam passando passar -passaram passará +passaram passava passe passei -passeio -passo -passo fundo -passos -passou -pasta -patamar -patrimônio -pau -paula -paulistas -pauta -pavimentação -paz -país -países +pé +peça +peças pede -pedido -pedidos pedindo pedir -pediu -pedra -pedras -pega pegar pegou -peito -peixe -peixes pela pelas -pele -pelo pelos -pena -penal pensa pensam pensamento @@ -2809,360 +817,68 @@ pensar pensei penso pensou -pensão -pequena -pequenas -pequeno -pequenos -perante -percebe -perceber -percebeu -percentual -percurso -perda -perdas -perde -perdendo -perder -perderam -perdeu -perdido -perfeito -perfil -pergunta -perguntar -perguntas perguntou -perigo -perigoso permanece permanecer -permaneceu -permanente -permanência -permite -permitido -permitir -permitiu -perna -pernas -personagem -personagens -personalidade -perspectiva -pertence -perto -período -períodos -pesado -pesca -peso -pesquisa -pesquisadores -pesquisas -pessoa -pessoais -pessoal -pessoalmente -pessoas -peça -peças -piloto -pilotos -pintura -pior -piores -piso -pista -placa -placas -planejamento -planeta -plano -planos -planta 
-plantas -plantio -plantão -plateia -pleito -plena pleno -plenário -plástico -pneus -pobre -pobres -pobreza pode +pôde pode-se podem podemos podendo poder -poderes -poderia -poderiam poderá poderão +poderia +poderiam podia -poeta +põe +põem pois -policiais -policial -politicamente -polêmica -políticas -político -políticos -ponta -ponte -ponto -pontos -popular -populares -população por -porque -porta -portal -portanto -portas -porte -porto -português porém -posicionamento -positiva -positivo -positivos -posição -posições +porque +porquê possa possam -posse -possibilidade -possibilidades +possíveis +possível +possivelmente posso possuem possui -possíveis -possível -posteriormente -posto -postos -postura -potencial pouca poucas -pouco poucos -povo -povos pps pq pra -praia -praias -prata praticamente -praticar -prato -pratos -prazer -prazo -prazos -praça -praças -precisa -precisam -precisamos -precisar -precisava -preciso -precisou -preconceito -preencher -prefeita -prefeitos -prefeituras -prefere -preferiu -preferência -prejudicar -prejuízo -prejuízos -premiação -preocupa -preocupado -preocupar -preocupação -prepara -preparado -preparados -preparar -preparação -presa -presente -presentes -presença -preservar -preservação -presidencial -presidente -presidentes -presidência -preso -presos -pressão -prestar -prestação -presídio -preta -pretende -preto -prevenção -previdência -prevista -previstas -previsto -previstos -previsão -prevê -preço -preços -primavera primeira primeiras primeiro primeiros -principais -principal -principalmente -princípio -princípios -prioridade -prioridades -prisão -privada -privado -pro -problema -problemas -procedimento -procedimentos -processo -processos -procura -procurado -procurador -procuram -procurando -procurar -procure -procurou -produtividade -produto -produtor -produtores -produtos -produz -produzido -produzir -produção -professor -professora -profissionais -profissional -profissão -profunda -programa -programas -programação -progresso 
-proibido -projeto -projetos -prol -promessa -promessas -promete -prometeu -promotor -promove -promover -promovido -promoção pronta pronto -propaganda -proposta -propostas -propriedade -propriedades -proprietário -proprietários -propósito -propõe -proteger -protesto -proteção -prova -provar -provas -provavelmente -providências -provisória -provocar -provocou -provável -proximidades -prudente -prática -práticas -pré-candidato -prédio -prédios -prévia -prêmio -prêmios +propios própria próprias +proprio próprio próprios +provável +provavelmente próxima próximas +proximidades próximo próximos -publicada -publicado -publicação -publicidade +puderam pudesse -punição -pura -página -páginas -pátio -pão -pé -pés -pênalti -pólo -pública -públicas -público -públicos -quadra -quadrados -quadrilha -quadro -quadros quais +quáis qual -qualidade -qualificação qualquer quando quantas @@ -3172,279 +888,56 @@ quanto quantos quarta quarta-feira -quarto quase quatro que -quebra -quebrar -queda -queira +quê quem -quente -quer -querem -queremos -querendo -querer -queria -queriam querido quero -questionado -questão -questões -quilos +quieto quilômetros +quilos quinta quinta-feira quinto -quis -quiser -rainha -ramo -ranking -rapaz -rapidamente -razão -razões +quinze reais -reajuste real -realidade -realiza -realizada -realizadas -realizado -realizados -realizando -realizar -realização realizou realmente -reação -rebaixamento -recado -recebe -recebem -recebendo -receber -receberam -receberá -recebeu -recebi -recebido -receita -receitas recente recentemente recentes -reclama -reclamar -reclamação -reclamações -reclamou -reconhece -reconhecer -reconhecido -reconhecimento -recorde -recorrer -recuperar -recuperação -recurso -recursos -redação -rede -redes redonda redor -reduzir -redução -reeleito -reeleição -refere -referente -referência -reflete -refletir -reflexão -reforma -reformas -reforçar -reforço -regime -regionais -regional -registrada -registrado -registrados -registrar -registro 
-registros -registrou -região -regiões -regra -regras -regular -rei -reino -reivindicações relacionados -relacionamento -relacionamentos -relata -relator -relatou -relatório -relação -relações -religioso -remuneração -remédio -remédios -renda -rendimento -renovação -repasse -repente -repercussão -repetir -reportagem -representa -representam -representante -representantes -representar -representação -repórter -república -reserva -reservas -resgate -residência -residências -resistência -resolução -resolve -resolver -resolveu resolvido -respectivamente -respeitar -respeito -responde -responder -respondeu -responsabilidade -responsáveis -responsável resposta respostas ressalta ressaltar -ressaltou -resta -restante -restaurante -restaurantes resto resultado resultados -retirada -retirar retornar -retorno -reunir -reuniu -reunião -reuniões -revela -revelou -rever -reverter -revista -revistas -revisão -revolução -reúne -rica -rico -ricos -rio -rio de janeiro -rios -riqueza -risco -riscos -ritmo -rival -rock -rodada -rodadas -rodovia -rodovias -rodoviária -romance -rosto -roteiro -rotina -roubo -roupa -roupas -rua -ruas -rubro-negro -ruim -rumo -rurais -rural -rádio -rápida -rápido +sábado +sábados sabe -sabedoria sabem sabemos sabendo saber sabia -saco saem sai saia saiba saindo sair -saiu -sala -salarial -salas -saldo -salto -salvar -salário -salários -salão -saneamento -sangue -santista -satisfação -satisfeito -saudade -saudável -saída saíram +são se -secretaria -secretarias -secretário -secretários -sede -segmento -segmentos -segredo segue seguem seguida @@ -3458,230 +951,131 @@ segunda segunda-feira segundo segundos -segura -segurança -segurar seguro sei seis seja sejam sejamos -seleção sem semana semanas semelhante semelhantes -semestre -seminário sempre -senado -senador -senadora -senadores +senão sendo senhor senhora senhores -sensação senso sente -sentença sentido sentimento sentimentos sentindo sentir sentiu -senão sequer -sequência ser +será +serão serei serem 
seremos seres seria seriam -serve -servidor -servidores -servir -serviu -serviço -serviços -será -serão seríamos +série +sério +serve sessão sessões sete setembro +sétima +sétimo setor setores seu seus -sexo sexta sexta-feira -sexual -shopping -show -shows +sexto si sido -sigilo -sigla -significa -significado -silêncio sim simples simplesmente -sinais -sinal sinto -sintomas -sistema -sistemas site sites -situação -situações +sítio +só sob sobe sobra sobre sobretudo -sobrinho -sociais -social -socorro -sofre -sofrem -sofrendo -sofrer -sofreu -sofrimento -sol -soldados -solenidade -solicitação -solicitou -solidariedade +sois solo -solução -soluções -som -soma -sombra somente somos -sonho -sonhos -sono -sorriso -sorte -sorteio sou soube sousa sozinha -sozinho sua suas subir subiu -substituir -substituição -sucesso -sucessão sudeste suficiente suficientes sugere -sugestão -sugestões sujeito sul -super -superar -superintendente -superior -superiores -supermercado -superou -suplente -suporte -suposto -supremo -surge -surgiu -surpresa -suspeita -suspeito -suspeitos -suspensão -sábado -sábados -são -século -série -sério -sítio -só -sócios -tabela +tá tais tal -talento talvez -tamanho +tambem também tanta tantas tanto tantos -taques -tarde -tarefa -tarifa -taxa -taxas -taça +tão te -teatro -tecnologia -tecnologias -tela -telefone -telefones -televisão tem -tema -temas +tém +têm temos -temperatura -tempo temporada -tempos tende -tendo tendência +tendes +tendo tenha tenham tenhamos tenho +tens tenta tentam tentando tentar tentaram tentativa +tente +tentei tentou -teoria ter +terá +terão +terça +terça-feira terceira terceiro terei @@ -3689,92 +1083,52 @@ terem teremos teria teriam -termina -terminal -terminar -terminou -termo -termos -terra -terras -terreno -terrenos -território -terá -terão -terça -terça-feira teríamos -tese -tesouro -teste -testemunhas -testes -teto teu teus teve -texto -textos the ti tido -time times tinha tinham -tio +tínhamos tipo tipos -tira -tirar -tiro 
-tiros tirou -titular -titulares tive tivemos tiver tivera tiveram +tivéramos tiverem tivermos tivesse tivessem -tivéramos tivéssemos +tiveste +tivestes tocar -tocou toda todas todo todos -tom toma -tomada -tomadas tomando tomar tomou toneladas toque -torcedor -torcedores -torcida torna tornando tornar -torneio -torno tornou tornou-se -torres total totalmente trabalha -trabalhador -trabalhadores trabalham trabalhando trabalhar @@ -3782,281 +1136,101 @@ trabalhava trabalho trabalhos trabalhou -tradicionais -tradicional -tradição -traficantes -tragédia -trajetória -tranquilidade -transferência -transformar -transformação -transformou -transição -transmissão -transparência -transporte -transportes -trata +trás trata-se -tratado -tratamento -tratar -trave -travessão traz trazendo trazer -trecho -trechos -treinador -treinamento -treino -trem -tribuna -tribunal -tributária +três +treze trimestre trinta trio -triste -tristeza -troca -trocar -troféu trouxe -tráfego -trás -três tu tua tuas tudo -turismo -turistas -turma -tv -twitter -tá -tão -técnica -técnicas -técnico -técnicos -tém -término -tênis -tínhamos -título -títulos +última +últimas +último +últimos um uma umas -unidade +única unidades -unidos -unir -universidade -universidades -universitário -universo -união uns -urbana -urbano -urgência -urnas +usa usada usado usados usam usando usar -usina -usinas uso usou usuário usuários +útil +utilização utilizada utilizado utilizados utilizar -utilização -vacinação -vaga -vagas +vá vai +vais vale -valer -valor -valores -valorizar -valorização vamos -vantagem -vantagens -vara -variação +vão +várias +vários vc vcs +vê veio -vejo velha velho velhos -velocidade vem +vêm vemos -vence -vencedor -vencer -venceu -venda -vendas -vender -vendidos vendo venha -vento +vens ver vera -verba -verbas -verdade -verdadeira -verdadeiro -verdadeiros -verde -vereador -vereadora -vereadores -vergonha -verificar -vermelha -vermelho -versão -verão vez vezes -veículo -veículos vi via -viagem 
-viagens -viajar vias -vice -vice-governador -vice-prefeito -vice-presidente -vida -vidas vieram -vigor -vila vinda vindo vinha -vinho vinte -violência vir vira virada viram virar virou -virtude -visa -visando -visita -visitantes -visitar -visitas -vista -visto -visual -visão -vitória -vitórias -viu -viva -vive -vivem -vivemos -vivendo -viver -viveu -vivo -vizinho -vizinhos você vocês -volante volta voltada voltado -voltam -voltando voltar voltaram voltou -volume -voluntários vontade -voos vos -votado -votar -votação -voto -votos -votou +vós +vossa +vossas +vosso +vossos vou -voz -vá -várias -vários -várzea -vão -véspera -vê -vídeo -vídeos -vítima -vítimas -vôo -zagueiro -zero -zona -à -às -água -águas -árbitro -área -áreas -árvore -árvores -época -éramos -êxito -índia -índice -índices -óleo -órgão -órgãos -ótima -ótimo -ônibus -última -últimas -último -últimos -única -único -útil +zero \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt old mode 100755 new mode 100644 index 2afa1eb3de..5522f31479 --- a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt @@ -1,440 +1,711 @@ +# A Romanian stop word list. +# (Lightly edited to remove words in the original lists that are actually meaningful) +# Sources: # -# This is a stop word list for the Romanian language. +# http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) +# https://github.com/stopwords-iso/stopwords-ro/blob/master/stopwords-ro.txt # -# Source: http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) # - # A Romanian stop word list. Comments begin with vertical bar. Each stop - # word is at the start of a line. - - # Many of the forms below are quite rare but included for completeness. 
- - # ARTICLE - # Indefinite article -o # a -unui -unei -unor -nişte # some - # Demonstrative/adjectival article -cel -cea -cei -cele -celui -celei -celor - # Possessive / genitival article -al # of a -ai -ale - # PREPOSITION AND ADVERB -pe # on -la # at -în # in -fără # without -sub # under -despre # about -către # to -cu # with -de # from -din # on -lângă # by -pentru # for -peste # over -spre # to -prin # through -dintre # between -printre # among -până # until -după # after -înspre # towards -ca # as - # ADJECTIVE -mai # more -decât # than -cum # how -foarte # very -mult # much -multă -mulţi -multe -puţin # little -puţină -puţini -puţine -destul # enough -destulă -destui -destule - # PRONOUN - # Personal pronoun -eu # I -tu # you -el # he -ea # she -noi # we -voi # you -ei # they -ele # they -mie # me -îmi -mi -mine -mă -m -ţie # you -îţi -ţi -tine -te -lui # him -îl -l -îi -i -nouă # us -ne -ni -vouă # you -vă -vi -v -lor # them -le -li - # Pronoun of politeness -dumneavoastră # you - # Reflexive pronoun -se # himself -îşi -sie -sieşi -sine - # Pronoun of reinforcement -însumi # myself -însămi -însuţi # youself -însăţi -însuşi # himself -însăşi # herself -înşine # ourselves -însene -înşivă # youselves -însevă -înşişi # themselves -înseşi -însele - # Possessive pronoun -meu # mine -mea -mei -mele -tău # yours -ta -tăi -tale -său # his -sa -săi -sale -nostru # ours -noastră -noştri -noastre -vostru # yours -voastră -voştri -voastre - # Demonstrative pronoun -acesta # this -ăsta -aceştia -ăştia -acestuia -ăstuia -acestora -ăstora +abia +acea aceasta -asta -acestea -astea -acesteia -ăsteia -acest -aceşti -acestui -acestor această -aceste -acestei -acela # that -ăla -acelui -ăluia -aceia -ăia -acelora -ălora aceea -aia -acelea -alea -aceleia -ăleia -acel +aceeasi +aceeaşi acei -acelor -acea -acele -acelei -acelaşi # the same +aceia aceiaşi -aceeaşi +acel +acela +acela +acelasi +acelaşi +acele +acelea aceleaşi -aceluiaşi -aceloraşi +acelei +aceleia aceleiaşi 
-celălalt # the other -celuilalt -ceilalţi -celorlalţi -cealaltă -celeilalte -celelalte -celorlalte - # Interrogative pronoun -ce # what -cine # who -cui # whom -care # which, what -cărui -cărei -căror -unde # where -când # when - # Indefinite pronoun -cineva # someone -cuiva -altcineva # someone else -altcuiva -oricine # anyone -oricui -orice # anything -unul # one -una -unii -unele -unuia -uneia -unora -altul # other -alta -alţii -altele +acelora +aceloraşi +acelui +aceluiaşi +acest +acesta +acesta +aceste +acestea +acestei +acesteia +acestia +acestor +acestora +acestui +acestuia +aceşti +aceştia +acolo +acord +acum +acum +adica +ai +aia +aibă +aici +aiurea +al +al +ala +alaturi +ale +alea alt -altă -alţi +alta +altceva +altceva +altcineva +altcineva +altcuiva alte -altuia +altei alteia +altele +altfel +alti +altii +altor altora altui -altei -altor -vreunul # somebody, some (of them) -vreuna -vreunii -vreunele -vreun -vreo -vreunuia -vreuneia -vreunora -vreunui -vreunei -vreunor -oricare # anyone -oricăruia -oricăreia -oricărora -oricărui -oricărei -oricăror -fiecare # everyone -fiecăruia -fiecăreia -fiecărui -fiecărei -cât # how, how many -câtă -câţi -câte -câtora -câtor -atât # this much +altuia +altul +altul +altă +alţi +alţii +am +am +amândoi +amândouă +amânduror +amândurora +ambele +ambelor +ambii +ambilor +anume +apoi +aproape +ar +are +as +asa +asemenea +asta +astazi +astea +astfel +astfel +astăzi +asupra +asupra +atare +atat +atât +atata atâta -atâţi -atâţia +atatea atâtea -atâtora +atatia atâtor -oricât # however much -oricâtă -oricâţi -oricâte -oricâtora -oricâtor -câtva # some -câţiva +atâtora +atâţi +atâţia +ati +atit +atita +atitea +atitia +atunci +atunci +au +avea +aveai +aveam +aveau +aveaţi +avem +aveţi +avut +avut +azi +aş +aşa +aşadar +aţi +b +ba +bine +bine +bucur +bună +c +ca +ca +cam +cand +când +când +capat +care +care +careia +carora +caruia +cat +cât +cât +câte câteva +câtor +câtora câtorva -tot # all -toată -toţi -toate -tuturor -totul 
-cutare # that -oarecare # some -ceva # something -altceva # something else - # Negative pronoun -nimeni # nobody -nimănui -nimic # nothing - # NUMERAL - # Cardinal numeral -unu # one -doi # two +catre +câtva +câtva +câtă +caut +câţi +câţiva +ce +ce +cea +cealaltă +ceea +cei +ceilalti +ceilalţi +cel +cele +celei +celeilalte +celelalte +celor +celorlalte +celorlalţi +celui +celuilalt +celălalt +ceva +ceva +chiar +chiar +ci +ci +cinci +cinci +cind +cînd +cine +cine +cineva +cineva +cit +cît +cita +cite +cîte +citeva +citi +citiva +cîtva +cîţi +conform +contra +cu +cu +cui +cui +cuiva +cum +cum +cumva +curând +curînd +cutare +că +că +căci +cărei +căror +cărui +către +către +d +da +daca +dacă +dacă +dar +dar +dat +dat +datorită +dată +dau +de +de +deasupra +decât +deci +deci +decit +degraba +deja +deoarece +departe +desi +despre +despre +destui +destul +destule +destulă +deşi +deşi +din +din +dinaintea +dintr +dintr- +dintre +dintre +doar +doar +doi +doi +doilea +doime doua -trei # three -patru # four -cinci # five -şase # six -şapte # seven -opt # eight -noua # nine -zece # ten - # Fractional numeral -doime # half -treime # third -sutime # hundredth - # Collective numeral -amândoi # both -amândouă -amândurora -amânduror -ambii -ambele -ambilor -ambelor - # Multiplicative numeral -îndoit # double -întreit # threefold -însutit # hundred-fold - # Ordinal numeral -întâiul # the first -întâia -primul # former -prima -primii -primele -primului -primei -primilor -primelor - # VERB - # To be -sunt # (I) am -s -eşti # (you) are -este # (he/she) is +două +drept +dumneavoastră +dupa +după +după +dă e -suntem # (we) are -sunteţi # (you) are -eram # (I) were -erai # (you) were -era # (he) was -eraţi # (you) were -erau # (they) were -fiu # be -fii +ea +ea +ei +ei +el +el +ele +ele +era +era +erai +eram +eram +erau +este +este +eu +eu +exact +există +eşti +eşti +f +face +face +fara +fata +faţă +fel +fi fie +fiecare +fiecare +fiecărei +fiecăreia +fiecărui +fiecăruia +fii +fiind fim 
+fiu +fiu fiţi -fi -fiind # being -fost # been - # Auxiliary verb -am # to have - all forms -aţi -au -are -avem -aveţi -aveam -aveai -avea -aveaţi -aveau -aş -ar -oi # to will +foarte +foarte +făcut +g +h +i +ia +iar +iar +ieri +ieri +ii +îi +il +îl +imi +îmi +împotriva +in +în +în +inainte +înainte +înaintea +inapoi +inca +încât +încât +incit +încît +încotro +încă +îndoit +insa +însele +însene +însevă +înseşi +înspre +însumi +însutit +însuşi +însuţi +însă +însămi +însăţi +însăşi +întâia +întâiul +intr +intre +între +întreit +întrucât +întrucît +înşine +înşivă +înşişi +isi +iti +îşi +îţi +j +k +l +la +la +lângă +lângă +le +li +lîngă +lor +lor +lui +lui +m +ma +mai +mai +mâine +mare +mare +mea +mei +mele +mereu +meu +meu +mi +mie +mie +mîine +mine +mod +mult +mult +multa +multe +multi +multă +mulţi +mulţumesc +mă +n +ne +nevoie +ni +nici +nici +niciodata +nicăieri +nimeni +nimeni +nimeri +nimic +nimic +nimănui +niste +nişte +nişte +noastre +noastră +noi +noi +noroc +nostri +nostru +nostru +nou +noua +noua +nouă +nouă +noştri +nu +nu +numai +numai +o +o +oarecare +oi om -oţi +opt +opt or -vei +ori +oricând +oricare +oricare +oricât +oricât +oricâte +oricâtor +oricâtora +oricâtă +oricâţi +orice +orice +oricînd +oricine +oricine +oricît +oricui +oricum +oricărei +oricăreia +oricăror +oricărora +oricărui +oricăruia +oriunde +oţi +p +pai +până +până +parte +patra +patru +patru +patrulea +pe +pe +pentru +pentru +peste +peste +pic +pina +pînă +plus +poate +poate +pot +pot +prea +prima +primei +primele +primelor +primii +primilor +primul +primul +primului +prin +prin +printr- +printre +putea +putini +puţin +puţin +puţina +puţine +puţini +puţină +r +rog +s +sa +sa-mi +sa-ti +sai +sale +sau +sau +se +se +si +sie +sieşi +sine +sint +sînt +sintem +sîntem +sînteţi +spate +spre +spre +spune +spus +sub +sub +sunt +sunt +suntem +suntem +sunteţi +sunteţi +sus +sutime +sută +să +săi +său +său +t +ta +tale +te +ti +timp +tine +toata +toate +toată +tocmai +tot +tot +toti +totul +totusi 
+totuşi +totuşi +toţi +trebuie +trei +trei +treia +treilea +treime +tu +tu +tuturor +tăi +tău +tău +u +ul +un +una +unde +unde +undeva +unei +uneia +unele +uneori +unii +unor +unora +unu +unu +unui +unuia +unul +unul +v va -vom +vei veţi +vi +voastre +voastră +voi +voi +vom vor - # CONJUNCTION -şi # and -nici # neither -dar # but -însă -iar # and, but, while, again -ci # but, so that -sau # or -ori -deci # so -aşadar -încât # so that -aşa # such -deşi # although -totuşi # though -dacă # if -atunci # then -că # that - # OTHER -nu # no - - # The following is a ranked list (commonest to rarest) of stopwords - # deriving from a large sample of text. - -poate # maybe -ieri # yesterday -mare # big -doar # just -trebuie # must -spus # said -acum # now -putea # can -chiar # even -face # do -astfel # such -pot # can -făcut # done -avut # had -parte # part -spune # says -bine # good -faţă # front -există # exists -încă # still -numai # only -dat # given -asupra # on -aproape # near +vostru +vostru +vouă +vouă +voştri +vreme +vreo +vreun +vreuna +vreunei +vreuneia +vreunele +vreunii +vreunor +vreunora +vreunui +vreunuia +vreunul +vă +x +z +zece +zece +zero +zi +zice +şapte +şapte +şase +şase +şi +ţi +şi +ăia +ţie +ţie +ăla +ălea +ăleia +ălora +ăluia +ăsta +ăstea +ăstuia +ştiu +ăştia \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt index e4a59dda4c..99e38c779c 100644 --- a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt @@ -1,10 +1,15 @@ +# This is a stop word list for the Russian language. +# (Lightly edited to remove words in the original lists that are actually meaningful) +# +# Source: +# https://github.com/stopwords-iso/stopwords-ru/blob/master/stopwords-ru.txt # -# This is a "short" stop word list for the Russian language. 
# adriver amp bin +c cgi href html @@ -16,894 +21,691 @@ rnd sid style www -а А -августа -акций -Александр -Александра -Алексей -Анатолий -Андрей +Б +В +Г +Д +Е +И +К +Л +М +Н +О +П +С +У +Ю +Я +а +в +г +д +е +ж +и +й +к +м +о +с +т +у +х +я +с кем АО +Во +Вы +Да +До +За +Из +Их +Мы +НА +НЕ +На +Не +Ни +Но +Ну +Об +Он +От +По +То +бы +во +вы +го +да +до +ее +ей +ею +её +же +за +из +им +их +ли +мы +на +не +ни +но +ну +нх +об +он +от +по +со +та +те +то +ту +ты +уж +Без +Вот +Все +Для +Его +Еще +Как +Кто +Мне +Нет +Нью +Она +Они +При +Так +Там +Тем +Уже +Что +Эта +Эти +Это +без +был +вам +вас +ваш +вид +вон +вот +все +всю +вся +всё +где +год +дал +два +две +для +дни +дня +его +ему +еще +ещё +иди +или +ими +имя +как +кем +кто +лет +мая +мне +мог +мож +мои +мой +моя +моё +над +нам +нас +наш +нее +ней +нем +нет +нею +неё +ним +них +оба +она +они +оно +под +пол +пор +при +про +раз +ряд +сам +сих +сто +так +там +тем +тех +той +том +тот +тою +три +тут +уже +час +чем +что +эта +эти +это +эту +все еще +Ведь +Даже +Если +Есть +ИТАР +Один +Пока +ТАСС +Хотя +Этот +алло +блог +буду +будь +бывь +была +были +было +быть +вами +ваша +ваше +ваши +ведь +весь +виде +вниз +вряд +всей +всем +всех +всею +выше +глаз +года +году +годы +дает +даже +дать +двух +день +дней +днях +едва +если +есть +жить +зато +знаю +идет +идти +имел +июля +июня +кого +кому +куда +либо +лицо +лишь +люди +мало +мене +меня +мимо +мира +мире +мной +мною +могу +мочь +нами +наша +наше +наши +него +нему +ниже +ними +ныне +один +одна +одно +одну +отец +пока +пора +пути +путь +пяти +пять +раза +сама +сами +само +саму +свое +свои +свой +свою +себе +себя +семь +срок +стал +таки +твои +твой +твоя +твоё +тебе +тебя +теми +того +тоже +тому +тонн +трех +туда +хоть +хотя +часа +чаще +чего +чему +чтоб +чуть +этим +этих +этой +этом +этот +явно +Более +Здесь +Когда +Кроме +Между +Может +Можно +После +Потом +Среди +Таким +Тогда +Через +Чтобы +более +будем +будет +будто +будут +вверх +вдали +вдруг +везде +взять +видел +внизу +вновь 
+вовсе +время +вроде +всеми +всему +всюду +даром +делал +делаю +друго +ждать +занят +затем +зачем +здесь +знает +знать +знают +имеет +имени +иметь +имеют +иначе +итоге +какая +какие +каким +каких +какой +когда +конец +конца +конце +кроме +любая +любой +людей +между +менее +места +место +месяц +метра +минут +много +могла +могли +могут +может +можно +можхо +назад +найти +нашей +наших +никак +никто +ничто +нужно +одним +одной +одном +около +опять +ответ +очень +перед +позже +пойти +после +потом +почти +прямо +пятая +пятый +разве +ранее +решил +рядом +самим +самих +самое +самой +самом +самые +самый +самым +самых +своей +своем +своим +своих +снова +собой +собою +сразу +среди +стала +стали +стало +стате +стать +стоит +столь +сумму +такая +также +такие +таким +таких +такое +такой +тобой +тобою +тогда +тысяч +уметь +часов +части +часто +часть +через +числе +число +чтобы +шесть +этими +этого +этому +якобы +хотел бы +может быть +Именно +Кстати +Многие +Однако +Почему +Правда +Причем +Рейтер +Сейчас +Теперь +Только апрель апреля -Ассошиэйтед -Б -без -Без -блог -более -Более +близко больше -большинство -большой -Борис -будет -будто -будут -бы -бывшего +будете +будешь +бывает бывший -был -была -были -было быстро -быть -в -В -вам -вас -ваш -вдруг -ведь -Ведь -века -вести -весь +важная +важное +важные +важный весьма взгляд -взять -виде +видеть видимо -Виктор -вице -включая -Владимира власть вместе вместо -внимание -вновь -во -Во -вовсе -воды -возможно -возможности -возможность -войск вокруг вообще вопрос -вопросы -воскресение -вот -Вот -впервые +восемь вполне -Впрочем -времена -времени -время -вроде -вряд -все -Все всегда -всего -всей -всем -всему -всех -встречи -всю -вся -всё вторая второй -вы -Вы -выборах -выше -выяснилось -г -Г -где -глава -главе -главного -главное -главный -главным -главы -го -говорил -говорит -говорится -говорить говоря -говорят -год -года -году -годы -город +голова города городе -градусов -Грозном группа группы -д -Д -Да -да -давно -дает -даже -Даже -дал 
+давать далеко дальше -данным -дать -два -две -движения -двух -действий -действительно -действия -декабря -дел -дела +девять делаем -делам делать -деле -дело -Дело -делу -день -деньги десять -деятельности -деятельность -директора -для -Для -дней -дни -дня -днях -до -До -довольно -документы -долго должен должна должно должны -дом -дома -доме -достаточно -друг -друга +дорога +другая другие другим других другое другой другом -е -Е -его -Его -едва -ее -ей -ему -если -Если -естественно -есть -Есть -еще -Еще -же -женщин -женщины -жизни -жизнь -жителей -жить -за -За -завода -закон -зам -заместитель -затем +думать заявил -заявление -здесь -Здесь -земли -знает значит -знаю -знают -зрения -и -И -игры -идет -из -Из -Известий -Известия -Известиям -известно -или -Иллюстрация -им -имеет -имени именно -Именно -иметь -имеют -имя -иначе -интервью -интересы -информацию -история -ИТАР -итоге -их -Их -июля -июня -й -к -К +иногда каждая -каждого +каждое +каждые каждый -кажется -как -Как -какие -каким -каких -какой -касается -качестве -квартиры -километров -когда -Когда -кого -количество -команда -команды -комиссии -комитета -комментариев -компания -кому -конечно -Конечно -конференции -конца -конце -коп -корреспонденту -которая -которого -которое -которой -котором -которому -которую -которые -который -которым -которыми -которых -края -Кроме -кроме -крупных +кругом кстате -Кстати кстати -кто -Кто -куда -Л -легко -лет -летний -ли -либо -лидер -лиц -лица -лично -лишь -лучше -любая -любой -людей -люди -людям -М -м -мало -марта -массовой -мая -между -Между -мене -менее меньше -меня -мере -меры -места -месте -местных -место -месяц месяца -месяцев -метра метров -миллиарда -миллион -миллиона -минут -мире -мировой -Михаил -мне -Мне +минута многие -Многие многих -много -мог -могла -могли -могут -может -Может -можно -Можно -мой момент -мы -Мы -Н -на -На -НА -над -надо -назад -наиболее -найти -наконец -нам -например -народа -нас -находившегося -находится начала начале -начальник -начальника -наш 
-наша +начать нашего -нашей -наши -наших -не -Не -НЕ -невозможно -него -недавно недели неделю -нее -ней -некоторого -некоторые -некоторых -нельзя -нем немало -нему -необходимо -нескольких -несколько -несмотря -нет -Нет -ни -Ни нибудь -никак -никаких -никогда -Николай -никто -ним -ними -них +никуда ничего -но -Но -нового -новой -новостей -новые -новый -новых ноября -Ну -нужно -ныне -нынешнего -Нью -о -О -об -Об -области -образом +нужный обычно -один -Один -одна -Однако однако -одним -одно -одновременно одного -одной -одном -одну -оказалась -оказались -оказалось -оказался -около -октября -он -Он -она -Она -они -Они -оно -операции -опыт -опять -органы -основном -особенно -остается -от -От -ответ -отдела -отличие -отношении -отношения -очень -очередной -очередь -П -партия +откуда +отсюда первая -первого первой первую первые первый первым первых -перед период -письмо -площади -по -По поводу -под -подобная -позиции -пока -Пока -политики -полностью -положение -полтора -получил -получили -получить -помощи -помощь -помощью -понять -пор -порядке -посколько -поскольку -после -После -последнее -последние -последний -последних -пост -постоянно -потом -Потом потому похоже почему -Почему -почта -почти -Поэтому -поэтому -права -Правда -правда -правило -право -практически -предприятий -предприятия -председателя -представителей -представители прежде -прежнему -премьер -премьера -Пресс -пресс -при -При -придется -примерно -примеру -принять -приходится -Причем -пришлось -про -проблем -проблема -проблемы -провести -продукции -проект -производства -производство -произошло -происходит -прокуратуры просто -против -процента -процентов -процесс -прошла -прошлого -прошлом -прямо -пути -путь -пятая -пяти -пять -работа -работавшую -работает -работать -работе -работу -равно -раз -раза -развития -разных -района -районе -ранее раньше -резко -результате -Рейтер -речь -решения -решил решили -рода -роль -руб -рук -руках -руки -руководителей -руководитель -руководство -ряд -рядом -с -С -сам -сама 
-сами +решить +самими самого -самое -самой -самом -самые -самый -самым -самых -сборной -свет -свое +самому своего -своей -своем своему -свои -своим своими -своих -свой -свою -связи -сделаем сделал -сделать -себе -себя -сегодня -Сегодня сейчас -Сейчас -семьи -сентября -Сергей -Сергея -силу -силы -система -системы -ситуации -ситуацию -ситуация -сих +сидеть скажем сказал -сказать -сколько скорее -следует -слишком -слова -словам -случае -случай -смерти -снова -со -собой -собственности -событий -события -совершенно совсем -создать сообща -сообщил -сообщили -состоянии -сотрудники -сотрудников -специалистов -специалисты -сразу -среди -Среди -средств -средства -срок -ссылка -стал -стала -стали -стало станет -становится -стате -стать -степени -сто -стоит -столице -столь -столько -сторону -стороны -суббота -суда -сумму -сути -существует -счет -считает -считать -считают -т -так -Так -такая -также -таки -такие -таким -Таким -таких такого -такое -такой -там -Там -ТАСС -те -театра -тебе -тем -Тем теперь -Теперь -территории -тех -течение -то -То -тогда -Тогда -того -тоже -той только -Только -том -тому -тонн -тот -точки -точнее -трех -три -труда +третий трудно -туда -тут -ты -тысяч тысячи -у -У -удалось -уж -уже -Уже -уровень уровне -условия -условиях -утверждает -утверждают -участие -участников -факт -февраля -фирм -фирма -фирмы -фонда -Фото -х -ходе хорошо -хоть -хотя -Хотя -хочет -целом -центр -центра -центре -цены -час -часа -часов -части -частности -часто -часть -чаще -чего -человек -человека -чем -через -Через +хотеть +хочешь четыре -четырех -числе -число членов -что -Что -чтобы -Чтобы -чуть -шесть -эта -Эта -эти -Эти -этим -этих -это -Это -этого -этой -этом -этому -этот -Этот -эту -Ю -Юрий -я -Я -являетесь -является -явно -якобы +шестой января -ясно +Впрочем +Конечно +Поэтому +Сегодня +августа +включая +восьмой +впервые +впрочем +времена +времени +встречи +главное +главный +главным +говорил +говорит +говорят +далекий +девятый +декабря +десятый +должный +каждого +кажется 
+команда +команды +комната +конечно +которая +которое +которой +котором +которую +которые +который +которым +которых +месяцев +миллион +наверху +наконец +недавно +никаких +никакой +никогда +области +образом +однажды +октября +очередь +первого +подойди \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt index 0629e2deb2..9758857e62 100644 --- a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt @@ -1,22 +1,56 @@ -# # This is a stop word list for the Swedish language. # +# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# https://github.com/stopwords-iso/stopwords-sv/blob/master/stopwords-sv.txt # that one Swedish journalist # +aderton +adertonde +adjö +aldrig alla +allas allt +alltid +alltså +än +andra +andras +annan +annat +ännu +är +artonde +artonn +åt +åtminstone att +åtta +åttio +åttionde +åttonde av -blev -bli -blir -blivit -borde +även båda +bådas +bara +bland +borde +bort +borta +då +dag +dagar +dagarna +dagen +där +därför de +del +delen dem den denna @@ -29,125 +63,296 @@ detta dig din dina +dit ditt -dom +dock du -där -då +e efter eftersom egen ej +elfte eller +elva +emot en +enligt +ens er era +ers ert ett -fanns -finns -från +ettusen få +fanns +får +fått +fem +femte +femtio +femtionde +femton +femtonde +fick +finnas +fjärde +fjorton +fjortonde +fler +flera +flesta +följande för före +från +fyra +fyrtio +fyrtionde +gå +gälla +gäller +gällt +går +gärna +gått genom +gick gjorde gjort +god +goda +godare +godast +gör göra ha hade -hade +haft han -han -hans hans har +här +heller +hellre hen henne hennes +hit +högst hon honom +hundra +hundraen +hundraett hur -här -i i +ibland icke +idag +igår igen +imorgon +in +inför +inga ingen +ingenting +inget innan 
+inne inom inte +inuti +ja jag +jämfört +jo ju +just kan +kanske +knappast +kom +komma +kommer +kommit +kr kunde +kunna kunnat -lite +kvar +legat +ligga +ligger man +många +måste med +mej mellan men +mer +mera +mest mig min -min mina mitt +mittemot +möjlig +möjligen +möjligt +möjligtvis mot mycket -nere -ni -nu -när någon +någonting något några +nån +nånting +när +nästa +nåt +nederst +nej +ner +nere +ni +nio +nionde +nittio +nittionde +nitton +nittonde +nödvändig +nödvändiga +nödvändigt +nödvändigtvis +nog +noll +nr +nu +nummer och +också +ofta +oftast +olika +olikt om -oss +över +övermorgon +överst +övre på +rakt +rätt +redan +sa +så +sådan +sådana +sådant +sade +säga +säger +sagt samma +sån sedan sen +senare +senast +sent +sex +sextio +sextionde +sexton +sextonde sig sin sina +sist +sista +siste +sitt sitta själv +sjätte +sju +sjunde +sjuttio +sjuttionde +sjutton +sjuttonde +ska +skall skulle +slutligen +snart som -så -sådan -sådana -sådant -sån -till +ta +tack +tar till tills +tio +tionde +tjugo +tjugoen +tjugoett +tjugonde +tjugotre +tjugotvå +tjungo +tolfte +tolv +tre +tredje +trettio +trettionde +tretton +trettonde +två +tvåhundra under upp +ur ut utan +utanför ute +va vad +väl +vänster +vänstra var vara +våra +vårat varför +varifrån varit varje +varken vars +varsågod vart +vårt vem +vems +verkligen vi vid +vidare +viktigare +viktigast vilka vilkas vilken vilket -vår -våra -vårat -vårt -än -är -åt -över +vill diff --git a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt old mode 100755 new mode 100644 index 2418be327c..03ff70f6d6 --- a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt @@ -1,101 +1,238 @@ -# # This is a stop word list for the Turkish language. 
+# (Lightly edited to remove words in the original lists that are actually meaningful) # # Sources: # http://nlp.ceng.fatih.edu.tr/blog/?p=101 # http://www.ranks.nl/stopwords/turkish.html +# https://github.com/stopwords-iso/stopwords-tr/blob/master/stopwords-tr.txt # +INSERmi a -acaba +acep +açıkçası +adamakıllı +adeta +ait altmýþ +altmış altý altı ama +amma +anca ancak +arada +artýk artık asla aslında +aynen +ayrıca az b bana +bari bazen bazý bazı bazıları bazısı +baţka +başkası belki ben benden beni benim +beri +beriki beþ beş +beţ +bilcümle bile bin +binaen +binaenaleyh bir +biraz +birazdan +birbiri +birçok +birçokları +birçoğu +birden +birdenbire biri +birice +birileri birisi birkaç birkaçı birkez -birçok -birçokları -birçoğu +birlikte birþey birþeyi birşey +birţey birşeyi +bitevi +biteviye +bittabi biz +bizatihi +bizce +bizcileyin bizden bize bizi bizim +bizimki +bizzat +böyle +böylece +böylecene +böylelikle +böylemesine +böylesine bu buna bunda bundan +bunlar +bunları +bunların bunu bunun +buracıkta burada -böyle -böylece +buradan +burası +büsbütün bütün c +ç +çabuk +çabukça +çeşitli +çok +çokça +çokları +çoklarınca +çokluk +çoklukla +çoğu +çoğun +çoğuna +çoğunca +çoğunlukla +çoğunu +cümlesi +çünkü d da daha dahi +dahil +dahilen +daima +dair +dayanarak de defa +dek demek +demin +demincek +deminden +denli +derakap +derhal +derken +deđil değil +değin diye +diđer diğer diğeri diğerleri doksan dokuz dolayı +dolayısıyla dört +doğru e +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +elbet elbette elli +emme en en gibi +enikonu +epey +epeyce +epeyi +esasen +esnasında +etmesi +etti +ettiği +ettiğini +evleviyetle +evvel +evvela +evvelce +evvelden +evvelemirde +evveli +eđer +eğer f fakat falan felan -filan +filanca g +gah +gayet +gayetle +gayri +gayrı +geçende +geçenlerde +gelgelelim gene +gerçi +gerek gibi +gibilerden +gibisinden +göre h +hakeza +hâlâ +halbuki +halen +halihazırda +haliyle +handiyse hangi hangisi hani +hariç +hasebiyle hatta +hele hem henüz 
hep @@ -104,32 +241,63 @@ hepsine hepsini her her biri +herhangi herkes herkese herkesi +herkesin hiç hiç kimse +hiçbir hiçbiri hiçbirine hiçbirini -hâlâ i +için +içinde +iken iki +ila ile -INSERmi +ilgili +ilk +illa +illaki +imdi +indinde +insermi ise -için -içinde -işte +ister +itibaren +itibariyle +itibarıyla +iyi +iyice +iyicene +iţte j k +kaçı kadar +kah +kala +kanýmca +karşın katrilyon -kaç +kaynak kendi +kendilerine kendine kendini +kendisi +kendisine +kendisini +kere kez +keza +kezalik +keşke +keţke ki kim kimden @@ -137,10 +305,26 @@ kime kimi kimin kimisi +kimse +kimsecik +kimsecikler +külliyen kýrk +kýsaca +kırk +kısaca l +lakin +lütfen m +maada madem +mademki +mamafih +mebni +meğer +meğerki +meğerse mi milyar milyon @@ -151,39 +335,109 @@ mı n nasýl nasıl +nasılsa +nazaran ne ne kadar ne zaman neden +nedeniyle +nedenle +nedense nedir nerde +nerden +nerdeyse +nere nerede nereden +neredeyse +neresi nereye nesi +netekim +neye +neyi neyse -niye +nice niçin +nihayet +nihayetinde +nitekim +niye o +ö +öbürkü +öbürü +olan +olarak +oldu +oldukça +olduklarını +olduğu +olduğunu +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor on +ön ona +onca +önce +önceden +önceleri +öncelikle +onculayın +onda ondan onlar onlara onlardan onlari onlarýn +onları onların onu onu otuz onun +oracık +oracıkta orada +oradan +oranca +oranla +oraya +öteki +ötekisi +ötürü +otuz +öyle +öylece +öylelikle +öylemesine oysa oysaki +öz p +pek +pekala +peki +peyderpey r rağmen s +sadece +sahi +sahiden sana sanki sekiz @@ -199,47 +453,74 @@ sizi sizin son sonra +sonradan +sonraları +sonunda t tabi +tabii +tam tamam +tamamen +tamamıyla +tarafından +tek trilyon tüm tümü u +ü +üç +üzere v var +vardı +vasıtasıyla ve +velev +velhasıl +velhasılıkelam veya veyahut y ya ya da +yahut +yakinen +yakında +yakından +yakınlarda +yalnız +yalnızca yani +yapacak +yapmak +yaptı +yaptıkları +yaptığı +yaptığını +yapılan +yapılması +yapıyor yedi +yeniden +yenilerde yerine 
yetmiþ +yetmiş +yetmiţ yine yirmi +yok yoksa +yoluyla yüz +yüzünden z +zarfında zaten +zati zira -ç -çok -çoğu -çoğuna -çoğunu -çünkü -ö -öbürü -ön -önce -ötürü -öyle -ü -üzere -üç þey þeyden þeyi @@ -253,17 +534,26 @@ zira ı ş şayet +ţayet şey şeyden şeye şeyi şeyler şimdi +ţimdi +şöyle +ţöyle şu +ţu şuna +şuncacık şunda şundan şunlar +şunları şunu şunun -şöyle +şura +şuracıkta +şurası \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt index 3eb0376f33..533e638f96 100644 --- a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt @@ -1,4 +1,5 @@ # Appended Traditional Chinese characters (Note: This does not include all stopwords in Cantonese or Taiwanese Mandarin) +# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: # http://blog.csdn.net/shijiebei2009/article/details/39696571 # http://github.com/stopwords-iso/stopwords-zh @@ -129,38 +130,38 @@ sup 一 一. 
一一 -一下 -一个 -一些 -一何 -一來 -一個 -一切 -一则 -一则通过 -一則 -一則通過 -一天 -一定 -一方面 -一旦 -一时 -一時 -一来 -一样 -一樣 -一次 -一片 -一番 -一直 -一致 -一般 -一起 -一轉眼 -一转眼 -一边 -一邊 -一面 +下 +个 +些 +何 +來 +個 +切 +则 +则通过 +則 +則通過 +天 +定 +方面 +旦 +时 +時 +来 +样 +樣 +次 +片 +番 +直 +致 +般 +起 +轉眼 +转眼 +边 +邊 +面 七 万一 三 @@ -204,8 +205,6 @@ sup 不光 不免 不再 -不力 -不勝 不单 不变 不只 @@ -224,8 +223,6 @@ sup 不如 不妨 不定 -不对 -不對 不少 不尽 不尽然 @@ -239,12 +236,8 @@ sup 不必 不怎么 不怎麼 -不怕 不惟 -不成 不拘 -不择手段 -不擇手段 不敢 不料 不断 @@ -259,8 +252,6 @@ sup 不止一次 不比 不消 -不满 -不滿 不然 不然的話 不然的话 @@ -277,7 +268,6 @@ sup 不管怎樣 不經意 不经意 -不胜 不能 不能不 不至于 @@ -306,8 +296,6 @@ sup 且說 且说 两者 -严格 -严重 並 並不 並不是 @@ -325,7 +313,6 @@ sup 中小 中間 中间 -丰富 串行 临 临到 @@ -341,7 +328,6 @@ sup 主张 主張 主要 -举凡 举行 乃 乃至 @@ -355,20 +341,12 @@ sup 之後 之所以 之类 -之類 -乌乎 -乎 -乒 乘 -乘势 乘勝 乘勢 乘机 乘機 乘胜 -乘虚 -乘虛 -乘隙 九 也 也好 @@ -397,7 +375,6 @@ sup 互相 五 些 -交口 亦 产生 亲口 @@ -437,9 +414,6 @@ sup 从中 从事 从今以后 -从优 -从古到今 -从古至今 从头 从宽 从小 @@ -491,17 +465,12 @@ sup 任憑 企图 企圖 -伙同 会 -伟大 传 -传说 -传闻 似乎 似的 但 但凡 -但愿 但是 但願 何 @@ -550,16 +519,9 @@ sup 依據 依照 依靠 -便 便于 便於 係 -促进 -促進 -保持 -保管 -保险 -保險 俺 俺们 俺們 @@ -584,7 +546,6 @@ sup 假使 假如 假若 -偉大 偏偏 做到 偶尔 @@ -592,8 +553,6 @@ sup 偶而 傥然 傳 -傳聞 -傳說 僅 僅僅 像 @@ -633,7 +592,6 @@ sup 八成 公然 六 -兮 共 共同 共总 @@ -676,8 +634,6 @@ sup 再說 再説 再说 -冒 -冲 决不 决定 决非 @@ -765,14 +721,8 @@ sup 加之 加以 加入 -加強 -加强 动不动 -动辄 -勃然 動不動 -動輒 -匆匆 十分 千 千万 @@ -831,8 +781,6 @@ sup 取道 受到 变成 -古來 -古来 另 另一个 另一個 @@ -843,7 +791,6 @@ sup 另行 只 只当 -只怕 只是 只有 只消 @@ -854,12 +801,8 @@ sup 叫做 召开 召開 -叮咚 -叮噹 -叮当 可 可以 -可好 可是 可能 可見 @@ -898,14 +841,11 @@ sup 吧 吧哒 吧噠 -吱 吶 呀 呃 呆呆地 呐 -呕 -呗 呜 呜呼 呢 @@ -913,29 +853,19 @@ sup 周圍 呵 呵呵 -呸 -呼哧 -呼啦 咁 咋 和 -咚 -咦 咧 咱 咱们 咱們 -咳 哇 哈 哈哈 哉 哎 -哎呀 -哎哟 哎喲 -哗 -哗啦 哟 哦 哩 @@ -957,7 +887,6 @@ sup 哪里 哼 哼唷 -唄 唉 唔 唯有 @@ -966,14 +895,10 @@ sup 啊哈 啊哟 啊喲 -問題 -啐 啥 啦 啪达 啪達 -啷噹 -啷当 喀 喂 喏 @@ -987,28 +912,16 @@ sup 嗚 嗚呼 嗡 -嗡嗡 嗬 嗯 -嗳 -嘅 嘍 嘎 -嘎嘎 嘎登 -嘔 -嘘 嘛 嘩 嘩啦 -嘻 嘿 嘿嘿 -噓 -噯 -嚇 -嚴格 -嚴重 四 因 因为 @@ -1019,7 +932,6 @@ sup 因着 因而 因著 -固 固然 在 在下 @@ -1040,7 +952,6 @@ sup 处在 处处 处理 -复杂 多 多么 多亏 @@ -1060,20 +971,13 @@ sup 夠瞧的 夥同 大 -大不了 -大举 -大事 大体 大体上 -大凡 -大力 大多 大多数 大多數 大大 大家 -大张旗鼓 -大張旗鼓 大批 
大抵 大概 @@ -1081,24 +985,15 @@ sup 大約 大约 大致 -大舉 大都 大量 -大面儿上 -大面兒上 大體 大體上 -失去 -奇 -奈 -奋勇 -奮勇 她 她们 她們 她是 她的 -好 好像 好在 好的 @@ -1137,14 +1032,9 @@ sup 它們的 它是 它的 -安全 -完全 完成 定 -实现 实际 -宣布 -容易 密切 實現 實際 @@ -1152,22 +1042,17 @@ sup 寧可 寧肯 寧願 -对 对于 对应 对待 对方 对比 将 -将才 将要 将近 將 -將才 將要 將近 -專門 -對 對待 對應 對方 @@ -1221,7 +1106,6 @@ sup 岂但 岂止 岂非 -川流不息 左右 巨大 巩固 @@ -1234,7 +1118,6 @@ sup 已經 已经 巴 -巴巴 带 帮助 帶 @@ -1266,26 +1149,17 @@ sup 幾時 幾番 幾經 -广大 广泛 应当 应用 应该 -庶乎 -庶几 -庶幾 廣大 廣泛 开外 开始 开展 引起 -弗 -強烈 -強調 弹指之间 -强烈 -强调 彈指之間 归 归根到底 @@ -1300,8 +1174,6 @@ sup 当口儿 当地 当场 -当头 -当庭 当时 当然 当真 @@ -1329,27 +1201,16 @@ sup 得了 得出 得到 -得天独厚 -得天獨厚 -得起 從 從不 從中 從事 從今以後 從來 -從優 -從古到今 -從古至今 -從嚴 -從寬 -從小 -從新 從早到晚 從未 從此 從此以後 -從無到有 從而 從輕 從速 @@ -1363,8 +1224,6 @@ sup 必定 必将 必將 -必然 -必要 必須 必须 快 @@ -1375,16 +1234,12 @@ sup 怎么 怎么办 怎么样 -怎奈 怎样 怎樣 怎麼 怎麼樣 怎麼辦 怎麽 -怕 -急匆匆 -怪 怪不得 总之 总是 @@ -1394,7 +1249,6 @@ sup 总结 总而言之 恍然 -恐怕 恰似 恰好 恰如 @@ -1407,23 +1261,15 @@ sup 您們 您是 惟其 -惯常 意思 -愤然 愿意 慢說 慢説 慢说 慣常 -憑 -憑藉 -憤然 -應用 應當 應該 成为 -成年 -成年累月 成心 成為 我 @@ -1440,7 +1286,6 @@ sup 或者 或許 或许 -战斗 截然 截至 戰鬥 @@ -1451,20 +1296,10 @@ sup 所有 所謂 所谓 -才 -才能 -扑通 -打 -打从 -打开天窗说亮话 -打從 -打開天窗說亮話 -扩大 把 抑或 报导 报道 -抽冷子 拦腰 拿 指 @@ -1472,7 +1307,6 @@ sup 按 按时 按時 -按期 按照 按理 按說 @@ -1508,7 +1342,6 @@ sup 換句話說 換句話説 換言之 -撲通 據 據實 據悉 @@ -1516,7 +1349,6 @@ sup 據此 據稱 據說 -擴大 攔腰 放量 故 @@ -1538,7 +1370,6 @@ sup 方 方便 方才 -方能 方面 於 於是 @@ -1560,11 +1391,8 @@ sup 日漸 日益 日臻 -日見 -日见 时 时候 -昂然 明显 明确 明確 @@ -1578,8 +1406,6 @@ sup 显著 時 時候 -普通 -普遍 暗中 暗地裡 暗地里 @@ -1594,21 +1420,17 @@ sup 曾經 曾经 替 -替代 最 最后 最大 最好 最後 最近 -最高 會 月 有 有些 有关 -有利 -有力 有及 有所 有效 @@ -1645,10 +1467,6 @@ sup 极为 极了 极其 -极力 -极大 -极度 -极端 构成 果然 果真 @@ -1661,12 +1479,9 @@ sup 根據 根本 格外 -梆 -極 極了 極其 極力 -極大 極度 極為 極端 @@ -1677,12 +1492,9 @@ sup 欢迎 欤 歟 -歡迎 -正值 正在 正如 正巧 -正常 正是 此 此中 @@ -1701,7 +1513,6 @@ sup 歸 歸根到底 歸根結底 -歸齊 殆 毋宁 毋寧 @@ -1734,7 +1545,6 @@ sup 毫無 毫無例外 毫無保留地 -汝 決不 決定 決非 @@ -1743,22 +1553,14 @@ sup 沒有 沙沙 没 -没奈何 没有 沿 沿着 沿著 況且 注意 -活 -深入 -清楚 湊巧 準備 -满 -满足 -滿 -滿足 漫說 漫説 漫说 @@ -1771,8 +1573,6 @@ sup 為止 為此 為著 -烏乎 -焉 無 無寧 無法 @@ -1800,15 +1600,10 @@ sup 牢牢 特別是 
特别是 -特殊 特点 -特約 -特约 特點 -犹且 犹自 独 -独媒特约 独自 猛然 猛然間 @@ -1816,15 +1611,12 @@ sup 猶且 猶自 獨 -獨媒特約 獨自 獲得 率尔 率然 率爾 -现代 现在 -現代 現在 理应 理当 @@ -1869,7 +1661,6 @@ sup 當前 當即 當口兒 -當地 當場 當庭 當時 @@ -1877,8 +1668,6 @@ sup 當真 當着 當著 -當頭 -白 白白 的 的确 @@ -1886,7 +1675,6 @@ sup 的話 的话 皆可 -盡 盡可能 盡如人意 盡心盡力 @@ -1937,8 +1725,6 @@ sup 确定 碰巧 確定 -社会主义 -社會主義 离 种 积极 @@ -1949,12 +1735,8 @@ sup 稱 積極 究竟 -穷年累月 突出 突然 -窃 -窮年累月 -竊 立 立刻 立即 @@ -2030,7 +1812,6 @@ sup 纵令 纵使 纵然 -练习 组成 经 经常 @@ -2044,7 +1825,6 @@ sup 绝非 绝顶 继之 -继后 继续 继而 维持 @@ -2052,8 +1832,6 @@ sup 缕缕 罢了 罷了 -老 -老大 老是 老老实实 老老實實 @@ -2074,11 +1852,6 @@ sup 而論 而论 联系 -联袂 -聯繫 -聯袂 -背地裡 -背地里 背靠背 能 能否 @@ -2098,7 +1871,6 @@ sup 自己 自後 自從 -自打 自身 臭 至 @@ -2112,17 +1884,13 @@ sup 與其說 與否 與此同時 -舉凡 舉行 般的 良好 若 -若夫 若是 -若果 若非 范围 -莫 莫不 莫不然 莫如 @@ -2147,12 +1915,10 @@ sup 行动 行動 行為 -衝 表明 表示 被 裡面 -複雜 要 要不 要不是 @@ -2178,7 +1944,6 @@ sup 許多 話說 該 -該當 認為 認爲 認真 @@ -2195,18 +1960,13 @@ sup 誰知 請勿 論 -論說 諸 諸位 諸如 謹 -譬喻 譬如 變成 讓 -认为 -认真 -认识 让 许多 论 @@ -2237,12 +1997,9 @@ sup 豈但 豈止 豈非 -豐富 -賊死 賴以 贼死 赖以 -赶 赶快 赶早不赶晚 起 @@ -2255,10 +2012,6 @@ sup 起见 起頭 起首 -趁 -趁便 -趁势 -趁勢 趁早 趁机 趁機 @@ -2267,8 +2020,6 @@ sup 趁着 趁著 越是 -趕 -趕快 趕早不趕晚 距 跟 @@ -2278,23 +2029,13 @@ sup 較之 較比 較為 -轉動 -轉變 -轉貼 -轟然 -转动 -转变 -转贴 -轰然 较 较为 较之 较比 边 达到 -达旦 迄 -迅速 过 过于 过去 @@ -2334,7 +2075,6 @@ sup 进入 进去 进来 -进步 进而 进行 连 @@ -2400,8 +2140,6 @@ sup 過去 過於 達到 -達旦 -適應 適用 適當 遭到 @@ -2468,7 +2206,6 @@ sup 關於 问题 间或 -防止 阿 附近 陈年 @@ -2487,7 +2224,6 @@ sup 除此而外 除開 除非 -陳年 随 随后 随时 @@ -2504,7 +2240,6 @@ sup 难说 难道 难道说 -集中 雖 雖則 雖然 @@ -2517,24 +2252,20 @@ sup 難說 難道 難道說 -雲爾 零 需要 非但 非常 非徒 非得 -非特 非独 非獨 靠 -鞏固 頂多 頃 頃刻 頃刻之間 頃刻間 -順 順着 順著 頓時 @@ -2555,12 +2286,10 @@ sup 風雨無阻 风雨无阻 飽 -餘外 餵 饱 首先 馬上 -騰 马上 高低 高兴 @@ -2568,8 +2297,6 @@ sup 麼 默然 默默地 -齊 -齐 ︿ ! 
# From 0d45c2a014566d37edd415041e8c8fa8248f1729 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 1 Apr 2021 22:35:32 +0300 Subject: [PATCH 042/175] Add API hack to show word clouds filtered with old and new stopword lists I've duplicated the current stopword lists as such: * `ca/ca_stop_words.txt` - the updated list of stopwords once @jtotoole finalizes the list * `ca/ca_stop_words_old.txt` - the old, overly aggressive list of stopwords * ... then did some ugly copy-pasting of some API code for the relevant endpoints to support the `old_stopwords` parameter. Once set to something like `1`, the API call will do the word cloud filtering using the "old" stopword list instead of the "new" one. Example: * `stories/list?q=...` - uses the new, non-aggressive stopword list; * `stories/list?q=...&old_stopwords=1` - uses the old, overly aggressive stopword list. The idea is that once @jtotoole comes up with his final list of "new" stopwords, we can merge his list on top of this PR, deploy, and that way we'll get a rather easy way to compare stopwords. API calls that use stopword filtering to create word clouds and thus support the `old_stopwords` parameter: * `wc/list` * `stories/word_matrix` * any call that lists stories (e.g. `stories/list`) *and* passes the `show_wc` parameter (so @dsjen, you can find all uses of the call by grepping for `show_wc`) https://github.com/mediacloud/api-client hasn't been updated accordingly so you might want to do that too. Also, I have no idea how the caching layer might treat this as I'm not familiar with it. @dsjen, is this something that you could use on your end? If so, @jtotoole, could you rebase your stopword branch on top of this one? 
--- .../Languages/Language/PythonWrapper.pm | 9 + .../python/mediawords/languages/__init__.py | 39 + .../languages/ca/ca_stop_words_old.txt | 781 ++++ .../languages/da/da_stop_words_old.txt | 101 + .../languages/de/de_stop_words_old.txt | 238 + .../languages/es/es_stop_words_old.txt | 373 ++ .../languages/fi/fi_stop_words_old.txt | 242 + .../languages/fr/fr_stop_words_old.txt | 185 + .../languages/ha/ha_stop_words_old.txt | 46 + .../mediawords/languages/hi/__init__.py | 3 + .../languages/hi/hi_stop_words_old.txt | 228 + .../languages/hu/hu_stop_words_old.txt | 206 + .../languages/it/it_stop_words_old.txt | 286 ++ .../languages/ja/ja_stop_words_old.txt | 636 +++ .../languages/lt/lt_stop_words_old.txt | 133 + .../languages/nl/nl_stop_words_old.txt | 108 + .../languages/no/no_stop_words_old.txt | 183 + .../languages/pt/pt_stop_words_old.txt | 4062 +++++++++++++++++ .../languages/ro/ro_stop_words_old.txt | 440 ++ .../languages/ru/ru_stop_words_old.txt | 909 ++++ .../languages/sv/sv_stop_words_old.txt | 153 + .../languages/tr/tr_stop_words_old.txt | 269 ++ .../mediawords/languages/zh/__init__.py | 3 + .../languages/zh/zh_stop_words_old.txt | 2727 +++++++++++ .../Controller/Api/V2/StoriesBase.pm | 28 +- .../perl/MediaWords/Controller/Api/V2/Wc.pm | 11 +- .../DBI/Stories/WordMatrixOldStopwords.pm | 152 + .../MediaWords/Solr/WordCountsOldStopwords.pm | 447 ++ 28 files changed, 12992 insertions(+), 6 deletions(-) create mode 100644 apps/common/src/python/mediawords/languages/ca/ca_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/da/da_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/de/de_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/es/es_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/fi/fi_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/fr/fr_stop_words_old.txt create mode 100644 
apps/common/src/python/mediawords/languages/ha/ha_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/hi/hi_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/hu/hu_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/it/it_stop_words_old.txt create mode 100755 apps/common/src/python/mediawords/languages/ja/ja_stop_words_old.txt create mode 100755 apps/common/src/python/mediawords/languages/lt/lt_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/nl/nl_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/no/no_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/pt/pt_stop_words_old.txt create mode 100755 apps/common/src/python/mediawords/languages/ro/ro_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/ru/ru_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/sv/sv_stop_words_old.txt create mode 100755 apps/common/src/python/mediawords/languages/tr/tr_stop_words_old.txt create mode 100644 apps/common/src/python/mediawords/languages/zh/zh_stop_words_old.txt create mode 100644 apps/webapp-api/src/perl/MediaWords/DBI/Stories/WordMatrixOldStopwords.pm create mode 100644 apps/webapp-api/src/perl/MediaWords/Solr/WordCountsOldStopwords.pm diff --git a/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm b/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm index 384318b8e6..67dad9a001 100644 --- a/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm +++ b/apps/common/src/perl/MediaWords/Languages/Language/PythonWrapper.pm @@ -71,6 +71,15 @@ sub stop_words_map($) return $stop_words_map; } +# FIXME remove once stopword comparison is over +sub stop_words_old_map($) +{ + my $self = shift; + + my $stop_words_old_map = $self->{ _python_lang }->stop_words_old_map(); + return $stop_words_old_map; 
+} + sub stem_words($$) { my ( $self, $words ) = @_; diff --git a/apps/common/src/python/mediawords/languages/__init__.py b/apps/common/src/python/mediawords/languages/__init__.py index b985445af4..2371247c27 100644 --- a/apps/common/src/python/mediawords/languages/__init__.py +++ b/apps/common/src/python/mediawords/languages/__init__.py @@ -81,6 +81,12 @@ def stop_words_map(self) -> Dict[str, bool]: """ raise NotImplementedError("Abstract method.") + # FIXME remove once stopword comparison is over + @abc.abstractmethod + def stop_words_old_map(self) -> Dict[str, bool]: + """Return map of old stopwords.""" + raise NotImplementedError("Abstract method.") + @abc.abstractmethod def stem_words(self, words: List[str]) -> List[str]: """Return list of stems for a list of words. @@ -283,6 +289,9 @@ def __init__(self): # Stop words map (lazy initialized) self.__stop_words_map = None + # FIXME remove once stopword comparison is over + self.__stop_words_old_map = None + def stop_words_map(self) -> Dict[str, bool]: """Return stop word map read from a file.""" if self.__stop_words_map is None: @@ -312,3 +321,33 @@ def stop_words_map(self) -> Dict[str, bool]: self.__stop_words_map = stop_words return self.__stop_words_map + + # FIXME remove once stopword comparison is over + def stop_words_old_map(self) -> Dict[str, bool]: + if self.__stop_words_old_map is None: + + stop_words_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + self.language_code(), + '%s_stop_words_old.txt' % self.language_code(), + ) + if stop_words_path is None: + raise McLanguageException("Stop words file path is None.") + + if not os.path.isfile(stop_words_path): + raise McLanguageException("Stop words file does not exist at path '%s'." 
% stop_words_path) + + stop_words = dict() + with open(stop_words_path, 'r', encoding='utf-8') as f: + for stop_word in f.readlines(): + # Remove comments + stop_word = re.sub(r'\s*?#.*?$', '', stop_word) + + stop_word = stop_word.strip() + + if len(stop_word) > 0: + stop_words[stop_word] = True + + self.__stop_words_old_map = stop_words + + return self.__stop_words_old_map diff --git a/apps/common/src/python/mediawords/languages/ca/ca_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ca/ca_stop_words_old.txt new file mode 100644 index 0000000000..eaf6168385 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ca/ca_stop_words_old.txt @@ -0,0 +1,781 @@ +# +# This is a stop word list for the Catalan language. +# +# Sources: +# https://raw.githubusercontent.com/stopwords-iso/stopwords-ca/master/stopwords-ca.txt +# http://latel.upf.edu/morgana/altres/pub/ca_stop.htm +# https://www.ranks.nl/stopwords/catalan +# + +a +abans +abans-d'ahir +abintestat +ací +adesiara +adàgio +adés +adéu +ah +ahir +ai +aitambé +aitampoc +aitan +aitant +aitantost +aixà +així +això +al +aleshores +algun +alguna +algunes +alguns +algú +alhora +allà +allèn +allí +allò +almenys +als +alto +altra +altre +altres +altresí +altri +al·legro +alça +amargament +amb +ambdues +ambdós +amunt +amén +anar +anc +andante +andantino +anit +ans +antany +apa +aprés +aqueix +aqueixa +aqueixes +aqueixos +aqueixs +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquèn +aquí +ara +arran +arrera +arrere +arreu +arri +arruix +atxim +au +avall +avant +aviat +avui +açò +bah +baix +baldament +ballmanetes +banzim-banzam +bastant +bastants +ben +bis +bitllo-bitllo +bo +bé +ca +cada +cadascuna +cadascunes +cadascuns +cadascú +cal +cap +car +caram +catorze +cent +centes +cents +cerca +cert +certa +certes +certs +cinc +cinquanta +cinquena +cinquenes +cinquens +cinquè +com +comsevulla +consegueixo +conseguim +conseguir +consigueix +consigueixen +consigueixes +contra +cordons 
+corrents +cric-crac +d +d'un +d'una +d'unes +d'uns +daixonses +daixò +dallonses +dallò +dalt +daltabaix +damunt +darrera +darrere +davall +davant +de +debades +dedins +defora +dejorn +dejús +del +dellà +dels +dementre +dempeus +demà +demés +des +des de +desena +desenes +desens +després +dessobre +dessota +dessús +desè +deu +devers +devora +deçà +diferents +dinou +dins +dintre +disset +divers +diversa +diverses +diversos +divuit +donat +doncs +dos +dotze +dues +durant +e +ecs +eh +el +ela +elis +ell +ella +elles +ells +els +em +emperò +en +enans +enant +encara +encontinent +endalt +endarrera +endarrere +endavant +endebades +endemig +endemà +endemés +endins +endintre +enfora +engir +enguany +enguanyasses +enjús +enlaire +enlloc +enllà +enrera +enrere +ens +ensems +ensota +ensús +entorn +entre +entremig +entretant +entrò +envers +envides +environs +enviró +ençà +ep +era +erem +eren +eres +ergo +es +escar +essent +esser +est +esta +estada +estades +estan +estant +estar +estaran +estarem +estareu +estaria +estarien +estaries +estarà +estaràs +estaré +estaríem +estaríeu +estat +estats +estava +estaven +estaves +estem +estes +esteu +estic +estiguem +estigueren +estigueres +estigues +estiguessis +estigueu +estigui +estiguin +estiguis +estigué +estiguérem +estiguéreu +estigués +estiguí +estos +està +estàs +estàvem +estàveu +et +etc +etcètera +ets +excepte +fa +faig +fan +fas +fem +fer +fer faig +feu +fi +fins +fora +foren +fores +força +fos +fossin +fossis +fou +fra +fui +fóra +fórem +fóreu +fóssim +fóssiu +gaire +gairebé +gaires +gens +girientorn +gratis +ha +hagi +hagin +hagis +haguda +hagudes +hagueren +hagueres +haguessin +haguessis +hagut +haguts +hagué +haguérem +haguéreu +hagués +haguéssim +haguéssiu +haguí +hala +han +has +hauran +haurem +haureu +hauria +haurien +hauries +haurà +hauràs +hauré +hauríem +hauríeu +havem +havent +haver +haveu +havia +havien +havies +havíem +havíeu +he +hem +heu +hi +ho +hom +hui +hàgim +hàgiu +i +igual +iguals +inclusive +inclòs +ja 
+jamai +jo +l +l'hi +la +leri-leri +les +li +li'n +lla +llarg +llavors +llevat +lluny +llur +llurs +lo +los +ls +m +m'he +ma +mai +mal +malament +malgrat +manco +mant +manta +mantes +mantinent +mants +massa +mateix +mateixa +mateixes +mateixos +me +mentre +mentrestant +menys +mes +meu +meua +meues +meus +meva +meves +mi +mig +mil +mitges +mitja +mitjançant +mitjos +mode +moixoni +molt +molta +moltes +molts +mon +mons +mos +més +n +n'he +n'hi +na +ne +ni +ningú +no +nogensmenys +només +noranta +nos +nosaltres +nostra +nostre +nostres +nou +novena +novenes +novens +novè +ns +nòs +nós +o +oh +oi +oidà +on +onsevulga +onsevulla +onze +pas +pel +pels +pengim-penjam +per +per que +perquè +pertot +però +piano +pla +poc +poca +pocs +podem +poden +poder +podeu +poques +potser +prest +primer +primera +primeres +primers +pro +prompte +prop +propi +prou +puc +puix +pus +pàssim +qual +quals +qualsevol +qualsevulla +qualssevol +qualssevulla +quan +quant +quanta +quantes +quants +quaranta +quart +quarta +quartes +quarts +quasi +quatre +que +quelcom +qui +quin +quina +quines +quins +quinze +quisvulla +què +ran +re +rebé +renoi +rera +rere +res +retruc +s +s'ha +s'han +sa +sabem +saben +saber +sabeu +salvament +salvant +salvat +sap +saps +se +segon +segona +segones +segons +seguida +seixanta +semblant +semblants +sempre +sengles +sens +sense +ser +seran +serem +sereu +seria +serien +series +serà +seràs +seré +seríem +seríeu +ses +set +setanta +setena +setenes +setens +setze +setè +seu +seua +seues +seus +seva +seves +si +sia +siau +sic +siguem +sigues +sigueu +sigui +siguin +siguis +sinó +sis +sisena +sisenes +sisens +sisè +sobre +sobretot +soc +sol +sola +solament +soles +sols +som +son +sons +sos +sota +sots +sou +sovint +suara +sí +sóc +són +t +t'ha +t'han +t'he +ta +tal +tals +també +tampoc +tan +tanmateix +tant +tanta +tantes +tantost +tants +te +tene +tenim +tenir +teniu +tercer +tercera +terceres +tercers +tes +teu +teua +teues +teus +teva +teves +tinc +ton +tons +tos +tost 
+tostemps +tot +tota +total +totes +tothom +tothora +tots +trenta +tres +tret +tretze +tu +tururut +u +uf +ui +uix +ultra +un +una +unes +uns +up +upa +us +va +vagi +vagin +vagis +vaig +vair +vam +van +vares +vas +vau +vem +verbigràcia +vers +vet +veu +vint +vora +vos +vosaltres +vostra +vostre +vostres +vostè +vostès +vuit +vuitanta +vuitena +vuitenes +vuitens +vuitè +vàreig +vàrem +vàreu +vés +vós +xano-xano +xau-xau +xec +àdhuc +àlies +ça +ço +érem +éreu +és +éssent +ésser +ídem +òlim +últim +última +últimes +últims +únic +única +únics +úniques +ús diff --git a/apps/common/src/python/mediawords/languages/da/da_stop_words_old.txt b/apps/common/src/python/mediawords/languages/da/da_stop_words_old.txt new file mode 100644 index 0000000000..220a35602a --- /dev/null +++ b/apps/common/src/python/mediawords/languages/da/da_stop_words_old.txt @@ -0,0 +1,101 @@ +# +# This is a stop word list for the Danish language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +ad +af +alle +alt +anden +at +blev +blive +bliver +da +de +dem +den +denne +der +deres +det +dette +dig +din +disse +dog +du +efter +eller +en +end +er +et +for +fra +ham +han +hans +har +havde +have +hende +hendes +her +hos +hun +hvad +hvis +hvor +i +ikke +ind +jeg +jer +jo +kunne +man +mange +med +meget +men +mig +min +mine +mit +mod +ned +noget +nogle +nu +når +og +også +om +op +os +over +på +selv +sig +sin +sine +sit +skal +skulle +som +sådan +thi +til +ud +under +var +vi +vil +ville +vor +være +været diff --git a/apps/common/src/python/mediawords/languages/de/de_stop_words_old.txt b/apps/common/src/python/mediawords/languages/de/de_stop_words_old.txt new file mode 100644 index 0000000000..aad240c48c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/de/de_stop_words_old.txt @@ -0,0 +1,238 @@ +# +# This is a stop word list for the German language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +das +dasselbe +dazu +daß +dein +deine +deinem +deinen +deiner +deines +dem +demselben +den +denn +denselben +der +derer +derselbe +derselben +des +desselben +dessen +dich +die +dies +diese +dieselbe +dieselben +diesem +diesen +dieser +dieses +dir +doch +dort +du +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +es +etwas +euch +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +ihm +ihn +ihnen +ihr +ihre +ihrem +ihren +ihrer +ihres +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mich +mir +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +während +würde +würden +zu +zum +zur +zwar +zwischen +über diff --git a/apps/common/src/python/mediawords/languages/es/es_stop_words_old.txt b/apps/common/src/python/mediawords/languages/es/es_stop_words_old.txt new file mode 100644 index 0000000000..4f08f76cb8 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/es/es_stop_words_old.txt @@ -0,0 +1,373 @@ +# +# This is a stop word list for the 
Spanish language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +acerca +además +adónde +al +algo +algunas +algunos +ante +antes +aquel +aquella +aquellas +aquellos +aseveró +así +aunque +cada +como +con +contra +cual +cuales +cualquiera +cuando +cuál +cuáles +cuánto +de +debe +del +desde +después +destacó +dicho +dijo +donde +durante +e +el +ella +ellas +ellos +en +entre +era +erais +eran +eras +eres +es +esa +esas +ese +eso +esos +esta +estaba +estabais +estaban +estabas +estad +estada +estadas +estado +estados +estamos +estando +estar +estaremos +estará +estarán +estarás +estaré +estaréis +estaría +estaríais +estaríamos +estarían +estarías +estas +este +estemos +esto +estos +estoy +estuve +estuviera +estuvierais +estuvieran +estuvieras +estuvieron +estuviese +estuvieseis +estuviesen +estuvieses +estuvimos +estuviste +estuvisteis +estuviéramos +estuviésemos +estuvo +está +estábamos +estáis +están +estás +esté +estéis +estén +estés +excepto +expresó +fue +fuera +fuerais +fueran +fueras +fueron +fuese +fueseis +fuesen +fueses +fui +fuimos +fuiste +fuisteis +fuéramos +fuésemos +ha +habida +habidas +habido +habidos +habiendo +habremos +habrá +habrán +habrás +habré +habréis +habría +habríais +habríamos +habrían +habrías +habéis +había +habíais +habíamos +habían +habías +hace +hacer +hacia +hacía +han +has +hasta +hay +haya +hayamos +hayan +hayas +hayáis +he +hemos +hicieron +hicimos +hube +hubiera +hubierais +hubieran +hubieras +hubieron +hubiese +hubieseis +hubiesen +hubieses +hubimos +hubiste +hubisteis +hubiéramos +hubiésemos +hubo +indicó +informó +la +lado +lados +las +le +les +lleva +lo +los +luego +me +mediante +mi +mis +misma +mismo +mucho +muchos +muy +más +mí +mía +mías +mío +míos +nada +ni +no +nos +nosotras +nosotros +nuestra +nuestras +nuestro +nuestros +o +obstante +os +otra +otras +otro +otros +para +parte +pero +poco +por +porque +porqué +pudieron +pudiese +pudimos +puede +que +quien +quienes +qué +se +sea +seamos 
+sean +seas +según +seremos +será +serán +serás +seré +seréis +sería +seríais +seríamos +serían +serías +seáis +señaló +si +sido +siendo +sin +sobre +sois +solo +solía +somos +son +soy +su +suele +sus +suya +suyas +suyo +suyos +sí +sólo +también +tanto +te +tendremos +tendrá +tendrán +tendrás +tendré +tendréis +tendría +tendríais +tendríamos +tendrían +tendrías +tened +tenemos +tenga +tengamos +tengan +tengas +tengo +tengáis +tenida +tenidas +tenido +tenidos +teniendo +tenéis +tenía +teníais +teníamos +tenían +tenías +ti +tiene +tienen +tienes +toda +todas +todo +todos +tras +través +tu +tus +tuve +tuviera +tuvierais +tuvieran +tuvieras +tuvieron +tuviese +tuvieseis +tuviesen +tuvieses +tuvimos +tuviste +tuvisteis +tuviéramos +tuviésemos +tuvo +tuya +tuyas +tuyo +tuyos +tú +un +una +unas +uno +unos +vez +vosotras +vosotros +vuestra +vuestras +vuestro +vuestros +y +ya +yo +él +éramos diff --git a/apps/common/src/python/mediawords/languages/fi/fi_stop_words_old.txt b/apps/common/src/python/mediawords/languages/fi/fi_stop_words_old.txt new file mode 100644 index 0000000000..aa2cb4cdf7 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/fi/fi_stop_words_old.txt @@ -0,0 +1,242 @@ +# +# This is a stop word list for the Finnish language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +ei +eivät +emme +en +et +ette +että +he +heidän +heidät +heihin +heille +heillä +heiltä +heissä +heistä +heitä +hän +häneen +hänelle +hänellä +häneltä +hänen +hänessä +hänestä +hänet +häntä +itse +ja +johon +joiden +joihin +joiksi +joilla +joille +joilta +joina +joissa +joista +joita +joka +joksi +jolla +jolle +jolta +jona +jonka +jos +jossa +josta +jota +jotka +kanssa +keiden +keihin +keiksi +keille +keillä +keiltä +keinä +keissä +keistä +keitä +keneen +keneksi +kenelle +kenellä +keneltä +kenen +kenenä +kenessä +kenestä +kenet +ketkä +ketkä +ketä +koska +kuin +kuka +kun +me +meidän +meidät +meihin +meille +meillä +meiltä +meissä +meistä +meitä +mihin +miksi +mikä +mille +millä +miltä +minkä +minkä +minua +minulla +minulle +minulta +minun +minussa +minusta +minut +minuun +minä +minä +missä +mistä +mitkä +mitä +mukaan +mutta +ne +niiden +niihin +niiksi +niille +niillä +niiltä +niin +niin +niinä +niissä +niistä +niitä +noiden +noihin +noiksi +noilla +noille +noilta +noin +noina +noissa +noista +noita +nuo +nyt +näiden +näihin +näiksi +näille +näillä +näiltä +näinä +näissä +näistä +näitä +nämä +ole +olemme +olen +olet +olette +oli +olimme +olin +olisi +olisimme +olisin +olisit +olisitte +olisivat +olit +olitte +olivat +olla +olleet +ollut +on +ovat +poikki +se +sekä +sen +siihen +siinä +siitä +siksi +sille +sillä +sillä +siltä +sinua +sinulla +sinulle +sinulta +sinun +sinussa +sinusta +sinut +sinuun +sinä +sinä +sitä +tai +tallä +te +teidän +teidät +teihin +teille +teillä +teiltä +teissä +teistä +teitä +tuo +tuohon +tuoksi +tuolla +tuolle +tuolta +tuon +tuona +tuossa +tuosta +tuotä +tähän +täksi +tälle +tältä +tämä +tämän +tänä +tässä +tästä +tätä +vaan +vai +vaikka +yli diff --git a/apps/common/src/python/mediawords/languages/fr/fr_stop_words_old.txt b/apps/common/src/python/mediawords/languages/fr/fr_stop_words_old.txt new file mode 100644 index 0000000000..2f7ed427ca --- /dev/null +++ 
b/apps/common/src/python/mediawords/languages/fr/fr_stop_words_old.txt @@ -0,0 +1,185 @@ +# +# This is a stop word list for the French language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a-t-on +ai +aie +aient +aies +ait +as +au +aura +aurai +auraient +aurais +aurait +auras +aurez +auriez +aurions +aurons +auront +aux +avaient +avais +avait +avec +avez +aviez +avions +avons +ayant +ayante +ayantes +ayants +ayez +ayons +c +ce +celle +ces +d +d'une +dans +de +des +donc +dont +du +elle +en +es +est +et +eu +eue +eues +eurent +eus +eusse +eussent +eusses +eussiez +eussions +eut +eux +eûmes +eût +eûtes +furent +fus +fusse +fussent +fusses +fussiez +fussions +fut +fûmes +fût +fûtes +il +ils +j +je +l +l' +la +le +les +leur +leurs +lui +m +ma +mais +me +mes +moi +mon +même +n +n'a +n'est +ne +ni +nos +notre +nous +on +ont +ou +où +par +pas +pour +qu +qu'elle +qu'il +qu'on +qu'une +quand +que +qui +s +s'est +sa +se +sera +serai +seraient +serais +serait +seras +serez +seriez +serions +serons +seront +ses +si +soient +sois +soit +sommes +son +sont +soyez +soyons +suis +sur +t +ta +te +tes +toi +ton +tu +un +une +va +vais +vos +votre +vous +y +à +étaient +étais +était +étant +étante +étantes +étants +étiez +étions +été +étée +étées +étés +êtes +être diff --git a/apps/common/src/python/mediawords/languages/ha/ha_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ha/ha_stop_words_old.txt new file mode 100644 index 0000000000..07c7723d36 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ha/ha_stop_words_old.txt @@ -0,0 +1,46 @@ +# +# This is a stop word list for the Hausa language. 
+# +# Sources: +# https://github.com/stopwords-iso/stopwords-ha/blob/master/raw/gh-stopwords-json-ha.txt +# + +a +amma +ba +ban +ce +cikin +da +don +ga +in +ina +ita +ji +ka +ko +kuma +lokacin +ma +mai +na +ne +ni +sai +shi +su +suka +sun +ta +tafi +take +tana +wani +wannan +wata +ya +yake +yana +yi +za diff --git a/apps/common/src/python/mediawords/languages/hi/__init__.py b/apps/common/src/python/mediawords/languages/hi/__init__.py index 4a98351b3b..12f6e60b8f 100644 --- a/apps/common/src/python/mediawords/languages/hi/__init__.py +++ b/apps/common/src/python/mediawords/languages/hi/__init__.py @@ -21,6 +21,9 @@ class HindiLanguage(StopWordsFromFileMixIn): # Stop words map '__stop_words_map', + # FIXME remove once stopword comparison is over + '__stop_words_old_map', + # Hunspell instance '__hindi_hunspell', diff --git a/apps/common/src/python/mediawords/languages/hi/hi_stop_words_old.txt b/apps/common/src/python/mediawords/languages/hi/hi_stop_words_old.txt new file mode 100644 index 0000000000..27440bfb15 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/hi/hi_stop_words_old.txt @@ -0,0 +1,228 @@ +# +# This is a stop word list for the Hindi language. 
+# +# Sources: +# http://www.ranks.nl/stopwords/hindi +# http://members.unine.ch/jacques.savoy/clef/hindiST.txt +# https://sites.google.com/site/kevinbouge/stopwords-lists +# http://resgtholpadi.blogspot.com/2012/07/hindi-stop-words-list.html +# + +अंदर +अत +अथवा +अन्य +अपना +अपनी +अपने +अब +अभी +आज +आदि +आप +इत्यादि +इन +इन +इनका +इनके +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उच्च +उत्तर +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हें +उन्हों +उस +उसकी +उसके +उसी +उसे +ऊपर +एक +एवं +एस +ऐसा +ऐसे +और +कई +कभी +कम +कर +करता +करते +करना +करने +करें +कल +कहते +कहा +का +काफ़ी +कि +किए +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गई +गए +गया +गयी +गये +घर +जब +जहाँ +जा +जाता +जाती +जाते +जाने +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसमें +जिससे +जिसे +जीधर +जैसा +जैसे +जो +तक +तथा +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तुम +तो +था +थी +थे +दबारा +दिया +दुसरा +दूर +दूसरे +दो +दोनों +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +परंतु +पहले +पूरा +पूरे +पे +प्रति +फिर +बड़ा +बड़े +बनी +बही +बहुत +बाद +बाला +बाहर +बिलकुल +बीच +भी +भीतर +मगर +मध्य +मानो +मे +में +मै +यदि +यह +यहाँ +यहां +यही +या +यिह +ये +रखें +रहती +रहा +रहे +ऱ्वासा +लिए +लिया +लिये +लेकर +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकती +सकते +सबसे +सभी +समय +साथ +साबुत +साभ +सारा +से +सो +स्थान +ही +हुआ +हुई +हुए +हुये +है +हैं +हो +होता +होती +होते +होना +होने +के diff --git a/apps/common/src/python/mediawords/languages/hu/hu_stop_words_old.txt b/apps/common/src/python/mediawords/languages/hu/hu_stop_words_old.txt new file mode 100644 index 0000000000..13c70d9d6f --- /dev/null +++ b/apps/common/src/python/mediawords/languages/hu/hu_stop_words_old.txt @@ -0,0 +1,206 @@ +# +# This is a stop word list for the Hungarian language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +abban +ahhoz +ahogy +ahol +aki +akik +akkor +alatt +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amikor +amit +amolyan +amíg +annak +arra +arról +az +azok +azon +azonban +azt +aztán +azután +azzal +azért +be +belül +benne +bár +cikk +cikkek +cikkeket +csak +de +e +ebben +eddig +egy +egyes +egyetlen +egyik +egyre +egyéb +egész +ehhez +ekkor +el +ellen +elsõ +elég +elõ +elõször +elõtt +emilyen +ennek +erre +ez +ezek +ezen +ezt +ezzel +ezért +fel +felé +hanem +hiszen +hogy +hogyan +igen +ill +ill. +illetve +ilyen +ilyenkor +ismét +ison +itt +jobban +jó +jól +kell +kellett +keressünk +keresztül +ki +kívül +között +közül +legalább +legyen +lehet +lehetett +lenne +lenni +lesz +lett +maga +magát +majd +majd +meg +mellett +mely +melyek +mert +mi +mikor +milyen +minden +mindenki +mindent +mindig +mint +mintha +mit +mivel +miért +most +már +más +másik +még +míg +nagy +nagyobb +nagyon +ne +nekem +neki +nem +nincs +néha +néhány +nélkül +olyan +ott +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +szemben +szerint +szinte +számára +talán +tehát +teljes +tovább +továbbá +több +ugyanis +utolsó +után +utána +vagy +vagyis +vagyok +valaki +valami +valamint +való +van +vannak +vele +vissza +viszont +volna +volt +voltak +voltam +voltunk +által +általában +át +én +éppen +és +így +õ +õk +õket +össze +úgy +új +újabb +újra diff --git a/apps/common/src/python/mediawords/languages/it/it_stop_words_old.txt b/apps/common/src/python/mediawords/languages/it/it_stop_words_old.txt new file mode 100644 index 0000000000..4448e81c70 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/it/it_stop_words_old.txt @@ -0,0 +1,286 @@ +# +# This is a stop word list for the Italian language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +abbia +abbiamo +abbiano +abbiate +ad +agl +agli +ai +al +all +alla +alle +allo +anche +avemmo +avendo +avesse +avessero +avessi +avessimo +aveste +avesti +avete +aveva +avevamo +avevano +avevate +avevi +avevo +avrai +avranno +avrebbe +avrebbero +avrei +avremmo +avremo +avreste +avresti +avrete +avrà +avrò +avuta +avute +avuti +avuto +c +che +chi +ci +coi +col +come +con +contro +cui +da +dagl +dagli +dai +dal +dall +dalla +dalle +dallo +degl +degli +dei +del +dell +della +delle +dello +di +dov +dove +e +ebbe +ebbero +ebbi +ed +era +erano +eravamo +eravate +eri +ero +essendo +faccia +facciamo +facciano +facciate +faccio +facemmo +facendo +facesse +facessero +facessi +facessimo +faceste +facesti +faceva +facevamo +facevano +facevate +facevi +facevo +fai +fanno +farai +faranno +farebbe +farebbero +farei +faremmo +faremo +fareste +faresti +farete +farà +farò +fece +fecero +feci +fosse +fossero +fossi +fossimo +foste +fosti +fu +fui +fummo +furono +gli +ha +hai +hanno +ho +i +il +in +io +l +la +le +lei +li +lo +loro +lui +ma +mi +mia +mie +miei +mio +ne +negl +negli +nei +nel +nell +nella +nelle +nello +noi +non +nostra +nostre +nostri +nostro +o +per +perché +più +quale +quanta +quante +quanti +quanto +quella +quelle +quelli +quello +questa +queste +questi +questo +sarai +saranno +sarebbe +sarebbero +sarei +saremmo +saremo +sareste +saresti +sarete +sarà +sarò +se +sei +si +sia +siamo +siano +siate +siete +sono +sta +stai +stando +stanno +starai +staranno +starebbe +starebbero +starei +staremmo +staremo +stareste +staresti +starete +starà +starò +stava +stavamo +stavano +stavate +stavi +stavo +stemmo +stesse +stessero +stessi +stessimo +steste +stesti +stette +stettero +stetti +stia +stiamo +stiano +stiate +sto +su +sua +sue +sugl +sugli +sui +sul +sull +sulla +sulle +sullo +suo +suoi +ti +tra +tu +tua +tue +tuo +tuoi +tutti +tutto +un +una +uno +vi +voi +vostra +vostre +vostri +vostro +è 
diff --git a/apps/common/src/python/mediawords/languages/ja/ja_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ja/ja_stop_words_old.txt new file mode 100755 index 0000000000..bfff6d32ff --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ja/ja_stop_words_old.txt @@ -0,0 +1,636 @@ +# +# This is a stop word list for the Japanese language. +# +# Sources: +# https://github.com/stopwords/japanese-stopwords/blob/master/data/japanese-stopwords.txt +# Lucene's stopwords_ja.txt +# + +$ +% +& +@ +lwa +posted +ref +url +” +… +▽ +、 +。 +「 +」 +『 +』 +【 +】 +あそこ +あたり +あちら +あっ +あった +あっち +あと +あな +あなた +あの +あのかた +あの人 +あり +ありました +あります +ある +あれ +い +いい +いう +いく +いくつ +いつ +いま +います +いや +いる +いろいろ +う +うち +え +お +おおまか +おまえ +およそ +および +おり +おります +おれ +か +かく +かたち +かつて +かも +かやの +から +が +がい +がら +き +きき +きた +くせ +ください +くれ +くれる +けど +こうした +ここ +こちら +こっち +こと +この +これ +これから +これら +ご +ごっちゃ +ごと +ごろ +さ +さま +さまざま +さらい +さらに +される +さん +し +しか +しかし +しかた +した +したい +して +しまう +します +しまっ +しよう +しれ +しろ +じゃ +す +すか +すでに +すね +すべて +する +すれ +ず +ずつ +せ +せる +ぜんぶ +そう +そこ +そして +そちら +そっち +そで +その +その他 +その後 +それ +それから +それぞれ +それで +それと +それなり +それに +そんな +た +たい +たくさん +ただ +ただし +たち +たび +ため +たら +たり +だ +だけ +だっ +だめ +だれ +だろ +ちゃ +ちゃん +った +って +つ +ついに +つつ +て +てる +てん +で +でき +できる +でし +でしょ +です +では +でも +と +という +といった +とおり +とか +とき +ところ +として +とって +とともに +となる +とは +とも +と共に +どう +どういう +どこ +どこか +どちら +どっか +どっち +どの +どれ +な +ない +なお +なか +なかっ +なかば +ながら +なく +なけれ +なし +なぜ +なっ +なった +など +なに +なのか +なのに +なら +なり +なる +なん +なんか +に +において +における +について +にて +にとって +によって +により +による +に対し +に対して +に対する +に関して +に関する +ね +の +ので +のに +のみ +は +はじめ +はず +はるか +ば +ばかり +ひと +ひとつ +ふく +ぶり +へ +への +へん +べき +べつ +ぺん +ほう +ほか +ほとんど +ほど +ま +まさ +まし +ましょ +ます +ませ +また +または +まで +まとも +まま +み +みたい +みつ +みなさん +みんな +も +もし +もしくは +もっと +もと +もの +ものの +もん +や +やすい +やっ +やつ +よ +よう +ような +よく +よそ +より +よる +よると +ら +られ +られる +れ +れる +ろ +わ +わけ +わたし +を +を通じて +ん +エラー +カ所 +カ月 +キロ +センチ +ページ +メートル +レ +ヵ所 +ヵ月 +ヶ所 +ヶ月 +・ +ー +一 +一つ +一方 +一覧 +七 +万 +三 +上 +上記 +下 +下記 +中 +九 +事 +二 +五 +人 +今 +今回 +他 +代 +以上 +以下 +以前 +以後 +以降 +会 +伸 +位 +体 +何 +何人 +作 
+作ら +例 +係 +俺 +個 +億 +元 +兆 +先 +全部 +八 +六 +内 +円 +再 +冬 +出 +分 +列 +別 +前 +前回 +力 +化 +匹 +区 +十 +千 +半ば +及び +受け +口 +台 +右 +各 +同 +同じ +名 +名前 +向け +向こう +和 +哀 +品 +員 +喜 +器 +四 +回 +国 +土 +在 +地 +報じ +場 +場合 +境 +士 +夏 +外 +多く +大 +女 +奴 +婦 +子 +字 +安 +官 +室 +家 +対 +小 +屋 +巡る +左 +市 +席 +年 +年生 +幾つ +店 +府 +度 +式 +形 +役 +彼 +彼女 +後 +怒 +思わ +性 +情 +感 +感じ +我々 +所 +手 +手段 +扱い +数 +文 +新た +新着 +方 +方法 +日 +春 +時 +時点 +時間 +更新 +書 +月 +期 +木 +未満 +末 +本 +本当 +村 +束 +枚 +校 +楽 +様 +様々 +次 +歳 +歴 +段 +毎 +毎日 +気 +水 +求め +法 +派 +火 +点 +版 +特に +玉 +用 +男 +町 +界 +略 +百 +的 +目 +相 +県 +確か +示し +社 +私 +私達 +秋 +秒 +第 +等 +箇所 +箇月 +簿 +系 +紀 +約 +結局 +続き +線 +署 +考え +者 +自体 +自分 +行 +行わ +見 +見る +観 +言わ +計 +話 +話し +誌 +語っ +読む +誰 +課 +調べ +論 +貴方 +貴方方 +輪 +近く +述べ +通 +速報 +連 +週 +道 +達 +違い +選 +部 +都 +金 +銭 +開か +間 +関 +関係 +関連 +際 +集 +面 +頃 +類 +首 +高 +! +!? +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +{ +| +} +~ +⦅ +⦆ +。 +「 +」 +、 +・ +¢ +£ +¬ + ̄ +¦ +¥ +₩ +│ +← +↑ +→ +↓ +■ +○ diff --git a/apps/common/src/python/mediawords/languages/lt/lt_stop_words_old.txt b/apps/common/src/python/mediawords/languages/lt/lt_stop_words_old.txt new file mode 100755 index 0000000000..69707d4e8c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/lt/lt_stop_words_old.txt @@ -0,0 +1,133 @@ +# +# This is a stop word list for the Lithuanian language. 
+# +# Sources: +# http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html +# auto-generated sources +# + +a +apie +ar +arba +aš +be +bei +bet +bus +buvo +būti +būtų +d +dabar +dar +darbo +daryti +daug +daugiau +daugiausia +dažnai +dieną +dėl +gali +gauna +gauti +iki +ir +iš +jam +jau +jei +ji +jie +jis +jo +jog +jos +jį +jų +kad +kada +kai +kaip +kam +kartą +kas +kiekvienas +kitas +klausimas +klausti +kovo +kur +kurie +kurios +kuris +kurių +labai +lietuva +lietuvoje +lietuvos +m +man +mano +mažai +mažas +mažiau +mes +metais +metu +metus +metų +mūsų +ne +negali +negu +nei +nes +net +niekada +niekas +nors +nuo +nėra +o +pagal +pasak +pasakė +pat +per +po +prašau +prie +prieš +r +reikia +sakyti +sakė +savo +su +tai +taip +taip pat +tarp +tas +tavo +tačiau +tik +tikrai +to +todėl +tuo +turi +turėjo +už +val +vienas +visi +yra +čia +į +šalia +šalies +šios +žmonių diff --git a/apps/common/src/python/mediawords/languages/nl/nl_stop_words_old.txt b/apps/common/src/python/mediawords/languages/nl/nl_stop_words_old.txt new file mode 100644 index 0000000000..1ee9a2887d --- /dev/null +++ b/apps/common/src/python/mediawords/languages/nl/nl_stop_words_old.txt @@ -0,0 +1,108 @@ +# +# This is a stop word list for the Dutch language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +aan +al +alles +als +altijd +andere +ben +bij +daar +dan +dat +de +der +deze +die +dit +doch +doen +door +dus +een +eens +en +er +ge +geen +geweest +haar +had +heb +hebben +heeft +hem +het +hier +hij +hoe +hun +iemand +iets +ik +in +is +ja +je +kan +kon +kunnen +maar +me +meer +men +met +mij +mijn +moet +na +naar +niet +niets +nog +nu +of +om +omdat +onder +ons +ook +op +over +reeds +te +tegen +toch +toen +tot +u +uit +uw +van +veel +voor +want +waren +was +wat +werd +wezen +wie +wil +worden +wordt +zal +ze +zelf +zich +zij +zijn +zo +zonder +zou diff --git a/apps/common/src/python/mediawords/languages/no/no_stop_words_old.txt b/apps/common/src/python/mediawords/languages/no/no_stop_words_old.txt new file mode 100644 index 0000000000..2fd8a00993 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/no/no_stop_words_old.txt @@ -0,0 +1,183 @@ +# +# This is a stop word list for the Norwegian language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +alle +at +av +bare +begge +ble +blei +bli +blir +blitt +både +båe +da +de +deg +dei +deim +deira +deires +dem +den +denne +der +dere +deres +det +dette +di +din +disse +ditt +du +dykk +dykkar +då +eg +ein +eit +eitt +eller +elles +en +enn +er +et +ett +etter +for +fordi +fra +før +ha +hadde +han +hans +har +hennar +henne +hennes +her +hjå +ho +hoe +honom +hoss +hossen +hun +hva +hvem +hver +hvilke +hvilken +hvis +hvor +hvordan +hvorfor +i +ikke +ikkje +ikkje +ingen +ingi +inkje +inn +inni +ja +jeg +kan +kom +korleis +korso +kun +kunne +kva +kvar +kvarhelst +kven +kvi +kvifor +man +mange +me +med +medan +meg +meget +mellom +men +mi +min +mine +mitt +mot +mykje +ned +no +noe +noen +noka +noko +nokon +nokor +nokre +nå +når +og +også +om +opp +oss +over +på +samme +seg +selv +si +si +sia +sidan +siden +sin +sine +sitt +sjøl +skal +skulle +slik +so +som +som +somme +somt +så +sånn +til +um +upp +ut +uten +var 
+vart +varte +ved +vere +verte +vi +vil +ville +vore +vors +vort +vår +være +være +vært +å diff --git a/apps/common/src/python/mediawords/languages/pt/pt_stop_words_old.txt b/apps/common/src/python/mediawords/languages/pt/pt_stop_words_old.txt new file mode 100644 index 0000000000..d49861eea5 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/pt/pt_stop_words_old.txt @@ -0,0 +1,4062 @@ +# +# This is a "long" stop word list for the Portuguese language. +# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# + +a +a meta +abaixo +abastecimento +aberta +abertas +aberto +abertos +abertura +abraço +abre +abreu +abril +abrir +abriu +absoluta +absolutamente +absurdo +abuso +acaba +acabam +acabar +acabaram +acabou +academia +acaso +aceita +aceitar +aceitou +acertar +acertou +acesso +acha +acham +achar +achei +acho +achou +acidente +acidentes +acima +acompanha +acompanhada +acompanhado +acompanhamento +acompanhar +acompanhou +acontece +acontecem +acontecendo +acontecer +acontecerá +aconteceu +acontecido +acontecimentos +acordo +acredita +acreditam +acreditar +acredito +acrescenta +acrescentou +acumulado +acusado +acusados +acusação +acusações +adequada +adequado +adesão +adianta +adiante +adiantou +administrador +administrar +administrativa +administrativo +administração +admite +admitiu +adolescente +adolescentes +adotar +adoção +adquirir +adultos +adversário +adversários +advogada +advogado +advogados +aeroporto +afastado +afastar +afinal +afirma +afirmam +afirmando +afirmar +afirmou +agenda +agente +agentes +agir +agora +agosto +agradecer +agressão +agricultores +agricultura +agrícola +aguarda +aguardar +agência +agências +ah +ainda +ajuda +ajudam +ajudar +ajudou +ala +alcançar +alega +alegou +alegre +alegria +alemão +alerta +algo +algum +alguma +algumas +alguns +alguém +ali +aliado +aliados +aliança +alimentar +alimentação +alimento +alimentos +aliás +alma +almoço +alta +altas +alteração +alterações +alternativa +alternativas +alto 
+altos +altura +aluguel +aluno +alunos +alves +alvinegro +alvo +além +ama +amanhã +amarelo +ambas +ambientais +ambiental +ambiente +ambos +ameaça +ameaças +americano +americanos +amiga +amigo +amigos +amizade +amor +ampla +ampliar +ampliação +amplo +analisa +analisar +analistas +anda +andamento +andar +animais +animal +animação +aniversário +ano +anos +ante +anterior +anteriores +anteriormente +antes +antiga +antigo +antigos +anual +anuncia +anunciado +anunciar +anunciou +análise +anúncio +ao +aos +aparece +aparecem +aparecer +apareceu +aparecida +aparelho +aparelhos +apartamento +apelo +apenas +apesar +aplicada +aplicado +aplicar +aplicação +apoia +apoiar +apoio +aponta +apontam +apontou +aposentado +aposentadoria +aposentados +aposta +apreensão +aprender +aprendizado +apresenta +apresentada +apresentadas +apresentado +apresentados +apresentam +apresentar +apresentaram +apresentação +apresentações +apresentou +aprovada +aprovado +aprovados +aprovar +aprovação +aproveitar +aproveitou +aproximadamente +apuração +após +aquela +aquelas +aquele +aqueles +aqui +aquilo +aquisição +ar +areia +arena +argumento +argumentos +arma +armado +armas +arrecadação +arroz +arruda +art +arte +artes +artigo +artigos +artilheiro +artista +artistas +as +asfalto +aspecto +aspectos +assaltantes +assalto +assassinato +assembleia +assessor +assessores +assessoria +assim +assinado +assinar +assinatura +assinou +assistente +assistir +assistência +associados +associação +associações +assume +assumir +assumiu +assunto +assuntos +at +atacante +atacar +ataque +ataques +atende +atendendo +atender +atendidas +atendido +atendidos +atendimento +atendimentos +atento +atenção +atinge +atingido +atingir +atingiu +atitude +atitudes +ativa +atividade +atividades +atleta +atletas +ato +ator +atores +atos +atrair +atraso +através +atração +atrações +atriz +atrás +atua +atuais +atual +atualizado +atualmente +atuam +atuando +atuar +atuação +atuou +até +auditório +audiência +aula +aulas +aumenta +aumentando 
+aumentar +aumento +aumentou +ausência +automóveis +automóvel +autonomia +autor +autores +autoria +autoridade +autoridades +autorização +autos +auxiliar +auxílio +avalia +avaliar +avaliação +avaliou +avançar +avanço +avanços +avançou +avenida +avisa +avião +avó +azul +ação +ações +aérea +aí +baiano +baile +bairro +bairros +baixa +baixo +baixos +balanço +bancada +banco +bancos +banda +bandas +bandeira +bandido +bandidos +banheiro +banho +bar +barato +barco +bares +barra +barreiras +barros +barulho +base +baseado +bases +basta +bastante +bastidores +batalha +batalhão +bate +bater +bateria +bateu +beber +bebida +bebidas +bebê +beira +bela +beleza +belo +bem +beneficiar +benefício +benefícios +bens +bernardo +biblioteca +bicicleta +bilhão +bilhões +bloco +blocos +blog +boa +boas +boca +bola +boletim +bolsa +bolsas +bolso +bom +bomba +bombeiros +bonita +bonito +bons +branca +branco +brancos +brasileiras +brasileiro +brasileiros +braço +braços +breve +briga +brilhante +brincadeira +brincar +brinquedos +bruto +buraco +buracos +busca +buscam +buscando +buscar +básica +básicas +básico +cabe +cabeceou +cabelo +cabelos +cabeça +cabo +cachorro +cada +cadastro +cadeia +cadeira +cadeiras +cadê +cai +cair +caiu +caixa +caixas +calendário +calma +calor +calçada +cama +caminhada +caminho +caminhos +caminhão +caminhões +camisa +campanha +campanhas +campeonato +campeão +campeões +campo +campos +campus +cana +canal +candidata +candidato +candidatos +candidatura +candidaturas +canto +cantor +cantora +caos +capa +capacidade +capacitação +capaz +capazes +capitais +capital +capitão +capixaba +capítulo +cara +característica +características +caras +carga +cargo +cargos +carinho +carioca +carne +caro +carreira +carro +carros +carta +cartas +carteira +cartão +cartório +cartões +caruaru +caráter +casa +casado +casal +casamento +casar +casas +caso +casos +cassado +cassação +castelo +catarinense +categoria +categorias +causa +causar +causas +causou +caíram +cd +cedo +celular +cem +cemitério 
+cena +cenas +centenas +cento +centrais +central +centro +centros +cenário +cerca +cerimônia +certa +certamente +certas +certeza +certo +certos +cerveja +chama +chamada +chamado +chamados +chamar +chamou +chance +chances +chapa +chave +chefe +chega +chegada +chegam +chegamos +chegando +chegar +chegaram +chegou +cheguei +cheia +cheio +cheiro +cheque +choque +chute +chutou +chuva +chuvas +chão +ciclo +cidadania +cidade +cidades +cidadão +cidadãos +cientistas +cima +cinco +cinema +circo +circuito +circulação +cirurgia +cita +citado +citar +citou +civil +civis +ciência +ciências +classe +classes +classificação +cliente +clientes +clima +clique +clube +clubes +clássico +clínica +cobertura +cobra +cobrança +cobrar +cobrou +cofres +coisa +coisas +colaboradores +colega +colegas +coleta +coletiva +coletivo +coleção +coligação +coloca +colocada +colocado +colocados +colocando +colocar +colocação +colocou +coloque +colorado +coluna +colunista +colégio +com +comandante +comando +combate +combater +combustível +comecei +comemora +comemorar +comemoração +comemorou +comenta +comentar +comentou +comentário +comentários +comer +comerciais +comercial +comercialização +comerciante +comerciantes +começa +começam +começando +começar +começaram +começo +começou +comida +comigo +comissão +comitê +como +companheiro +companheiros +companhia +companhias +comparação +competente +competição +competições +competência +complementar +completa +completamente +completar +completo +completou +complexo +complicado +compor +comportamento +composição +composta +composto +compra +comprar +compras +compreensão +compromisso +compromissos +comprou +computador +computadores +comum +comunicado +comunicação +comunidade +comunidades +comuns +comércio +conceito +conceitos +concentração +concessão +conclui +concluir +concluiu +conclusão +concorda +concordo +concorrentes +concorrer +concorrência +concreto +concurso +concursos +condenado +condenação +condição +condições +conduta +confederação +conferir 
+conferência +confiança +confira +confirma +confirmado +confirmar +confirmou +conflito +conflitos +conforme +conforto +confronto +confusão +conhece +conhecem +conhecer +conheceu +conhecida +conhecido +conhecidos +conhecimento +conhecimentos +conheço +conjunto +conquista +conquistar +conquistas +conquistou +consciente +conscientização +consciência +consegue +conseguem +consegui +conseguia +conseguimos +conseguir +conseguiram +conseguiu +conselheiro +conselho +conselhos +consenso +conservação +considera +considerada +considerado +considerados +considerando +considerar +consideração +considerou +consigo +consta +constante +constantes +constitucional +construir +construção +construída +construído +consulta +consultas +consultoria +consumidor +consumidores +consumo +consórcio +conta +contam +contando +contar +contará +contas +contato +contatos +conter +contexto +conteúdo +continua +continuam +continuar +continuará +continue +continuidade +continuou +contou +contra +contrapartida +contratado +contratados +contratar +contratação +contratações +contrato +contratos +contribuinte +contribuir +contribuição +controlar +controle +contrário +contudo +convencer +convenção +conversa +conversar +conversas +convidado +convidados +convite +conviver +convivência +convênio +cooperativa +coordenador +coordenadora +coordenação +cor +coragem +coração +cores +coronel +corpo +corpos +corre +correndo +corrente +correr +correta +correto +correção +corrida +cortar +corte +cortes +costas +costuma +costumam +cotidiano +cozinha +credibilidade +creio +cresce +crescendo +crescente +crescer +cresceu +crescimento +cria +criada +criado +criando +crianças +criar +criatividade +criação +crime +crimes +criminal +criminalidade +criminosos +criou +crise +criticar +criticou +critério +critérios +cruzamento +cruzes +cruzou +crédito +créditos +crítica +críticas +crítico +cuidado +cuidados +cuidar +cuja +cujo +culpa +cultura +culturais +cultural +cumprimento +cumprir +cumpriu +cunha +currículo +curso +cursos 
+curto +custa +custo +custos +cá +cães +cérebro +céu +código +cúpula +da +dada +dado +dados +dando +danos +dantas +dança +daquela +daquele +daqueles +daqui +dar +daria +dará +das +data +dava +daí +de +de deus +debaixo +debate +debates +decide +decidir +decidiu +decisão +decisões +declaração +declarações +declarou +decoração +decreto +dedicação +defende +defender +defendeu +defensor +defesa +deficiência +define +definida +definido +definir +definitivamente +definiu +definição +deixa +deixado +deixam +deixando +deixar +deixaram +deixe +deixou +dela +delas +dele +delegacia +delegado +deles +demais +demanda +demandas +demissão +democrático +demonstra +demonstrou +demora +demorou +dentre +dentro +denunciar +denúncia +denúncias +departamento +depende +dependendo +depender +depoimento +depoimentos +depois +deputada +deputado +deputados +der +deram +derrota +derrotado +desafio +desafios +descoberta +descobre +descobrir +descobriu +desconto +desculpas +desde +deseja +desejar +desejo +desembargador +desempenho +desemprego +desenvolver +desenvolvido +desenvolvimento +desespero +desfile +despesas +dessa +dessas +desse +desses +desta +destaca +destacar +destacou +destaque +destaques +destas +deste +destes +destinado +destinados +destino +desvio +desviou +detalhe +detalhes +determina +determinado +determinação +determinou +deu +deve +deve-se +devem +devemos +dever +deveria +deveriam +deverá +deverão +devia +devido +dez +dezembro +dezenas +dia +dia-a-dia +diagnóstico +diante +diariamente +dias +dica +dicas +diferente +diferentes +diferença +diferenças +dificilmente +dificuldade +dificuldades +difíceis +difícil +diga +digital +dignidade +digo +diminuir +diminuição +dinheiro +diploma +direito +direta +diretamente +direto +diretor +diretora +diretores +diretoria +diretório +direção +diria +dirigente +dirigentes +dirigir +disciplina +disco +discurso +discussão +discussões +discutir +disponíveis +disponível +disposição +disposto +disputa +disputar +dispõe +disse +disseram +disso 
+distante +distribuição +distribuídos +distrito +dito +diversas +diversos +divisão +divulgada +divulgado +divulgados +divulgar +divulgação +divulgou +diz +dizem +dizendo +dizer +dizia +diálogo +diária +diárias +diário +do +doação +doações +doce +documentação +documento +documentos +doente +doença +doenças +dois +domingo +domínio +dona +dono +donos +dor +dores +dormir +dos +dose +doutor +duas +duelo +dupla +dura +durante +duração +duro +durou +dutra +dvd +dão +década +décadas +déficit +dívida +dívidas +dólar +dólares +dúvida +dúvidas +e +e-mail +economista +econômica +econômicas +econômico +econômicos +edital +edição +edições +educacional +efeito +efeitos +efetivamente +efetivo +eficiente +eficiência +eis +ela +elaboração +elas +ele +eleger +elegeu +eleita +eleito +eleitor +eleitorado +eleitorais +eleitores +eleitos +eleição +eleições +elementos +elenco +eles +eletrônica +eletrônico +elevado +elevação +elite +elogios +elétrica +em +embora +emenda +emendas +emergência +emissora +emissoras +emissão +emocional +emoção +emoções +empate +empatou +empenho +empreendimento +empreendimentos +empregados +emprego +empregos +empresa +empresarial +empresas +empresário +empresários +empréstimo +empréstimos +encaminhado +encaminhados +encarar +encerramento +encerrou +enchentes +encontra +encontrada +encontrado +encontrados +encontram +encontrar +encontraram +encontro +encontros +encontrou +endereço +energia +enfatizou +enfim +enfrenta +enfrentar +engenharia +engenheiro +enorme +enquanto +ensinar +ensino +entanto +entende +entender +entendeu +entendimento +entendo +entidade +entidades +entra +entrada +entram +entrar +entraram +entre +entrega +entregar +entregou +entregue +entregues +entretanto +entrevista +entrevistados +entrevistas +entrou +então +enviado +enviar +enviou +envolve +envolvendo +envolvido +envolvidos +envolvimento +episódio +equilíbrio +equipamento +equipamentos +equipe +equipes +equivalente +era +eram +errado +erro +erros +escala +escanteio +esclarecer +escola 
+escolar +escolas +escolha +escolher +escolheu +escolhido +esconder +escrever +escreveu +escrita +escrito +escritor +escritório +esforço +esforços +espanhol +espaço +espaços +especiais +especial +especialista +especialistas +especializada +especialmente +específica +específico +espera +esperado +esperamos +esperando +esperança +esperar +esperava +espero +espetáculo +espiritual +esportiva +esportivo +esposa +espécie +espécies +espírito +esquecer +esquema +esquerdo +esquina +essa +essas +esse +essencial +esses +esta +estabelece +estabelecer +estabelecimento +estabelecimentos +estabilidade +estacionamento +estado +estados +estaduais +estadual +estamos +estar +estaria +estariam +estará +estarão +estas +estatal +estatuto +estatística +estatísticas +estava +estavam +estação +este +esteja +estejam +estejamos +estes +esteve +estilo +estimativa +estimular +estive +estivemos +estiver +estivera +estiveram +estiverem +estivermos +estivesse +estivessem +estivéramos +estivéssemos +estou +estrada +estradas +estrangeiros +estranho +estratégia +estratégias +estreia +estrela +estrelas +estrutura +estudar +estudo +estudos +está +estádio +estágio +estávamos +estão +etapa +etapas +etc +etc. 
+eu +evento +eventos +eventual +evidente +evitar +evolução +ex-deputado +ex-governador +ex-prefeito +ex-presidente +exame +exames +exatamente +excelente +excelência +excesso +exceção +exclusivamente +executiva +executivo +execução +exemplo +exemplos +exercer +exercício +exercícios +exige +exigir +exigência +exigências +existe +existem +existentes +existia +existir +existência +expandir +expansão +expectativa +expectativas +experiência +experiências +explica +explicar +explicação +explicações +explicou +exploração +exportações +exposição +expressão +expulso +extensão +exterior +externa +extra +extremamente +exército +face +facilidade +facilitar +facilmente +faculdade +faixa +faixas +fala +falam +falando +falar +falei +falha +falhas +falou +falta +faltam +faltando +faltou +fama +familiar +familiares +famoso +famílias +faria +farmácia +farroupilha +fará +farão +fase +fato +fator +fatores +fatos +faturamento +favor +favorável +faz +fazem +fazemos +fazenda +fazendo +fazer +fazia +faça +façam +faço +febre +fechada +fechado +fechados +fechamento +fechar +fechou +federais +federal +federação +feira +feita +feitas +feito +feitos +felicidade +feliz +felizes +feminina +feminino +fenômeno +feriado +feridos +ferramenta +ferramentas +ferro +festa +festas +festival +fevereiro +fez +fica +ficam +ficamos +ficando +ficar +ficaram +ficaria +ficará +ficarão +ficava +ficha +fico +ficou +fiel +figura +fila +filha +filhas +filho +filhos +filme +filmes +filosofia +fim +finais +final +finalidade +finalizou +finalmente +financeira +financeiras +financeiro +financeiros +financiamento +finanças +fins +fique +fiquei +firme +fiscais +fiscal +fiscalizar +fiscalização +fiz +fizemos +fizeram +fiéis +flagrante +flor +flores +floresta +fluxo +foco +fogo +foi +folha +fomos +fonte +fontes +for +fora +foram +forem +forma +formada +formado +formar +formas +formato +formação +formos +forró +fortalecer +forte +força +forças +fosse +fossem +foto +fotos +fraco +francês +frase +fraude +freitas +frente +frio 
+frisou +fronteira +frota +frutas +fruto +frutos +fuga +fugir +fugiu +fui +funciona +funcionamento +funcionando +funcionar +funcionário +funcionários +fundamentais +fundamental +fundação +fundo +fundos +função +funções +furto +futebol +futsal +futuro +futuros +fábrica +fácil +fãs +fé +férias +física +físicas +físico +fórmula +fórum +fôramos +fôssemos +gabinete +gado +galeria +ganha +ganham +ganhando +ganhar +ganho +ganhos +ganhou +garante +garantia +garantir +garantiu +garota +garoto +gastar +gasto +gastos +gaúcha +gaúchos +general +gente +geografia +gera +gerais +geral +geralmente +gerando +gerar +geração +gerente +gerou +gestor +gestores +gestão +ginásio +global +gol +goleiro +golpe +gols +gosta +gostam +gostaria +gostei +gosto +gostou +governador +governadora +governadores +governantes +governar +governo +governos +gramado +grande +grandes +gratuita +gratuito +grau +grave +graves +graças +grossa +grosso +grupo +grupos +grãos +guarda +guia +gás +gênero +habitantes +habitação +haja +hajam +hajamos +harmonia +havemos +haver +haveria +haverá +havia +haviam +hectares +hei +helena +hipótese +história +histórias +histórica +histórico +hoje +homem +homenagem +homens +homicídio +homicídios +honra +hora +horas +horizonte +horário +horários +hospital +hotel +hotéis +houve +houvemos +houver +houvera +houveram +houverei +houverem +houveremos +houveria +houveriam +houvermos +houverá +houverão +houveríamos +houvesse +houvessem +houvéramos +houvéssemos +hugo +humana +humanidade +humano +humanos +humor +há +hábito +hão +ia +ibope +ida +idade +ideal +identidade +identificado +identificar +identificação +idosos +idéia +idéias +iguais +igual +igualdade +ilegal +ilha +iluminação +imagem +imagens +imaginar +imediata +imediatamente +imediato +impacto +impede +impedir +implantar +implantação +impor +importa +importante +importantes +impossível +imposto +impostos +imprensa +impressão +imóveis +imóvel +inauguração +incentivar +incentivo +inclui +incluindo +inclusive +inclusão +incrível 
+incêndio +indenização +independente +independentemente +independência +indica +indicado +indicar +indicação +individuais +individual +indivíduo +indivíduos +indo +industrial +indícios +indígena +indígenas +indústria +indústrias +infantil +infelizmente +inferior +influência +informa +informado +informar +informação +informações +informou +informática +infra-estrutura +infraestrutura +inglês +ingresso +ingressos +inicia +iniciada +inicial +inicialmente +iniciar +iniciativa +iniciativas +iniciou +inquérito +inscritos +inscrição +inscrições +instalada +instalar +instalação +instalações +institucional +instituição +instituições +instituto +instrumento +instrumentos +integra +integral +integrante +integrantes +integrar +integração +inteira +inteiro +inteligente +inteligência +intensa +intenso +intenção +intenções +inter +interessa +interessados +interessante +interesse +interesses +interior +interna +internacionais +internacional +internado +internet +interno +interpretação +intervalo +intervenção +intuito +invadiu +inverno +investidores +investigar +investigação +investigações +investimento +investimentos +investir +invés +início +inúmeras +inúmeros +ir +iria +irmã +irmão +irmãos +irregular +irregularidades +irá +irão +isso +isto +italiano +item +itens +jamais +janeiro +janela +jantar +jardim +jc +jeito +joga +jogada +jogadas +jogador +jogadores +jogando +jogar +jogo +jogos +jogou +jornada +jornais +jornal +jornalismo +jornalistas +judicial +judiciário +juiz +julgamento +julgar +julho +junho +juntamente +junto +juntos +juros +jurídica +jurídico +justa +justamente +justifica +justificar +justificativa +justiça +justo +juventude +juíza +juízes +juízo +já +km +laboratório +lado +lados +ladrões +lago +lamentável +lance +lança +lançado +lançamento +lançar +lançou +lar +larga +lateral +latina +lazer +leal +legais +legal +legenda +legislativa +legislativo +legislação +lei +leia +leilão +leis +leite +leitor +leitores +leitura +lembra +lembrando +lembrar +lembro +lembrou +ler 
+leste +lesão +letra +letras +leva +levada +levado +levados +levam +levando +levantamento +levantar +levar +levaram +leve +levou +lhe +lhes +li +liberado +liberação +liberdade +licença +licitação +lidar +liderança +lideranças +liga +ligada +ligadas +ligado +ligados +ligar +ligação +ligações +liminar +limite +limites +limpa +limpeza +linda +linguagem +linha +linhas +lista +literatura +litoral +litros +livre +livres +livro +livros +lixo +lição +locais +local +localidade +localizada +localizado +logo +loja +lojas +longa +longe +longo +lua +lucro +lucros +lugar +lugares +luta +lutar +luxo +luz +lá +lê +líder +líderes +língua +líquido +lógica +madeira +madrugada +maio +maior +maiores +maioria +mais +mal +manda +mandado +mandar +mandato +mandatos +mandou +maneira +manhã +manifestação +mano +manter +manteve +mantido +mantém +manutenção +mar +marca +marcada +marcado +marcador +marcar +marcas +marcação +marcou +margem +margens +marido +marinho +marketing +março +mas +masculino +massa +mata +matar +matemática +materiais +material +mato +matou +matriz +matéria +matérias +mau +mauro +maus +me +medalha +mediante +medida +medidas +medo +meia +meio +meio-campo +meios +melhor +melhora +melhorar +melhores +melhoria +melhorias +melhorou +membro +membros +memória +menina +meninas +menino +meninos +menor +menores +menos +mensagem +mensagens +mensais +mensal +mental +mente +mentira +mercado +mercadorias +mercados +merece +merecem +mesa +meses +mesma +mesmas +mesmo +mesmos +mestre +meta +metade +metas +metropolitana +metros +meu +meus +mil +milhares +milho +milhão +milhões +militar +militares +mim +mineiro +minha +minhas +ministra +ministros +minuto +minutos +mirim +missão +mistura +mobilização +moda +modalidade +modelo +modelos +moderna +moderno +modo +moeda +momento +momentos +montagem +montante +montar +monte +mora +moradia +morador +moradora +moradores +morais +moral +moram +morar +morava +moro +morre +morrer +morreram +morreu +morro +morte +mortes +morto +mortos +mostra +mostram 
+mostrando +mostrar +mostrou +motivo +motivos +moto +motor +motoristas +motos +movimentação +movimento +movimentos +moça +muda +mudança +mudanças +mudar +mudou +muita +muitas +muito +muitos +mulher +mulheres +multa +multas +mundial +mundo +municipais +municipal +município +municípios +muro +museu +musical +má +máquina +máquinas +máxima +máximo +mãe +mães +mão +mãos +média +médica +médio +mérito +mês +mídia +mínima +mínimo +mínimos +móveis +móvel +música +músicas +músicos +na +nacionais +nacional +nada +namorada +namorado +naquela +naquele +nas +nasceu +nascido +nascimento +naturais +natural +naturalmente +natureza +nação +nações +necessidade +necessidades +necessita +necessária +necessárias +necessário +necessários +nega +negar +negativa +negativo +negociar +negociação +negociações +negou +negra +negro +negros +negócio +negócios +nela +nele +nem +nenhum +nenhuma +nessa +nessas +nesse +nesses +nesta +neste +nestes +neto +news +ninguém +nisso +no +nobre +noite +noites +nome +nomes +norma +normal +normalmente +normas +norte +nos +nossa +nossas +nosso +nossos +nota +notas +notícia +notícias +nova +novamente +novas +nove +novembro +novidade +novidades +novo +novos +num +numa +nunca +não +né +níveis +nível +nós +núcleo +número +números +o +objetivo +objetivos +objeto +objetos +obra +obras +obrigado +obrigados +obrigação +observa +observar +observou +obter +obteve +ocasião +ocorre +ocorrem +ocorrer +ocorreram +ocorreu +ocorrido +ocorrência +ocorrências +ocupa +ocupar +ocupação +oeste +oferece +oferecem +oferecer +oferecido +oferecidos +oferta +oficiais +oficial +oficialmente +oficina +oficinas +ofício +oito +olha +olhando +olhar +olho +olhos +oliveira +olímpico +onda +onde +ong +online +ontem +operação +operações +opinião +opiniões +oportunidade +oportunidades +optar +opção +opções +ora +ordem +organismo +organizada +organizado +organizar +organização +organizações +orientação +origem +original +orçamento +os +ou +ouro +outra +outras +outro +outros +outubro +ouvi +ouvido 
+ouvidos +ouvir +ouviu +paciente +pacientes +paciência +pacote +padrão +padrões +paga +pagam +pagamento +pagamentos +pagando +pagar +pago +pagos +pagou +pai +paixão +palanque +palavra +palavras +palco +palestra +palestras +palácio +papai +papel +papéis +par +para +parabéns +parada +parado +paralisação +paranaense +parar +parceiro +parceiros +parcela +parceria +parcerias +parece +parecem +parecer +parecia +paredes +parentes +parlamentar +parlamentares +parlamento +parou +parque +parte +partes +participa +participam +participantes +participar +participaram +participação +participou +particular +particulares +partida +partidas +partido +partidos +partidária +partir +partiu +passa +passada +passado +passageiros +passagem +passagens +passam +passando +passar +passaram +passará +passava +passe +passei +passeio +passo +passo fundo +passos +passou +pasta +patamar +patrimônio +pau +paula +paulistas +pauta +pavimentação +paz +país +países +pede +pedido +pedidos +pedindo +pedir +pediu +pedra +pedras +pega +pegar +pegou +peito +peixe +peixes +pela +pelas +pele +pelo +pelos +pena +penal +pensa +pensam +pensamento +pensamentos +pensando +pensar +pensei +penso +pensou +pensão +pequena +pequenas +pequeno +pequenos +perante +percebe +perceber +percebeu +percentual +percurso +perda +perdas +perde +perdendo +perder +perderam +perdeu +perdido +perfeito +perfil +pergunta +perguntar +perguntas +perguntou +perigo +perigoso +permanece +permanecer +permaneceu +permanente +permanência +permite +permitido +permitir +permitiu +perna +pernas +personagem +personagens +personalidade +perspectiva +pertence +perto +período +períodos +pesado +pesca +peso +pesquisa +pesquisadores +pesquisas +pessoa +pessoais +pessoal +pessoalmente +pessoas +peça +peças +piloto +pilotos +pintura +pior +piores +piso +pista +placa +placas +planejamento +planeta +plano +planos +planta +plantas +plantio +plantão +plateia +pleito +plena +pleno +plenário +plástico +pneus +pobre +pobres +pobreza +pode +pode-se +podem 
+podemos +podendo +poder +poderes +poderia +poderiam +poderá +poderão +podia +poeta +pois +policiais +policial +politicamente +polêmica +políticas +político +políticos +ponta +ponte +ponto +pontos +popular +populares +população +por +porque +porta +portal +portanto +portas +porte +porto +português +porém +posicionamento +positiva +positivo +positivos +posição +posições +possa +possam +posse +possibilidade +possibilidades +posso +possuem +possui +possíveis +possível +posteriormente +posto +postos +postura +potencial +pouca +poucas +pouco +poucos +povo +povos +pps +pq +pra +praia +praias +prata +praticamente +praticar +prato +pratos +prazer +prazo +prazos +praça +praças +precisa +precisam +precisamos +precisar +precisava +preciso +precisou +preconceito +preencher +prefeita +prefeitos +prefeituras +prefere +preferiu +preferência +prejudicar +prejuízo +prejuízos +premiação +preocupa +preocupado +preocupar +preocupação +prepara +preparado +preparados +preparar +preparação +presa +presente +presentes +presença +preservar +preservação +presidencial +presidente +presidentes +presidência +preso +presos +pressão +prestar +prestação +presídio +preta +pretende +preto +prevenção +previdência +prevista +previstas +previsto +previstos +previsão +prevê +preço +preços +primavera +primeira +primeiras +primeiro +primeiros +principais +principal +principalmente +princípio +princípios +prioridade +prioridades +prisão +privada +privado +pro +problema +problemas +procedimento +procedimentos +processo +processos +procura +procurado +procurador +procuram +procurando +procurar +procure +procurou +produtividade +produto +produtor +produtores +produtos +produz +produzido +produzir +produção +professor +professora +profissionais +profissional +profissão +profunda +programa +programas +programação +progresso +proibido +projeto +projetos +prol +promessa +promessas +promete +prometeu +promotor +promove +promover +promovido +promoção +pronta +pronto +propaganda +proposta +propostas +propriedade 
+propriedades +proprietário +proprietários +propósito +propõe +proteger +protesto +proteção +prova +provar +provas +provavelmente +providências +provisória +provocar +provocou +provável +proximidades +prudente +prática +práticas +pré-candidato +prédio +prédios +prévia +prêmio +prêmios +própria +próprias +próprio +próprios +próxima +próximas +próximo +próximos +publicada +publicado +publicação +publicidade +pudesse +punição +pura +página +páginas +pátio +pão +pé +pés +pênalti +pólo +pública +públicas +público +públicos +quadra +quadrados +quadrilha +quadro +quadros +quais +qual +qualidade +qualificação +qualquer +quando +quantas +quantia +quantidade +quanto +quantos +quarta +quarta-feira +quarto +quase +quatro +que +quebra +quebrar +queda +queira +quem +quente +quer +querem +queremos +querendo +querer +queria +queriam +querido +quero +questionado +questão +questões +quilos +quilômetros +quinta +quinta-feira +quinto +quis +quiser +rainha +ramo +ranking +rapaz +rapidamente +razão +razões +reais +reajuste +real +realidade +realiza +realizada +realizadas +realizado +realizados +realizando +realizar +realização +realizou +realmente +reação +rebaixamento +recado +recebe +recebem +recebendo +receber +receberam +receberá +recebeu +recebi +recebido +receita +receitas +recente +recentemente +recentes +reclama +reclamar +reclamação +reclamações +reclamou +reconhece +reconhecer +reconhecido +reconhecimento +recorde +recorrer +recuperar +recuperação +recurso +recursos +redação +rede +redes +redonda +redor +reduzir +redução +reeleito +reeleição +refere +referente +referência +reflete +refletir +reflexão +reforma +reformas +reforçar +reforço +regime +regionais +regional +registrada +registrado +registrados +registrar +registro +registros +registrou +região +regiões +regra +regras +regular +rei +reino +reivindicações +relacionados +relacionamento +relacionamentos +relata +relator +relatou +relatório +relação +relações +religioso +remuneração +remédio +remédios +renda +rendimento 
+renovação +repasse +repente +repercussão +repetir +reportagem +representa +representam +representante +representantes +representar +representação +repórter +república +reserva +reservas +resgate +residência +residências +resistência +resolução +resolve +resolver +resolveu +resolvido +respectivamente +respeitar +respeito +responde +responder +respondeu +responsabilidade +responsáveis +responsável +resposta +respostas +ressalta +ressaltar +ressaltou +resta +restante +restaurante +restaurantes +resto +resultado +resultados +retirada +retirar +retornar +retorno +reunir +reuniu +reunião +reuniões +revela +revelou +rever +reverter +revista +revistas +revisão +revolução +reúne +rica +rico +ricos +rio +rio de janeiro +rios +riqueza +risco +riscos +ritmo +rival +rock +rodada +rodadas +rodovia +rodovias +rodoviária +romance +rosto +roteiro +rotina +roubo +roupa +roupas +rua +ruas +rubro-negro +ruim +rumo +rurais +rural +rádio +rápida +rápido +sabe +sabedoria +sabem +sabemos +sabendo +saber +sabia +saco +saem +sai +saia +saiba +saindo +sair +saiu +sala +salarial +salas +saldo +salto +salvar +salário +salários +salão +saneamento +sangue +santista +satisfação +satisfeito +saudade +saudável +saída +saíram +se +secretaria +secretarias +secretário +secretários +sede +segmento +segmentos +segredo +segue +seguem +seguida +seguido +seguindo +seguinte +seguintes +seguir +seguiu +segunda +segunda-feira +segundo +segundos +segura +segurança +segurar +seguro +sei +seis +seja +sejam +sejamos +seleção +sem +semana +semanas +semelhante +semelhantes +semestre +seminário +sempre +senado +senador +senadora +senadores +sendo +senhor +senhora +senhores +sensação +senso +sente +sentença +sentido +sentimento +sentimentos +sentindo +sentir +sentiu +senão +sequer +sequência +ser +serei +serem +seremos +seres +seria +seriam +serve +servidor +servidores +servir +serviu +serviço +serviços +será +serão +seríamos +sessão +sessões +sete +setembro +setor +setores +seu +seus +sexo +sexta +sexta-feira 
+sexual +shopping +show +shows +si +sido +sigilo +sigla +significa +significado +silêncio +sim +simples +simplesmente +sinais +sinal +sinto +sintomas +sistema +sistemas +site +sites +situação +situações +sob +sobe +sobra +sobre +sobretudo +sobrinho +sociais +social +socorro +sofre +sofrem +sofrendo +sofrer +sofreu +sofrimento +sol +soldados +solenidade +solicitação +solicitou +solidariedade +solo +solução +soluções +som +soma +sombra +somente +somos +sonho +sonhos +sono +sorriso +sorte +sorteio +sou +soube +sousa +sozinha +sozinho +sua +suas +subir +subiu +substituir +substituição +sucesso +sucessão +sudeste +suficiente +suficientes +sugere +sugestão +sugestões +sujeito +sul +super +superar +superintendente +superior +superiores +supermercado +superou +suplente +suporte +suposto +supremo +surge +surgiu +surpresa +suspeita +suspeito +suspeitos +suspensão +sábado +sábados +são +século +série +sério +sítio +só +sócios +tabela +tais +tal +talento +talvez +tamanho +também +tanta +tantas +tanto +tantos +taques +tarde +tarefa +tarifa +taxa +taxas +taça +te +teatro +tecnologia +tecnologias +tela +telefone +telefones +televisão +tem +tema +temas +temos +temperatura +tempo +temporada +tempos +tende +tendo +tendência +tenha +tenham +tenhamos +tenho +tenta +tentam +tentando +tentar +tentaram +tentativa +tentou +teoria +ter +terceira +terceiro +terei +terem +teremos +teria +teriam +termina +terminal +terminar +terminou +termo +termos +terra +terras +terreno +terrenos +território +terá +terão +terça +terça-feira +teríamos +tese +tesouro +teste +testemunhas +testes +teto +teu +teus +teve +texto +textos +the +ti +tido +time +times +tinha +tinham +tio +tipo +tipos +tira +tirar +tiro +tiros +tirou +titular +titulares +tive +tivemos +tiver +tivera +tiveram +tiverem +tivermos +tivesse +tivessem +tivéramos +tivéssemos +tocar +tocou +toda +todas +todo +todos +tom +toma +tomada +tomadas +tomando +tomar +tomou +toneladas +toque +torcedor +torcedores +torcida +torna +tornando +tornar 
+torneio +torno +tornou +tornou-se +torres +total +totalmente +trabalha +trabalhador +trabalhadores +trabalham +trabalhando +trabalhar +trabalhava +trabalho +trabalhos +trabalhou +tradicionais +tradicional +tradição +traficantes +tragédia +trajetória +tranquilidade +transferência +transformar +transformação +transformou +transição +transmissão +transparência +transporte +transportes +trata +trata-se +tratado +tratamento +tratar +trave +travessão +traz +trazendo +trazer +trecho +trechos +treinador +treinamento +treino +trem +tribuna +tribunal +tributária +trimestre +trinta +trio +triste +tristeza +troca +trocar +troféu +trouxe +tráfego +trás +três +tu +tua +tuas +tudo +turismo +turistas +turma +tv +twitter +tá +tão +técnica +técnicas +técnico +técnicos +tém +término +tênis +tínhamos +título +títulos +um +uma +umas +unidade +unidades +unidos +unir +universidade +universidades +universitário +universo +união +uns +urbana +urbano +urgência +urnas +usada +usado +usados +usam +usando +usar +usina +usinas +uso +usou +usuário +usuários +utilizada +utilizado +utilizados +utilizar +utilização +vacinação +vaga +vagas +vai +vale +valer +valor +valores +valorizar +valorização +vamos +vantagem +vantagens +vara +variação +vc +vcs +veio +vejo +velha +velho +velhos +velocidade +vem +vemos +vence +vencedor +vencer +venceu +venda +vendas +vender +vendidos +vendo +venha +vento +ver +vera +verba +verbas +verdade +verdadeira +verdadeiro +verdadeiros +verde +vereador +vereadora +vereadores +vergonha +verificar +vermelha +vermelho +versão +verão +vez +vezes +veículo +veículos +vi +via +viagem +viagens +viajar +vias +vice +vice-governador +vice-prefeito +vice-presidente +vida +vidas +vieram +vigor +vila +vinda +vindo +vinha +vinho +vinte +violência +vir +vira +virada +viram +virar +virou +virtude +visa +visando +visita +visitantes +visitar +visitas +vista +visto +visual +visão +vitória +vitórias +viu +viva +vive +vivem +vivemos +vivendo +viver +viveu +vivo +vizinho +vizinhos +você +vocês 
+volante +volta +voltada +voltado +voltam +voltando +voltar +voltaram +voltou +volume +voluntários +vontade +voos +vos +votado +votar +votação +voto +votos +votou +vou +voz +vá +várias +vários +várzea +vão +véspera +vê +vídeo +vídeos +vítima +vítimas +vôo +zagueiro +zero +zona +à +às +água +águas +árbitro +área +áreas +árvore +árvores +época +éramos +êxito +índia +índice +índices +óleo +órgão +órgãos +ótima +ótimo +ônibus +última +últimas +último +últimos +única +único +útil diff --git a/apps/common/src/python/mediawords/languages/ro/ro_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ro/ro_stop_words_old.txt new file mode 100755 index 0000000000..2afa1eb3de --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ro/ro_stop_words_old.txt @@ -0,0 +1,440 @@ +# +# This is a stop word list for the Romanian language. +# +# Source: http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) +# + + # A Romanian stop word list. Comments begin with vertical bar. Each stop + # word is at the start of a line. + + # Many of the forms below are quite rare but included for completeness. 
+ + # ARTICLE + # Indefinite article +o # a +unui +unei +unor +nişte # some + # Demonstrative/adjectival article +cel +cea +cei +cele +celui +celei +celor + # Possessive / genitival article +al # of +a +ai +ale + # PREPOSITION AND ADVERB +pe # on +la # at +în # in +fără # without +sub # under +despre # about +către # to +cu # with +de # from +din # on +lângă # by +pentru # for +peste # over +spre # to +prin # through +dintre # between +printre # among +până # until +după # after +înspre # towards +ca # as + # ADJECTIVE +mai # more +decât # than +cum # how +foarte # very +mult # much +multă +mulţi +multe +puţin # little +puţină +puţini +puţine +destul # enough +destulă +destui +destule + # PRONOUN + # Personal pronoun +eu # I +tu # you +el # he +ea # she +noi # we +voi # you +ei # they +ele # they +mie # me +îmi +mi +mine +mă +m +ţie # you +îţi +ţi +tine +te +lui # him +îl +l +îi +i +nouă # us +ne +ni +vouă # you +vă +vi +v +lor # them +le +li + # Pronoun of politeness +dumneavoastră # you + # Reflexive pronoun +se # himself +îşi +sie +sieşi +sine + # Pronoun of reinforcement +însumi # myself +însămi +însuţi # youself +însăţi +însuşi # himself +însăşi # herself +înşine # ourselves +însene +înşivă # youselves +însevă +înşişi # themselves +înseşi +însele + # Possessive pronoun +meu # mine +mea +mei +mele +tău # yours +ta +tăi +tale +său # his +sa +săi +sale +nostru # ours +noastră +noştri +noastre +vostru # yours +voastră +voştri +voastre + # Demonstrative pronoun +acesta # this +ăsta +aceştia +ăştia +acestuia +ăstuia +acestora +ăstora +aceasta +asta +acestea +astea +acesteia +ăsteia +acest +aceşti +acestui +acestor +această +aceste +acestei +acela # that +ăla +acelui +ăluia +aceia +ăia +acelora +ălora +aceea +aia +acelea +alea +aceleia +ăleia +acel +acei +acelor +acea +acele +acelei +acelaşi # the same +aceiaşi +aceeaşi +aceleaşi +aceluiaşi +aceloraşi +aceleiaşi +celălalt # the other +celuilalt +ceilalţi +celorlalţi +cealaltă +celeilalte +celelalte +celorlalte + # 
Interrogative pronoun +ce # what +cine # who +cui # whom +care # which, what +cărui +cărei +căror +unde # where +când # when + # Indefinite pronoun +cineva # someone +cuiva +altcineva # someone else +altcuiva +oricine # anyone +oricui +orice # anything +unul # one +una +unii +unele +unuia +uneia +unora +altul # other +alta +alţii +altele +alt +altă +alţi +alte +altuia +alteia +altora +altui +altei +altor +vreunul # somebody, some (of them) +vreuna +vreunii +vreunele +vreun +vreo +vreunuia +vreuneia +vreunora +vreunui +vreunei +vreunor +oricare # anyone +oricăruia +oricăreia +oricărora +oricărui +oricărei +oricăror +fiecare # everyone +fiecăruia +fiecăreia +fiecărui +fiecărei +cât # how, how many +câtă +câţi +câte +câtora +câtor +atât # this much +atâta +atâţi +atâţia +atâtea +atâtora +atâtor +oricât # however much +oricâtă +oricâţi +oricâte +oricâtora +oricâtor +câtva # some +câţiva +câteva +câtorva +tot # all +toată +toţi +toate +tuturor +totul +cutare # that +oarecare # some +ceva # something +altceva # something else + # Negative pronoun +nimeni # nobody +nimănui +nimic # nothing + # NUMERAL + # Cardinal numeral +unu # one +doi # two +doua +trei # three +patru # four +cinci # five +şase # six +şapte # seven +opt # eight +noua # nine +zece # ten + # Fractional numeral +doime # half +treime # third +sutime # hundredth + # Collective numeral +amândoi # both +amândouă +amândurora +amânduror +ambii +ambele +ambilor +ambelor + # Multiplicative numeral +îndoit # double +întreit # threefold +însutit # hundred-fold + # Ordinal numeral +întâiul # the first +întâia +primul # former +prima +primii +primele +primului +primei +primilor +primelor + # VERB + # To be +sunt # (I) am +s +eşti # (you) are +este # (he/she) is +e +suntem # (we) are +sunteţi # (you) are +eram # (I) were +erai # (you) were +era # (he) was +eraţi # (you) were +erau # (they) were +fiu # be +fii +fie +fim +fiţi +fi +fiind # being +fost # been + # Auxiliary verb +am # to have - all forms +aţi +au +are 
+avem +aveţi +aveam +aveai +avea +aveaţi +aveau +aş +ar +oi # to will +om +oţi +or +vei +va +vom +veţi +vor + # CONJUNCTION +şi # and +nici # neither +dar # but +însă +iar # and, but, while, again +ci # but, so that +sau # or +ori +deci # so +aşadar +încât # so that +aşa # such +deşi # although +totuşi # though +dacă # if +atunci # then +că # that + # OTHER +nu # no + + # The following is a ranked list (commonest to rarest) of stopwords + # deriving from a large sample of text. + +poate # maybe +ieri # yesterday +mare # big +doar # just +trebuie # must +spus # said +acum # now +putea # can +chiar # even +face # do +astfel # such +pot # can +făcut # done +avut # had +parte # part +spune # says +bine # good +faţă # front +există # exists +încă # still +numai # only +dat # given +asupra # on +aproape # near diff --git a/apps/common/src/python/mediawords/languages/ru/ru_stop_words_old.txt b/apps/common/src/python/mediawords/languages/ru/ru_stop_words_old.txt new file mode 100644 index 0000000000..e4a59dda4c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/ru/ru_stop_words_old.txt @@ -0,0 +1,909 @@ +# +# This is a "short" stop word list for the Russian language. 
+# + +adriver +amp +bin +cgi +href +html +http +link +livejournal +quot +rnd +sid +style +www +а +А +августа +акций +Александр +Александра +Алексей +Анатолий +Андрей +АО +апрель +апреля +Ассошиэйтед +Б +без +Без +блог +более +Более +больше +большинство +большой +Борис +будет +будто +будут +бы +бывшего +бывший +был +была +были +было +быстро +быть +в +В +вам +вас +ваш +вдруг +ведь +Ведь +века +вести +весь +весьма +взгляд +взять +виде +видимо +Виктор +вице +включая +Владимира +власть +вместе +вместо +внимание +вновь +во +Во +вовсе +воды +возможно +возможности +возможность +войск +вокруг +вообще +вопрос +вопросы +воскресение +вот +Вот +впервые +вполне +Впрочем +времена +времени +время +вроде +вряд +все +Все +всегда +всего +всей +всем +всему +всех +встречи +всю +вся +всё +вторая +второй +вы +Вы +выборах +выше +выяснилось +г +Г +где +глава +главе +главного +главное +главный +главным +главы +го +говорил +говорит +говорится +говорить +говоря +говорят +год +года +году +годы +город +города +городе +градусов +Грозном +группа +группы +д +Д +Да +да +давно +дает +даже +Даже +дал +далеко +дальше +данным +дать +два +две +движения +двух +действий +действительно +действия +декабря +дел +дела +делаем +делам +делать +деле +дело +Дело +делу +день +деньги +десять +деятельности +деятельность +директора +для +Для +дней +дни +дня +днях +до +До +довольно +документы +долго +должен +должна +должно +должны +дом +дома +доме +достаточно +друг +друга +другие +другим +других +другое +другой +другом +е +Е +его +Его +едва +ее +ей +ему +если +Если +естественно +есть +Есть +еще +Еще +же +женщин +женщины +жизни +жизнь +жителей +жить +за +За +завода +закон +зам +заместитель +затем +заявил +заявление +здесь +Здесь +земли +знает +значит +знаю +знают +зрения +и +И +игры +идет +из +Из +Известий +Известия +Известиям +известно +или +Иллюстрация +им +имеет +имени +именно +Именно +иметь +имеют +имя +иначе +интервью +интересы +информацию +история +ИТАР +итоге +их +Их +июля +июня +й +к +К +каждая +каждого +каждый 
+кажется +как +Как +какие +каким +каких +какой +касается +качестве +квартиры +километров +когда +Когда +кого +количество +команда +команды +комиссии +комитета +комментариев +компания +кому +конечно +Конечно +конференции +конца +конце +коп +корреспонденту +которая +которого +которое +которой +котором +которому +которую +которые +который +которым +которыми +которых +края +Кроме +кроме +крупных +кстате +Кстати +кстати +кто +Кто +куда +Л +легко +лет +летний +ли +либо +лидер +лиц +лица +лично +лишь +лучше +любая +любой +людей +люди +людям +М +м +мало +марта +массовой +мая +между +Между +мене +менее +меньше +меня +мере +меры +места +месте +местных +место +месяц +месяца +месяцев +метра +метров +миллиарда +миллион +миллиона +минут +мире +мировой +Михаил +мне +Мне +многие +Многие +многих +много +мог +могла +могли +могут +может +Может +можно +Можно +мой +момент +мы +Мы +Н +на +На +НА +над +надо +назад +наиболее +найти +наконец +нам +например +народа +нас +находившегося +находится +начала +начале +начальник +начальника +наш +наша +нашего +нашей +наши +наших +не +Не +НЕ +невозможно +него +недавно +недели +неделю +нее +ней +некоторого +некоторые +некоторых +нельзя +нем +немало +нему +необходимо +нескольких +несколько +несмотря +нет +Нет +ни +Ни +нибудь +никак +никаких +никогда +Николай +никто +ним +ними +них +ничего +но +Но +нового +новой +новостей +новые +новый +новых +ноября +Ну +нужно +ныне +нынешнего +Нью +о +О +об +Об +области +образом +обычно +один +Один +одна +Однако +однако +одним +одно +одновременно +одного +одной +одном +одну +оказалась +оказались +оказалось +оказался +около +октября +он +Он +она +Она +они +Они +оно +операции +опыт +опять +органы +основном +особенно +остается +от +От +ответ +отдела +отличие +отношении +отношения +очень +очередной +очередь +П +партия +первая +первого +первой +первую +первые +первый +первым +первых +перед +период +письмо +площади +по +По +поводу +под +подобная +позиции +пока +Пока +политики +полностью +положение +полтора +получил 
+получили +получить +помощи +помощь +помощью +понять +пор +порядке +посколько +поскольку +после +После +последнее +последние +последний +последних +пост +постоянно +потом +Потом +потому +похоже +почему +Почему +почта +почти +Поэтому +поэтому +права +Правда +правда +правило +право +практически +предприятий +предприятия +председателя +представителей +представители +прежде +прежнему +премьер +премьера +Пресс +пресс +при +При +придется +примерно +примеру +принять +приходится +Причем +пришлось +про +проблем +проблема +проблемы +провести +продукции +проект +производства +производство +произошло +происходит +прокуратуры +просто +против +процента +процентов +процесс +прошла +прошлого +прошлом +прямо +пути +путь +пятая +пяти +пять +работа +работавшую +работает +работать +работе +работу +равно +раз +раза +развития +разных +района +районе +ранее +раньше +резко +результате +Рейтер +речь +решения +решил +решили +рода +роль +руб +рук +руках +руки +руководителей +руководитель +руководство +ряд +рядом +с +С +сам +сама +сами +самого +самое +самой +самом +самые +самый +самым +самых +сборной +свет +свое +своего +своей +своем +своему +свои +своим +своими +своих +свой +свою +связи +сделаем +сделал +сделать +себе +себя +сегодня +Сегодня +сейчас +Сейчас +семьи +сентября +Сергей +Сергея +силу +силы +система +системы +ситуации +ситуацию +ситуация +сих +скажем +сказал +сказать +сколько +скорее +следует +слишком +слова +словам +случае +случай +смерти +снова +со +собой +собственности +событий +события +совершенно +совсем +создать +сообща +сообщил +сообщили +состоянии +сотрудники +сотрудников +специалистов +специалисты +сразу +среди +Среди +средств +средства +срок +ссылка +стал +стала +стали +стало +станет +становится +стате +стать +степени +сто +стоит +столице +столь +столько +сторону +стороны +суббота +суда +сумму +сути +существует +счет +считает +считать +считают +т +так +Так +такая +также +таки +такие +таким +Таким +таких +такого +такое +такой +там +Там +ТАСС +те +театра +тебе +тем +Тем 
+теперь +Теперь +территории +тех +течение +то +То +тогда +Тогда +того +тоже +той +только +Только +том +тому +тонн +тот +точки +точнее +трех +три +труда +трудно +туда +тут +ты +тысяч +тысячи +у +У +удалось +уж +уже +Уже +уровень +уровне +условия +условиях +утверждает +утверждают +участие +участников +факт +февраля +фирм +фирма +фирмы +фонда +Фото +х +ходе +хорошо +хоть +хотя +Хотя +хочет +целом +центр +центра +центре +цены +час +часа +часов +части +частности +часто +часть +чаще +чего +человек +человека +чем +через +Через +четыре +четырех +числе +число +членов +что +Что +чтобы +Чтобы +чуть +шесть +эта +Эта +эти +Эти +этим +этих +это +Это +этого +этой +этом +этому +этот +Этот +эту +Ю +Юрий +я +Я +являетесь +является +явно +якобы +января +ясно diff --git a/apps/common/src/python/mediawords/languages/sv/sv_stop_words_old.txt b/apps/common/src/python/mediawords/languages/sv/sv_stop_words_old.txt new file mode 100644 index 0000000000..0629e2deb2 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/sv/sv_stop_words_old.txt @@ -0,0 +1,153 @@ +# +# This is a stop word list for the Swedish language. 
+# +# Sources: +# http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ +# that one Swedish journalist +# + +alla +allt +att +av +blev +bli +blir +blivit +borde +båda +de +dem +den +denna +dens +deras +dess +dessa +det +detta +dig +din +dina +ditt +dom +du +där +då +efter +eftersom +egen +ej +eller +en +er +era +ert +ett +fanns +finns +från +få +för +före +genom +gjorde +gjort +göra +ha +hade +hade +han +han +hans +hans +har +hen +henne +hennes +hon +honom +hur +här +i +i +icke +igen +ingen +innan +inom +inte +jag +ju +kan +kunde +kunnat +lite +man +med +mellan +men +mig +min +min +mina +mitt +mot +mycket +nere +ni +nu +när +någon +något +några +och +om +oss +på +samma +sedan +sen +sig +sin +sina +sitta +själv +skulle +som +så +sådan +sådana +sådant +sån +till +till +tills +under +upp +ut +utan +ute +vad +var +vara +varför +varit +varje +vars +vart +vem +vi +vid +vilka +vilkas +vilken +vilket +vår +våra +vårat +vårt +än +är +åt +över diff --git a/apps/common/src/python/mediawords/languages/tr/tr_stop_words_old.txt b/apps/common/src/python/mediawords/languages/tr/tr_stop_words_old.txt new file mode 100755 index 0000000000..2418be327c --- /dev/null +++ b/apps/common/src/python/mediawords/languages/tr/tr_stop_words_old.txt @@ -0,0 +1,269 @@ +# +# This is a stop word list for the Turkish language. 
+# +# Sources: +# http://nlp.ceng.fatih.edu.tr/blog/?p=101 +# http://www.ranks.nl/stopwords/turkish.html +# + +a +acaba +altmýþ +altý +altı +ama +ancak +artık +asla +aslında +az +b +bana +bazen +bazý +bazı +bazıları +bazısı +belki +ben +benden +beni +benim +beþ +beş +bile +bin +bir +biri +birisi +birkaç +birkaçı +birkez +birçok +birçokları +birçoğu +birþey +birþeyi +birşey +birşeyi +biz +bizden +bize +bizi +bizim +bu +buna +bunda +bundan +bunu +bunun +burada +böyle +böylece +bütün +c +d +da +daha +dahi +de +defa +demek +değil +diye +diğer +diğeri +diğerleri +doksan +dokuz +dolayı +dört +e +elbette +elli +en +en gibi +f +fakat +falan +felan +filan +g +gene +gibi +h +hangi +hangisi +hani +hatta +hem +henüz +hep +hepsi +hepsine +hepsini +her +her biri +herkes +herkese +herkesi +hiç +hiç kimse +hiçbiri +hiçbirine +hiçbirini +hâlâ +i +iki +ile +INSERmi +ise +için +içinde +işte +j +k +kadar +katrilyon +kaç +kendi +kendine +kendini +kez +ki +kim +kimden +kime +kimi +kimin +kimisi +kýrk +l +m +madem +mi +milyar +milyon +mu +mü +mý +mı +n +nasýl +nasıl +ne +ne kadar +ne zaman +neden +nedir +nerde +nerede +nereden +nereye +nesi +neyse +niye +niçin +o +on +ona +ondan +onlar +onlara +onlardan +onlari +onlarýn +onların +onu +onu otuz +onun +orada +oysa +oysaki +p +r +rağmen +s +sana +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +size +sizi +sizin +son +sonra +t +tabi +tamam +trilyon +tüm +tümü +u +v +var +ve +veya +veyahut +y +ya +ya da +yani +yedi +yerine +yetmiþ +yine +yirmi +yoksa +yüz +z +zaten +zira +ç +çok +çoğu +çoğuna +çoğunu +çünkü +ö +öbürü +ön +önce +ötürü +öyle +ü +üzere +üç +þey +þeyden +þeyi +þeyler +þu +þuna +þunda +þundan +þunu +ğ +ı +ş +şayet +şey +şeyden +şeye +şeyi +şeyler +şimdi +şu +şuna +şunda +şundan +şunlar +şunu +şunun +şöyle diff --git a/apps/common/src/python/mediawords/languages/zh/__init__.py b/apps/common/src/python/mediawords/languages/zh/__init__.py index 909365b9dd..eb011d37be 100644 --- 
a/apps/common/src/python/mediawords/languages/zh/__init__.py +++ b/apps/common/src/python/mediawords/languages/zh/__init__.py @@ -32,6 +32,9 @@ class ChineseLanguage(StopWordsFromFileMixIn): # Stop words map '__stop_words_map', + # FIXME remove once stopword comparison is over + '__stop_words_old_map', + # Jieba instance '__jieba', diff --git a/apps/common/src/python/mediawords/languages/zh/zh_stop_words_old.txt b/apps/common/src/python/mediawords/languages/zh/zh_stop_words_old.txt new file mode 100644 index 0000000000..3eb0376f33 --- /dev/null +++ b/apps/common/src/python/mediawords/languages/zh/zh_stop_words_old.txt @@ -0,0 +1,2727 @@ +# Appended Traditional Chinese characters (Note: This does not include all stopwords in Cantonese or Taiwanese Mandarin) +# Sources: +# http://blog.csdn.net/shijiebei2009/article/details/39696571 +# http://github.com/stopwords-iso/stopwords-zh +! +" +# +$ +% +& +( +) +* ++ +, +- +-- +. +.. +... +...... +................... +./ +.一 +.数 +.數 +.日 +/ +// +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +:// +:: +; +< += +> +>> +? +@ +[ +\ +] +^ +_ +` +A +exp +Lex +sub +sup +| +} +~ +~~~~ +· +× +××× +γ +Δ +μ +φ +φ. +Ψ +В +— +—— +——— +‘ +’ +’‘ +“ +” +”, +… +…… +…………………………………………………③ +′∈ +′| +℃ +Ⅲ +↑ +→ +∈[ +∪φ∈ +≈ +① +② +②c +③ +③] +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +── +■ +▲ +  +、 +。 +〈 +〉 +《 +》 +》), +「 +」 +『 +』 +【 +】 +〔 +〕 +〕〔 +㈧ +一 +一. 
+一一 +一下 +一个 +一些 +一何 +一來 +一個 +一切 +一则 +一则通过 +一則 +一則通過 +一天 +一定 +一方面 +一旦 +一时 +一時 +一来 +一样 +一樣 +一次 +一片 +一番 +一直 +一致 +一般 +一起 +一轉眼 +一转眼 +一边 +一邊 +一面 +七 +万一 +三 +三天两头 +三天兩頭 +三番两次 +三番五次 +三番兩次 +上 +上下 +上來 +上升 +上去 +上来 +上述 +上面 +下 +下來 +下列 +下去 +下来 +下面 +不 +不一 +不下 +不久 +不了 +不亦乐乎 +不亦樂乎 +不仅 +不仅...而且 +不仅仅 +不仅仅是 +不会 +不但 +不但...而且 +不僅 +不僅...而且 +不僅僅 +不僅僅是 +不光 +不免 +不再 +不力 +不勝 +不单 +不变 +不只 +不可 +不可开交 +不可抗拒 +不可開交 +不同 +不問 +不單 +不外 +不外乎 +不够 +不夠 +不大 +不如 +不妨 +不定 +不对 +不對 +不少 +不尽 +不尽然 +不巧 +不已 +不常 +不得 +不得不 +不得了 +不得已 +不必 +不怎么 +不怎麼 +不怕 +不惟 +不成 +不拘 +不择手段 +不擇手段 +不敢 +不料 +不断 +不斷 +不日 +不时 +不是 +不時 +不曾 +不會 +不止 +不止一次 +不比 +不消 +不满 +不滿 +不然 +不然的話 +不然的话 +不特 +不独 +不獨 +不由得 +不盡 +不盡然 +不知不覺 +不知不觉 +不管 +不管怎样 +不管怎樣 +不經意 +不经意 +不胜 +不能 +不能不 +不至于 +不至於 +不若 +不要 +不論 +不變 +不论 +不起 +不足 +不过 +不迭 +不過 +不问 +不限 +与 +与其 +与其说 +与否 +与此同时 +专门 +且 +且不說 +且不说 +且說 +且说 +两者 +严格 +严重 +並 +並不 +並不是 +並且 +並排 +並沒 +並沒有 +並無 +並肩 +並非 +个 +个人 +个别 +中 +中小 +中間 +中间 +丰富 +串行 +临 +临到 +为 +为主 +为了 +为什么 +为什麽 +为何 +为止 +为此 +为着 +主张 +主張 +主要 +举凡 +举行 +乃 +乃至 +乃至于 +乃至於 +么 +之 +之一 +之前 +之后 +之後 +之所以 +之类 +之類 +乌乎 +乎 +乒 +乘 +乘势 +乘勝 +乘勢 +乘机 +乘機 +乘胜 +乘虚 +乘虛 +乘隙 +九 +也 +也好 +也就是說 +也就是说 +也是 +也罢 +也罷 +了 +了解 +争取 +二 +二來 +二来 +二話不說 +二話沒說 +二话不说 +二话没说 +于 +于是 +于是乎 +云云 +云尔 +云爾 +互 +互相 +五 +些 +交口 +亦 +产生 +亲口 +亲手 +亲眼 +亲自 +亲身 +人 +人人 +人们 +人們 +人家 +人民 +什么 +什么样 +什麼 +什麼樣 +什麽 +仅 +仅仅 +今 +今后 +今天 +今年 +今後 +今日 +今次 +介于 +介於 +仍 +仍旧 +仍然 +仍舊 +从 +从不 +从严 +从中 +从事 +从今以后 +从优 +从古到今 +从古至今 +从头 +从宽 +从小 +从新 +从无到有 +从早到晚 +从未 +从来 +从此 +从此以后 +从而 +从轻 +从速 +从重 +他 +他人 +他们 +他們 +他是 +他的 +代替 +令 +以 +以上 +以下 +以为 +以來 +以便 +以免 +以前 +以及 +以后 +以外 +以後 +以故 +以期 +以来 +以為 +以至 +以至于 +以至於 +以致 +们 +任 +任何 +任凭 +任务 +任務 +任憑 +企图 +企圖 +伙同 +会 +伟大 +传 +传说 +传闻 +似乎 +似的 +但 +但凡 +但愿 +但是 +但願 +何 +何乐而不为 +何以 +何况 +何嘗 +何处 +何妨 +何尝 +何必 +何时 +何時 +何樂而不為 +何止 +何況 +何苦 +何處 +何須 +何须 +余外 +作为 +作為 +作爲 +你 +你们 +你們 +你是 +你的 +佢 +使 +使得 +使用 +來 +來不及 +來得及 +來看 +來着 +來自 +來著 +來說 +來講 +例如 +依 +依据 +依據 +依照 +依靠 +便 +便于 +便於 +係 +促进 +促進 +保持 +保管 +保险 +保險 +俺 +俺们 +俺們 +個 +個人 +個別 +倍加 +倍感 +們 +倒不如 +倒不如說 +倒不如说 +倒是 +倘 +倘使 +倘或 +倘然 +倘若 +借 +借以 +借此 +假使 +假如 +假若 +偉大 +偏偏 +做到 +偶尔 +偶爾 +偶而 +傥然 +傳 +傳聞 +傳說 +僅 +僅僅 +像 +儘 +儘早 +儘管 +儘管如此 +儻然 +儿 +允許 +允许 +元/吨 +元/噸 
+充其极 +充其極 +充其量 +充分 +先不先 +先后 +先後 +先生 +光 +光是 +兒 +內 +全体 +全力 +全年 +全然 +全身心 +全部 +全都 +全面 +全體 +兩者 +八 +八成 +公然 +六 +兮 +共 +共同 +共总 +共總 +关于 +其 +其一 +其中 +其二 +其他 +其余 +其后 +其它 +其实 +其實 +其後 +其次 +其餘 +具体 +具体地说 +具体来说 +具体说来 +具有 +具體 +具體來說 +具體來説 +具體地說 +具體說來 +兼之 +内 +再 +再其次 +再则 +再則 +再有 +再次 +再者 +再者說 +再者说 +再說 +再説 +再说 +冒 +冲 +决不 +决定 +决非 +况且 +准备 +凑巧 +凝神 +几 +几乎 +几度 +几时 +几番 +几经 +凡 +凡是 +凭 +凭借 +出 +出于 +出來 +出去 +出於 +出来 +出现 +出現 +分別 +分别 +分头 +分期 +分期分批 +分頭 +切 +切不可 +切切 +切勿 +切莫 +则 +则甚 +刚 +刚好 +刚巧 +刚才 +初 +別 +別人 +別是 +別的 +別管 +別處 +別說 +別説 +别 +别人 +别处 +别是 +别的 +别管 +别说 +到 +到了儿 +到了兒 +到处 +到头 +到头来 +到底 +到目前为止 +到目前為止 +到處 +到頭 +到頭來 +則 +則甚 +前后 +前後 +前此 +前者 +前进 +前進 +前面 +剛 +剛好 +剛巧 +剛才 +加上 +加之 +加以 +加入 +加強 +加强 +动不动 +动辄 +勃然 +動不動 +動輒 +匆匆 +十分 +千 +千万 +千万千万 +千萬 +千萬千萬 +半 +单 +单单 +单纯 +即 +即令 +即使 +即便 +即刻 +即如 +即将 +即將 +即或 +即是說 +即是说 +即若 +却 +却不 +卻 +卻不 +历 +原來 +原来 +去 +又 +又及 +及 +及其 +及时 +及時 +及至 +双方 +反之 +反之亦然 +反之则 +反之則 +反倒 +反倒是 +反应 +反應 +反手 +反映 +反而 +反过来 +反过来说 +反過來 +反過來說 +反過來説 +取得 +取道 +受到 +变成 +古來 +古来 +另 +另一个 +另一個 +另一方面 +另外 +另悉 +另方面 +另行 +只 +只当 +只怕 +只是 +只有 +只消 +只當 +只要 +只限 +叫 +叫做 +召开 +召開 +叮咚 +叮噹 +叮当 +可 +可以 +可好 +可是 +可能 +可見 +可见 +各 +各个 +各人 +各位 +各個 +各地 +各式 +各种 +各種 +各級 +各级 +各自 +合理 +同 +同一 +同时 +同時 +同样 +同樣 +后 +后来 +后者 +后面 +向 +向使 +向着 +向著 +吓 +吗 +否则 +否則 +吧 +吧哒 +吧噠 +吱 +吶 +呀 +呃 +呆呆地 +呐 +呕 +呗 +呜 +呜呼 +呢 +周围 +周圍 +呵 +呵呵 +呸 +呼哧 +呼啦 +咁 +咋 +和 +咚 +咦 +咧 +咱 +咱们 +咱們 +咳 +哇 +哈 +哈哈 +哉 +哎 +哎呀 +哎哟 +哎喲 +哗 +哗啦 +哟 +哦 +哩 +哪 +哪个 +哪些 +哪個 +哪儿 +哪兒 +哪天 +哪年 +哪怕 +哪样 +哪樣 +哪裏 +哪裡 +哪边 +哪邊 +哪里 +哼 +哼唷 +唄 +唉 +唔 +唯有 +啊 +啊呀 +啊哈 +啊哟 +啊喲 +問題 +啐 +啥 +啦 +啪达 +啪達 +啷噹 +啷当 +喀 +喂 +喏 +喔唷 +單 +單單 +單純 +喲 +喽 +嗎 +嗚 +嗚呼 +嗡 +嗡嗡 +嗬 +嗯 +嗳 +嘅 +嘍 +嘎 +嘎嘎 +嘎登 +嘔 +嘘 +嘛 +嘩 +嘩啦 +嘻 +嘿 +嘿嘿 +噓 +噯 +嚇 +嚴格 +嚴重 +四 +因 +因为 +因了 +因此 +因為 +因爲 +因着 +因而 +因著 +固 +固然 +在 +在下 +在于 +在於 +地 +均 +坚决 +坚持 +基于 +基於 +基本 +基本上 +堅持 +堅決 +報導 +報道 +处在 +处处 +处理 +复杂 +多 +多么 +多亏 +多多 +多多少少 +多多益善 +多少 +多年來 +多年前 +多年来 +多数 +多數 +多次 +多虧 +多麼 +够瞧的 +夠瞧的 +夥同 +大 +大不了 +大举 +大事 +大体 +大体上 +大凡 +大力 +大多 +大多数 +大多數 +大大 +大家 +大张旗鼓 +大張旗鼓 +大批 +大抵 +大概 +大略 +大約 +大约 +大致 +大舉 +大都 +大量 +大面儿上 +大面兒上 +大體 +大體上 +失去 +奇 +奈 +奋勇 +奮勇 +她 +她们 +她們 +她是 +她的 +好 +好像 +好在 +好的 +好象 +如 +如上 +如上所述 +如下 +如今 +如何 +如其 +如前所述 +如同 +如常 +如是 
+如期 +如果 +如次 +如此 +如此等等 +如若 +始而 +姑且 +存在 +存心 +孰料 +孰知 +宁 +宁可 +宁愿 +宁肯 +它 +它们 +它们的 +它們 +它們的 +它是 +它的 +安全 +完全 +完成 +定 +实现 +实际 +宣布 +容易 +密切 +實現 +實際 +寧 +寧可 +寧肯 +寧願 +对 +对于 +对应 +对待 +对方 +对比 +将 +将才 +将要 +将近 +將 +將才 +將要 +將近 +專門 +對 +對待 +對應 +對方 +對於 +對比 +小 +少数 +少數 +尔 +尔后 +尔尔 +尔等 +尚且 +尤其 +就 +就地 +就是 +就是了 +就是說 +就是説 +就是说 +就此 +就算 +就要 +尽 +尽可能 +尽如人意 +尽心尽力 +尽心竭力 +尽快 +尽早 +尽然 +尽管 +尽管如此 +尽量 +局外 +居然 +屆時 +届时 +属于 +屡 +屡屡 +屡次 +屡次三番 +屢 +屢屢 +屢次 +屢次三番 +屬於 +岂 +岂但 +岂止 +岂非 +川流不息 +左右 +巨大 +巩固 +差一点 +差一點 +差不多 +己 +已 +已矣 +已經 +已经 +巴 +巴巴 +带 +帮助 +帶 +常 +常常 +常言說 +常言說得好 +常言说 +常言说得好 +常言道 +幫助 +平素 +年 +年复一年 +年復一年 +并 +并不 +并不是 +并且 +并排 +并无 +并没 +并没有 +并肩 +并非 +幾 +幾乎 +幾度 +幾時 +幾番 +幾經 +广大 +广泛 +应当 +应用 +应该 +庶乎 +庶几 +庶幾 +廣大 +廣泛 +开外 +开始 +开展 +引起 +弗 +強烈 +強調 +弹指之间 +强烈 +强调 +彈指之間 +归 +归根到底 +归根结底 +归齐 +当 +当下 +当中 +当儿 +当前 +当即 +当口儿 +当地 +当场 +当头 +当庭 +当时 +当然 +当真 +当着 +形成 +彻夜 +彻底 +彼 +彼时 +彼時 +彼此 +往 +往往 +待 +待到 +很 +很多 +很少 +後 +後來 +後来 +後者 +後面 +得 +得了 +得出 +得到 +得天独厚 +得天獨厚 +得起 +從 +從不 +從中 +從事 +從今以後 +從來 +從優 +從古到今 +從古至今 +從嚴 +從寬 +從小 +從新 +從早到晚 +從未 +從此 +從此以後 +從無到有 +從而 +從輕 +從速 +從重 +從頭 +徹夜 +徹底 +心裡 +心里 +必 +必定 +必将 +必將 +必然 +必要 +必須 +必须 +快 +快要 +忽地 +忽然 +怎 +怎么 +怎么办 +怎么样 +怎奈 +怎样 +怎樣 +怎麼 +怎麼樣 +怎麼辦 +怎麽 +怕 +急匆匆 +怪 +怪不得 +总之 +总是 +总的来看 +总的来说 +总的说来 +总结 +总而言之 +恍然 +恐怕 +恰似 +恰好 +恰如 +恰巧 +恰恰 +恰恰相反 +恰逢 +您 +您们 +您們 +您是 +惟其 +惯常 +意思 +愤然 +愿意 +慢說 +慢説 +慢说 +慣常 +憑 +憑藉 +憤然 +應用 +應當 +應該 +成为 +成年 +成年累月 +成心 +成為 +我 +我们 +我們 +我是 +我的 +或 +或则 +或則 +或多或少 +或是 +或曰 +或者 +或許 +或许 +战斗 +截然 +截至 +戰鬥 +所 +所以 +所在 +所幸 +所有 +所謂 +所谓 +才 +才能 +扑通 +打 +打从 +打开天窗说亮话 +打從 +打開天窗說亮話 +扩大 +把 +抑或 +报导 +报道 +抽冷子 +拦腰 +拿 +指 +指出 +按 +按时 +按時 +按期 +按照 +按理 +按說 +按说 +挨个 +挨個 +挨家挨戶 +挨家挨户 +挨次 +挨着 +挨著 +挨門挨戶 +挨門逐戶 +挨门挨户 +挨门逐户 +换句话说 +换言之 +据 +据实 +据悉 +据我所知 +据此 +据称 +据说 +掌握 +採取 +接下來 +接下来 +接着 +接著 +接连不断 +接連不斷 +換句話說 +換句話説 +換言之 +撲通 +據 +據實 +據悉 +據我所知 +據此 +據稱 +據說 +擴大 +攔腰 +放量 +故 +故意 +故此 +故而 +敞开儿 +敞開兒 +敢 +敢于 +敢情 +敢於 +数/ +整个 +整個 +數/ +断然 +斷然 +方 +方便 +方才 +方能 +方面 +於 +於是 +於是乎 +旁人 +无 +无宁 +无法 +无论 +既 +既...又 +既往 +既是 +既然 +日 +日复一日 +日復一日 +日渐 +日漸 +日益 +日臻 +日見 +日见 +时 +时候 +昂然 +明显 +明确 +明確 +明顯 +是 +是不是 +是以 +是否 +是的 +显然 +显著 +時 +時候 +普通 +普遍 +暗中 +暗地裡 +暗地里 +暗自 +更 +更为 +更加 +更為 +更进一步 +更進一步 +曾 +曾經 +曾经 +替 
+替代 +最 +最后 +最大 +最好 +最後 +最近 +最高 +會 +月 +有 +有些 +有关 +有利 +有力 +有及 +有所 +有效 +有时 +有時 +有点 +有的 +有的是 +有着 +有著 +有關 +有點 +望 +朝 +朝着 +朝著 +末##末 +本 +本人 +本地 +本着 +本著 +本身 +权时 +来 +来不及 +来得及 +来看 +来着 +来自 +来讲 +来说 +极 +极为 +极了 +极其 +极力 +极大 +极度 +极端 +构成 +果然 +果真 +某 +某个 +某些 +某個 +某某 +根据 +根據 +根本 +格外 +梆 +極 +極了 +極其 +極力 +極大 +極度 +極為 +極端 +概 +構成 +權時 +次第 +欢迎 +欤 +歟 +歡迎 +正值 +正在 +正如 +正巧 +正常 +正是 +此 +此中 +此后 +此地 +此处 +此外 +此後 +此时 +此時 +此次 +此處 +此間 +此间 +歷 +歸 +歸根到底 +歸根結底 +歸齊 +殆 +毋宁 +毋寧 +每 +每个 +每個 +每天 +每年 +每当 +每时每刻 +每時每刻 +每每 +每當 +每逢 +比 +比及 +比如 +比如說 +比如说 +比方 +比照 +比起 +比較 +比较 +毕竟 +毫不 +毫无 +毫无例外 +毫无保留地 +毫無 +毫無例外 +毫無保留地 +汝 +決不 +決定 +決非 +沒 +沒奈何 +沒有 +沙沙 +没 +没奈何 +没有 +沿 +沿着 +沿著 +況且 +注意 +活 +深入 +清楚 +湊巧 +準備 +满 +满足 +滿 +滿足 +漫說 +漫説 +漫说 +為 +為主 +為了 +為什麼 +為什麽 +為何 +為止 +為此 +為著 +烏乎 +焉 +無 +無寧 +無法 +無論 +然 +然则 +然則 +然后 +然後 +然而 +照 +照着 +照著 +爭取 +爲了 +爲什麼 +爲何 +爲甚麼 +爲着 +爲著 +爾 +爾後 +爾爾 +爾等 +牢牢 +特別是 +特别是 +特殊 +特点 +特約 +特约 +特點 +犹且 +犹自 +独 +独媒特约 +独自 +猛然 +猛然間 +猛然间 +猶且 +猶自 +獨 +獨媒特約 +獨自 +獲得 +率尔 +率然 +率爾 +现代 +现在 +現代 +現在 +理应 +理当 +理應 +理當 +理該 +理该 +瑟瑟 +甚且 +甚么 +甚或 +甚而 +甚至 +甚至于 +甚至於 +甚麼 +甚麼樣 +甚麽 +產生 +用 +用來 +用来 +甫 +甭 +由 +由于 +由於 +由是 +由此 +由此可見 +由此可见 +畢竟 +略 +略为 +略加 +略微 +略為 +當 +當下 +當中 +當兒 +當前 +當即 +當口兒 +當地 +當場 +當庭 +當時 +當然 +當真 +當着 +當著 +當頭 +白 +白白 +的 +的确 +的確 +的話 +的话 +皆可 +盡 +盡可能 +盡如人意 +盡心盡力 +盡心竭力 +盡快 +盡然 +盡量 +目前 +直到 +直接 +相似 +相信 +相反 +相同 +相对 +相对而言 +相對 +相對而言 +相应 +相当 +相應 +相當 +相等 +省得 +看 +看上去 +看來 +看出 +看到 +看来 +看样子 +看樣子 +看看 +看見 +看见 +看起來 +看起来 +真是 +真正 +眨眼 +着 +着呢 +矣 +矣乎 +矣哉 +知道 +砰 +确定 +碰巧 +確定 +社会主义 +社會主義 +离 +种 +积极 +称 +移动 +移動 +種 +稱 +積極 +究竟 +穷年累月 +突出 +突然 +窃 +窮年累月 +竊 +立 +立刻 +立即 +立地 +立时 +立時 +立馬 +立马 +竟 +竟然 +竟而 +第 +第二 +等 +等到 +等等 +策略地 +简直 +简而言之 +简言之 +管 +範圍 +簡直 +簡而言之 +簡言之 +类如 +粗 +精光 +純 +純粹 +紧接着 +累年 +累次 +組成 +結合 +結果 +絕 +絕不 +絕對 +絕非 +絕頂 +給 +經 +經常 +經過 +綜上所述 +維持 +緊接著 +練習 +縱 +縱令 +縱使 +縱然 +縷縷 +總之 +總括來説 +總括而言 +總是 +總的來看 +總的來說 +總的來説 +總的說來 +總的説來 +總結 +總而言之 +繼之 +繼後 +繼續 +繼而 +纯 +纯粹 +纵 +纵令 +纵使 +纵然 +练习 +组成 +经 +经常 +经过 +结合 +结果 +给 +绝 +绝不 +绝对 +绝非 +绝顶 +继之 +继后 +继续 +继而 +维持 +综上所述 +缕缕 +罢了 +罷了 +老 +老大 +老是 +老老实实 +老老實實 +考慮 +考虑 +者 +而 +而且 +而况 +而又 +而后 +而外 +而已 +而後 +而是 +而況 +而言 +而論 +而论 +联系 +联袂 +聯繫 +聯袂 +背地裡 +背地里 +背靠背 +能 +能否 +能够 +能夠 +腾 +臨 
+臨到 +自 +自个儿 +自从 +自個兒 +自各儿 +自各兒 +自后 +自家 +自己 +自後 +自從 +自打 +自身 +臭 +至 +至于 +至今 +至於 +至若 +致 +與 +與其 +與其說 +與否 +與此同時 +舉凡 +舉行 +般的 +良好 +若 +若夫 +若是 +若果 +若非 +范围 +莫 +莫不 +莫不然 +莫如 +莫若 +莫非 +获得 +萬一 +著 +著呢 +藉以 +藉此 +處在 +處理 +處處 +虽 +虽则 +虽然 +虽说 +蛮 +蠻 +行为 +行动 +行動 +行為 +衝 +表明 +表示 +被 +裡面 +複雜 +要 +要不 +要不是 +要不然 +要么 +要是 +要求 +要麼 +見 +規定 +親口 +親手 +親眼 +親自 +親身 +覺得 +见 +规定 +觉得 +設使 +設或 +設若 +許多 +話說 +該 +該當 +認為 +認爲 +認真 +認識 +誠如 +誠然 +說 +說來 +說明 +說說 +誰 +誰人 +誰料 +誰知 +請勿 +論 +論說 +諸 +諸位 +諸如 +謹 +譬喻 +譬如 +變成 +讓 +认为 +认真 +认识 +让 +许多 +论 +论说 +设使 +设或 +设若 +诚如 +诚然 +话说 +该 +该当 +说 +说明 +说来 +说说 +请勿 +诸 +诸位 +诸如 +谁 +谁人 +谁料 +谁知 +谨 +豁然 +豈 +豈但 +豈止 +豈非 +豐富 +賊死 +賴以 +贼死 +赖以 +赶 +赶快 +赶早不赶晚 +起 +起來 +起先 +起初 +起头 +起来 +起見 +起见 +起頭 +起首 +趁 +趁便 +趁势 +趁勢 +趁早 +趁机 +趁機 +趁热 +趁熱 +趁着 +趁著 +越是 +趕 +趕快 +趕早不趕晚 +距 +跟 +路經 +路经 +較 +較之 +較比 +較為 +轉動 +轉變 +轉貼 +轟然 +转动 +转变 +转贴 +轰然 +较 +较为 +较之 +较比 +边 +达到 +达旦 +迄 +迅速 +过 +过于 +过去 +过来 +运用 +近 +近來 +近几年来 +近年來 +近年来 +近幾年來 +近来 +还 +还是 +还有 +还要 +这 +这一来 +这个 +这么 +这么些 +这么样 +这么点儿 +这些 +这会儿 +这儿 +这就是说 +这时 +这样 +这次 +这点 +这种 +这般 +这边 +这里 +这麽 +进入 +进去 +进来 +进步 +进而 +进行 +连 +连同 +连声 +连日 +连日来 +连袂 +连连 +迟早 +迫于 +迫於 +适应 +适当 +适用 +逐步 +逐渐 +逐漸 +這 +這一來 +這些 +這個 +這兒 +這就是說 +這就是説 +這時 +這會兒 +這樣 +這次 +這種 +這般 +這裏 +這裡 +這邊 +這麼 +這麼些 +這麼樣 +這麼點兒 +這麽 +這點 +通常 +通过 +通過 +造成 +逢 +連 +連同 +連日 +連日來 +連聲 +連袂 +連連 +進來 +進入 +進去 +進步 +進而 +進行 +遇到 +運用 +過 +過來 +過去 +過於 +達到 +達旦 +適應 +適用 +適當 +遭到 +遲早 +遵循 +遵照 +避免 +還 +還是 +還有 +還要 +邊 +那 +那个 +那么 +那么些 +那么样 +那些 +那会儿 +那個 +那儿 +那兒 +那时 +那時 +那會兒 +那末 +那样 +那樣 +那般 +那裏 +那裡 +那边 +那邊 +那里 +那麼 +那麼些 +那麼樣 +那麽 +部分 +都 +鄙人 +采取 +里面 +重大 +重新 +重要 +針對 +鉴于 +鑑於 +鑒於 +针对 +長期以來 +長此下去 +長線 +長話短說 +长期以来 +长此下去 +长线 +长话短说 +開外 +開始 +開展 +間或 +關於 +问题 +间或 +防止 +阿 +附近 +陈年 +限制 +陡然 +除 +除了 +除却 +除卻 +除去 +除外 +除开 +除此 +除此之外 +除此以外 +除此而外 +除開 +除非 +陳年 +随 +随后 +随时 +随着 +随著 +隔夜 +隔日 +隨 +隨後 +隨時 +隨著 +难得 +难怪 +难说 +难道 +难道说 +集中 +雖 +雖則 +雖然 +雖說 +雖説 +雙方 +離 +難得 +難怪 +難說 +難道 +難道說 +雲爾 +零 +需要 +非但 +非常 +非徒 +非得 +非特 +非独 +非獨 +靠 +鞏固 +頂多 +頃 +頃刻 +頃刻之間 +頃刻間 +順 +順着 +順著 +頓時 +頗 +願意 +類如 +顯然 +顯著 +顶多 +顷 +顷刻 +顷刻之间 +顷刻间 +顺 +顺着 +顿时 +颇 +風雨無阻 +风雨无阻 +飽 +餘外 +餵 +饱 +首先 +馬上 +騰 +马上 +高低 +高兴 +高興 +麼 +默然 +默默地 +齊 +齐 +︿ +! 
+# +$ +% +& +' +( +) +)÷(1- +)、 +* ++ ++ξ +++ +, +,也 +- +-β +-- +-[*]- +. +/ +0 +0:2 +1 +1. +12% +2 +2.3% +3 +4 +5 +5:0 +6 +7 +8 +9 +: +; +< +<± +<Δ +<λ +<φ +<< += +=″ +=☆ +=( +=- +=[ +={ +> +>λ +? +@ +[ +[①①] +[①②] +[①③] +[①④] +[①⑤] +[①⑥] +[①⑦] +[①⑧] +[①⑨] +[①] +[①A] +[①B] +[①C] +[①D] +[①E] +[①f] +[①g] +[①h] +[①i] +[①o] +[② +[②①] +[②②] +[②③] +[②④ +[②⑤] +[②⑥] +[②⑦] +[②⑧] +[②⑩] +[②] +[②a] +[②B] +[②c] +[②d] +[②e] +[②f] +[②G] +[②h] +[②i] +[②j] +[③①] +[③⑩] +[③] +[③a] +[③b] +[③c] +[③d] +[③e] +[③F] +[③g] +[③h] +[④] +[④a] +[④b] +[④c] +[④d] +[④e] +[⑤] +[⑤]] +[⑤a] +[⑤b] +[⑤d] +[⑤e] +[⑤f] +[⑥] +[⑦] +[⑧] +[⑨] +[⑩] +[*] +[- +[] +] +]∧′=[ +][ +_ +A +a] +b] +c] +e] +f] +LI +ng昉 +R. L. +R.L. +ZXFITL +{ +{- +| +} +}> +~ +~± +~+ +¥ \ No newline at end of file diff --git a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm index 2e1512d9c4..15a6c2df1c 100644 --- a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm +++ b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/StoriesBase.pm @@ -14,6 +14,7 @@ use namespace::autoclean; use MediaWords::DBI::Stories; use MediaWords::DBI::Stories::WordMatrix; +use MediaWords::DBI::Stories::WordMatrixOldStopwords; # FIXME remove once stopword comparison is over use MediaWords::Solr; use MediaWords::Solr::TagCounts; use MediaWords::Util::ParseHTML; @@ -165,16 +166,23 @@ SQL } # add a word_count field to each story that includes a word count for that story -sub _attach_word_counts_to_stories($$) +# FIXME remove extra "$" once stopword comparison is over +sub _attach_word_counts_to_stories($$$) { - my ( $db, $stories ) = @_; + # FIXME remove extra parameter once stopword comparison is over + my ( $db, $stories, $old_stopwords ) = @_; my $stories_ids = [ map { $_->{ stories_id } } @{ $stories } ]; my $stories_lookup = {}; map { $stories_lookup->{ $_->{ stories_id } } = $_ } @{ $stories }; - my ( $word_matrix, $word_list ) = 
MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, $stories_ids ); + my ( $word_matrix, $word_list ); + if ( $old_stopwords ) { + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrixOldStopwords::get_story_word_matrix( $db, $stories_ids ); + } else { + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, $stories_ids ); + } while ( my ( $stories_id, $word_counts ) = each( %{ $word_matrix } ) ) { @@ -322,7 +330,9 @@ SQL $stories = MediaWords::DBI::Stories::attach_story_data_to_stories( $stories, $feed_data, 'feeds' ); } - $stories = _attach_word_counts_to_stories( $db, $stories ) if ( int( $self->{ show_wc } // 0 ) ); + if ( int( $self->{ show_wc } // 0 ) ) { + $stories = _attach_word_counts_to_stories( $db, $stories, $self->{ old_stopwords } ); + } return $stories; } @@ -381,6 +391,8 @@ sub _fetch_list($$$$$$) $self->{ show_text } = int( $c->req->params->{ text } // 0 ); $self->{ show_ap_stories_id } = int( $c->req->params->{ ap_stories_id } // 0 ); $self->{ show_wc } = int( $c->req->params->{ wc } // 0 ); + # FIXME remove once stopword comparison is over + $self->{ old_stopwords } = int( $c->req->params->{ old_stopwords } // 0 ); $self->{ show_feeds } = int( $c->req->params->{ show_feeds } // 0 ); $rows //= 20; @@ -544,7 +556,13 @@ sub word_matrix_GET my $stories_ids = MediaWords::Solr::search_solr_for_stories_ids( $db, { q => $q, fq => $fq, rows => $rows, sort => 'random_1 asc' } ); - my ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, $stories_ids ); + my ( $word_matrix, $word_list ); + if ( $c->req->params->{ old_stopwords } ) { + # FIXME remove once stopword comparison is over + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrixOldStopwords::get_story_word_matrix( $db, $stories_ids ); + } else { + ( $word_matrix, $word_list ) = MediaWords::DBI::Stories::WordMatrix::get_story_word_matrix( $db, $stories_ids ); + } 
$self->status_ok( $c, entity => { word_matrix => $word_matrix, word_list => $word_list } ); diff --git a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm index c3c39f024f..d1c560185d 100644 --- a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm +++ b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Wc.pm @@ -9,6 +9,9 @@ use List::Util qw(first max maxstr min minstr reduce shuffle sum); use Moose; use namespace::autoclean; use MediaWords::Solr; +use MediaWords::Solr::WordCounts; +use MediaWords::Solr::WordCountsOldStopwords; # FIXME remove once stopword comparison is over + =head1 NAME @@ -47,7 +50,13 @@ sub list_GET : PathPrefix( '/api' ) $c->req->params->{ sample_size } = $sample_size; - my $wc = MediaWords::Solr::WordCounts->new( { db => $c->dbis, cgi_params => $c->req->params } ); + my $wc; + if ( $c->req->params->{ old_stopwords } ) { + # FIXME remove once stopword comparison is over + $wc = MediaWords::Solr::WordCountsOldStopwords->new( { db => $c->dbis, cgi_params => $c->req->params } ); + } else { + $wc = MediaWords::Solr::WordCounts->new( { db => $c->dbis, cgi_params => $c->req->params } ); + } my $words = $wc->get_words; diff --git a/apps/webapp-api/src/perl/MediaWords/DBI/Stories/WordMatrixOldStopwords.pm b/apps/webapp-api/src/perl/MediaWords/DBI/Stories/WordMatrixOldStopwords.pm new file mode 100644 index 0000000000..b664d8116d --- /dev/null +++ b/apps/webapp-api/src/perl/MediaWords/DBI/Stories/WordMatrixOldStopwords.pm @@ -0,0 +1,152 @@ +# FIXME remove once stopword comparison is over +package MediaWords::DBI::Stories::WordMatrixOldStopwords; + +use strict; +use warnings; + +use Modern::Perl "2015"; +use MediaWords::CommonLibs; + +use List::MoreUtils qw(natatime); + +use MediaWords::Solr::WordCountsOldStopwords; + +# get a postgres cursor that will return the concatenated story_sentences for each of the given stories_ids. 
use +# $sentence_separator to join the sentences for each story. +sub _get_story_word_matrix_rows($$$) +{ + my ( $db, $stories_ids, $sentence_separator ) = @_; + + return [] unless ( @{ $stories_ids } ); + + my $stories_ids_list = join( ',', map { int( $_ ) } @{ $stories_ids } ); + + my $ids_table = $db->get_temporary_ids_table( $stories_ids ); + my $rows = $db->query( <hashes; +select stories_id, language, string_agg( sentence, \$1 ) story_text + from story_sentences + where stories_id in ( $stories_ids_list ) + group by stories_id, language + order by stories_id, language +SQL + + return $rows; +} + +# Given a list of stories_ids, generate a matrix consisting of the vector of word stem counts for each stories_id on each +# line. Return a hash of story word counts and a list of word stems. +# +# The list of story word counts is in the following format: +# { +# { => +# { => , +# +# } +# }, +# ... +# ] +# +# The id of each word is the indes of the given word in the word list. The word list is a list of lists, with each +# member list consisting of the stem followed by the most commonly used term. +# +# For example, for stories_ids 1 and 2, both of which contain 4 mentions of 'foo' and 10 of 'bars', the word count +# has and and word list look like: +# +# [ { 1 => { 0 => 4, 1 => 10 } }, { 2 => { 0 => 4, 1 => 10 } } ] +# +# [ [ 'foo', 'foo' ], [ 'bar', 'bars' ] ] +# +# The story_sentences for each story will be used for word counting. If $max_words is specified, only the most common +# $max_words will be used for each story. +# +# The function uses MediaWords::Util::IdentifyLanguage to identify the stemming and stopwording language for each story. +# If the language of a given story is not supported, stemming and stopwording become null operations. For the list of +# languages supported, see @MediaWords::Langauges::Language::_supported_languages. 
+sub get_story_word_matrix($$;$) +{ + my ( $db, $stories_ids, $max_words ) = @_; + + my $word_index_lookup = {}; + my $word_index_sequence = 0; + my $word_term_counts = {}; + + my $use_transaction = !$db->in_transaction(); + $db->begin if ( $use_transaction ); + + my $sentence_separator = 'SPLITSPLIT'; + my $story_text_cursor = + + my $word_matrix = {}; + my $iter = natatime( 100, @{ $stories_ids } ); + while ( my @chunk_stories_ids = $iter->() ) + { + my $stories = _get_story_word_matrix_rows( $db, \@chunk_stories_ids, $sentence_separator ); + + for my $story ( @{ $stories } ) + { + my $wc = MediaWords::Solr::WordCountsOldStopwords->new(); + + # Remove stopwords from the stems + $wc->include_stopwords( 0 ); + + my $sentences_and_story_languages = []; + for my $sentence ( split( $sentence_separator, $story->{ story_text } ) ) + { + push( + @{ $sentences_and_story_languages }, + { + 'story_language' => $story->{ language }, + 'sentence' => $sentence, + } + ); + } + + my $stem_counts = $wc->count_stems( $sentences_and_story_languages ); + + my $stem_count_list = []; + while ( my ( $stem, $data ) = each( %{ $stem_counts } ) ) + { + push( @{ $stem_count_list }, [ $stem, $data->{ count }, $data->{ terms } ] ); + } + + if ( $max_words ) + { + $stem_count_list = [ sort { $b->[ 1 ] <=> $a->[ 1 ] } @{ $stem_count_list } ]; + splice( @{ $stem_count_list }, 0, $max_words ); + } + + $word_matrix->{ $story->{ stories_id } } //= {}; + my $stem_vector = $word_matrix->{ $story->{ stories_id } }; + for my $stem_count ( @{ $stem_count_list } ) + { + my ( $stem, $count, $terms ) = @{ $stem_count }; + + $word_index_lookup->{ $stem } //= $word_index_sequence++; + my $index = $word_index_lookup->{ $stem }; + + $stem_vector->{ $index } += $count; + + map { $word_term_counts->{ $stem }->{ $_ } += $terms->{ $_ } } keys( %{ $terms } ); + } + } + } + + $db->commit if ( $use_transaction ); + + my $word_list = []; + for my $stem ( keys( %{ $word_index_lookup } ) ) + { + my $term_pairs = []; + 
while ( my ( $term, $count ) = each( %{ $word_term_counts->{ $stem } } ) ) + { + push( @{ $term_pairs }, [ $term, $count ] ); + } + + $term_pairs = [ sort { $b->[ 1 ] <=> $a->[ 1 ] } @{ $term_pairs } ]; + $word_list->[ $word_index_lookup->{ $stem } ] = [ $stem, $term_pairs->[ 0 ]->[ 0 ] ]; + } + + return ( $word_matrix, $word_list ); +} + +1; diff --git a/apps/webapp-api/src/perl/MediaWords/Solr/WordCountsOldStopwords.pm b/apps/webapp-api/src/perl/MediaWords/Solr/WordCountsOldStopwords.pm new file mode 100644 index 0000000000..8af5a061e5 --- /dev/null +++ b/apps/webapp-api/src/perl/MediaWords/Solr/WordCountsOldStopwords.pm @@ -0,0 +1,447 @@ +# FIXME remove once stopword comparison is over +package MediaWords::Solr::WordCountsOldStopwords; + +use Moose; + +=head1 NAME + +MediaWords::Solr::WordCounts - handle word counting from solr + +=head1 DESCRIPTION + +Uses sampling to generate quick word counts from solr queries. + +=cut + +use strict; +use warnings; +use utf8; + +use Modern::Perl "2015"; +use MediaWords::CommonLibs; + +use CHI; +use Data::Dumper; +use Encode; +use List::Util; +use Readonly; +use URI::Escape; + +use MediaWords::Languages::Language; +use MediaWords::Solr; +use MediaWords::Solr::Query::MatchingSentences; +use MediaWords::Util::ParseJSON; +use MediaWords::Util::Text; + +# Max. length of the sentence to tokenize +Readonly my $MAX_SENTENCE_LENGTH => 1024; + +# Max. 
number of times to count a word in a single sentence +Readonly my $MAX_REPEATS_PER_SENTENCE => 3; + +# mediawords.wc_cache_version from config +my $_wc_cache_version; + +# Moose instance fields + +has 'q' => ( is => 'rw', isa => 'Str' ); +has 'fq' => ( is => 'rw', isa => 'ArrayRef' ); +has 'num_words' => ( is => 'rw', isa => 'Int', default => 500 ); +has 'sample_size' => ( is => 'rw', isa => 'Int', default => 1000 ); +has 'random_seed' => ( is => 'rw', isa => 'Int', default => 1 ); +has 'ngram_size' => ( is => 'rw', isa => 'Int', default => 1 ); +has 'include_stopwords' => ( is => 'rw', isa => 'Bool' ); +has 'include_stats' => ( is => 'rw', isa => 'Bool' ); +has 'cached_combined_stopwords' => ( is => 'rw', isa => 'HashRef' ); +has 'db' => ( is => 'rw' ); + +# list of all attribute names that should be exposed as cgi params +sub __get_cgi_param_attributes() +{ + return [ qw(q fq num_words sample_size random_seed include_stopwords include_stats ngram_size) ]; +} + +# return hash of attributes for use as cgi params +sub _get_cgi_param_hash($) +{ + my ( $self ) = @_; + + my $keys = __get_cgi_param_attributes(); + + my $meta = $self->meta; + + my $hash = {}; + map { $hash->{ $_ } = $meta->get_attribute( $_ )->get_value( $self ) } @{ $keys }; + + return $hash; +} + +# add support for constructor in this form: +# WordsCounts->new( cgi_params => $cgi_params ) +# where $cgi_params is a hash of cgi params directly from a web request +around BUILDARGS => sub { + my $orig = shift; + my $class = shift; + + my $args; + if ( ref( $_[ 0 ] ) ) + { + $args = $_[ 0 ]; + } + elsif ( defined( $_[ 0 ] ) ) + { + $args = { @_ }; + } + else + { + $args = {}; + } + + my $vals; + if ( $args->{ cgi_params } ) + { + my $cgi_params = $args->{ cgi_params }; + + $vals = {}; + my $keys = __get_cgi_param_attributes(); + for my $key ( @{ $keys } ) + { + if ( exists( $cgi_params->{ $key } ) ) + { + $vals->{ $key } = $cgi_params->{ $key }; + } + } + + if ( $args->{ db } ) + { + $vals->{ db } = 
$args->{ db }; + } + } + else + { + $vals = $args; + } + + if ( $vals->{ fq } && !ref( $vals->{ fq } ) ) + { + $vals->{ fq } = [ $vals->{ fq } ]; + } + + $vals->{ fq } ||= []; + + return $class->$orig( $vals ); +}; + +# Cache merged hashes of stopwords for speed +sub _combine_stopwords($$) +{ + my ( $self, $languages ) = @_; + + unless ( ref( $languages ) eq ref( [] ) ) + { + die "Languages is not an arrayref."; + } + unless ( scalar( @{ $languages } ) > 0 ) + { + die "Languages should have at least one language set."; + } + + my $language_lookup = {}; + my $deduped_languages = []; + for my $language ( @{ $languages } ) + { + unless ( $language_lookup->{ $language->language_code() } ) + { + push( @{ $deduped_languages }, $language ); + $language_lookup->{ $language->language_code() } = 1; + } + } + + $languages = $deduped_languages; + + my $language_codes = []; + foreach my $language ( @{ $languages } ) + { + push( @{ $language_codes }, $language->language_code() ); + } + $language_codes = [ sort( @{ $language_codes } ) ]; + + my $cache_key = join( '-', @{ $language_codes } ); + + unless ( $self->cached_combined_stopwords() ) + { + $self->cached_combined_stopwords( {} ); + } + + unless ( defined $self->cached_combined_stopwords->{ $cache_key } ) + { + my $combined_stopwords = {}; + foreach my $language ( @{ $languages } ) + { + my $stopwords = $language->stop_words_old_map(); + $combined_stopwords = { ( %{ $combined_stopwords }, %{ $stopwords } ) }; + } + + $self->cached_combined_stopwords->{ $cache_key } = $combined_stopwords; + } + + return $self->cached_combined_stopwords->{ $cache_key }; +} + +# expects story_sentence hashes, with a story_language field. +# +# parse the text and return a count of stems and terms in the sentence in the +# following format: +# +# { $stem => { count => $stem_count, terms => { $term => $term_count, ... 
} } } +# +# if ngram_size is > 1, use the unstemmed phrases of ngram_size as the stems +sub count_stems($$) +{ + my ( $self, $story_sentences ) = @_; + + # Set any duplicate sentences blank + my $dup_sentences = {}; + + # Tokenize each sentence and add count to $words for each token + my $stem_counts = {}; + for my $story_sentence ( @{ $story_sentences } ) + { + next unless ( defined( $story_sentence ) ); + + my $sentence = $story_sentence->{ 'sentence' }; + next unless ( defined( $sentence ) ); + + next if ( $dup_sentences->{ $sentence } ); + $dup_sentences->{ $sentence } = 1; + + # Very long sentences tend to be noise -- html text and the like. + $sentence = substr( $sentence, 0, $MAX_SENTENCE_LENGTH ) if ( length( $sentence ) > $MAX_SENTENCE_LENGTH ); + + # Remove urls so they don't get tokenized into noise + if ( $sentence =~ m~https?://[^\s]+~i ) + { + $sentence =~ s~https?://[^\s]+~~gi; + } + + my $story_language = $story_sentence->{ 'story_language' } || 'en'; + my $sentence_language = $story_sentence->{ language } || 'en'; + + # Language objects are cached in ::Languages::Language, no need to have a separate cache + my $lang_en = MediaWords::Languages::Language::default_language(); + my $lang_story = MediaWords::Languages::Language::language_for_code( $story_language ) || $lang_en; + my $lang_sentence = MediaWords::Languages::Language::language_for_code( $sentence_language ) || $lang_en; + + # Tokenize into words + my $sentence_words = $lang_sentence->split_sentence_to_words( $sentence ); + + # Remove stopwords; + # (don't stem stopwords first as they will usually be stemmed too much) + my $combined_stopwords = {}; + unless ( $self->include_stopwords ) + { + # Use both sentence's language and English stopwords + $combined_stopwords = $self->_combine_stopwords( [ $lang_en, $lang_story, $lang_sentence ] ); + } + + sub _word_is_valid_token($$) + { + my ( $word, $stopwords ) = @_; + + # Remove numbers + if ( $word =~ /^\d+?$/ ) + { + return 0; + } + + # Remove 
stopwords + if ( $stopwords->{ $word } ) + { + return 0; + } + + return 1; + } + + $sentence_words = [ grep { _word_is_valid_token( $_, $combined_stopwords ) } @{ $sentence_words } ]; + + # Stem using sentence language's algorithm + my $sentence_word_stems = + ( $self->ngram_size > 1 ) ? $sentence_words : $lang_sentence->stem_words( $sentence_words ); + + my $n = $self->ngram_size; + my $num_ngrams = scalar( @{ $sentence_words } ) - $n + 1; + + my $sentence_stem_counts = {}; + + for ( my $i = 0 ; $i < $num_ngrams ; ++$i ) + { + my $term = join( ' ', @{ $sentence_words }[ $i .. ( $i + $n - 1 ) ] ); + my $stem = join( ' ', @{ $sentence_word_stems }[ $i .. ( $i + $n - 1 ) ] ); + + $sentence_stem_counts->{ $stem } //= {}; + ++$sentence_stem_counts->{ $stem }->{ count }; + + next if ( $sentence_stem_counts->{ $stem }->{ count } > $MAX_REPEATS_PER_SENTENCE ); + + $stem_counts->{ $stem } //= {}; + ++$stem_counts->{ $stem }->{ count }; + + $stem_counts->{ $stem }->{ terms } //= {}; + ++$stem_counts->{ $stem }->{ terms }->{ $term }; + } + } + + return $stem_counts; +} + +# connect to solr server directly and count the words resulting from the query +sub _get_words_from_solr_server($) +{ + my ( $self ) = @_; + + my $db = $self->db; + + unless ( $self->q() || ( $self->fq && @{ $self->fq } ) ) + { + return []; + } + + my $solr_params = { + q => $self->q(), + fq => $self->fq, + rows => $self->sample_size, + sort => 'random_' . $self->random_seed . ' asc' + }; + + DEBUG( "executing solr query ..." ); + DEBUG Dumper( $solr_params ); + + my $story_sentences = MediaWords::Solr::Query::MatchingSentences::query_matching_sentences( $self->db, $solr_params, $self->sample_size ); + + DEBUG( "counting sentences..." 
); + my $words = $self->count_stems( $story_sentences ); + DEBUG( "done counting sentences" ); + + my @word_list; + while ( my ( $stem, $count ) = each( %{ $words } ) ) + { + push( @word_list, { stem => $stem, count => $count->{ count } } ); + } + + @word_list = sort { + $b->{ count } <=> $a->{ count } or # + $b->{ stem } cmp $a->{ stem } # + } @word_list; + + my $counts = []; + for my $w ( @word_list ) + { + my $terms = $words->{ $w->{ stem } }->{ terms }; + my ( $max_term, $max_term_count ); + while ( my ( $term, $term_count ) = each( %{ $terms } ) ) + { + if ( !$max_term || ( $term_count > $max_term_count ) ) + { + $max_term = $term; + $max_term_count = $term_count; + } + } + + if ( !MediaWords::Util::Text::is_valid_utf8( $w->{ stem } ) || !MediaWords::Util::Text::is_valid_utf8( $max_term ) ) + { + WARN "invalid utf8: $w->{ stem } / $max_term"; + next; + } + + push( @{ $counts }, { stem => $w->{ stem }, count => $w->{ count }, term => $max_term } ); + } + + splice( @{ $counts }, $self->num_words ); + + if ( $self->include_stats ) + { + return { + stats => { + num_words_returned => scalar( @{ $counts } ), + num_sentences_returned => scalar( @{ $story_sentences } ), + num_words_param => $self->num_words, + sample_size_param => $self->sample_size, + random_seed => $self->random_seed + }, + words => $counts + }; + } + else + { + return $counts; + } +} + +# return CHI cache for word counts +sub _get_cache +{ + return CHI->new( + driver => 'File', + expires_in => '1 day', + expires_variance => '0.1', + root_dir => "/var/cache/word_counts", + depth => 4 + ); +} + +# return key that uniquely identifies the query +sub _get_cache_key +{ + my ( $self ) = @_; + + $_wc_cache_version = '1'; + + my $meta = $self->meta; + + my $keys = $self->__get_cgi_param_attributes(); + + my $hash_key = "$_wc_cache_version:" . 
Dumper( map { $meta->get_attribute( $_ )->get_value( $self ) } @{ $keys } ); + + return $hash_key; +} + +# get a cached value for the given word count +sub _get_cached_words +{ + my ( $self ) = @_; + + return $self->_get_cache->get( $self->_get_cache_key ); +} + +# set a cached value for the given word count +sub _set_cached_words +{ + my ( $self, $value ) = @_; + + return $self->_get_cache->set( $self->_get_cache_key, $value ); +} + +# get sorted list of most common words in sentences matching a Solr query, +# exclude stop words. Assumes english stemming and stopwording for now. +sub get_words +{ + my ( $self ) = @_; + + my $words; + + $words = $self->_get_cached_words; + + if ( $words ) + { + return $words; + } + + $words ||= $self->_get_words_from_solr_server(); + + $self->_set_cached_words( $words ); + + return $words; +} + +1; From 1f2397552e1f6c1ca3e8b0fbc447a7feb8236db3 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 1 Apr 2021 15:41:44 -0400 Subject: [PATCH 043/175] update english list --- .../mediawords/languages/en/en_stop_words.txt | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt index 742cb22dbd..bafefdb294 100644 --- a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt @@ -324,7 +324,6 @@ co. com come comes -computer con concerning consequently @@ -473,7 +472,6 @@ forward found four fr -free from front full @@ -608,21 +606,17 @@ i.e. id ie if -ignored ii il ill im immediate immediately -importance -important in inasmuch inc inc. 
indeed -index indicate indicated indicates @@ -702,17 +696,11 @@ liked likely likewise line -little lk ll -long -longer -longest look looking looks -low -lower lr ls lt @@ -807,8 +795,6 @@ needn't neednt needs neither -net -netscape never neverf neverless @@ -861,9 +847,6 @@ one's ones only onto -open -opened -opening opens or ord @@ -1029,7 +1012,6 @@ significantly similar similarly since -sincere site six sixty @@ -1133,7 +1115,6 @@ theyd theyll theyre theyve -thick thin thing things @@ -1194,7 +1175,6 @@ um un under underneath -undoing unless unlike until @@ -1213,7 +1193,6 @@ uy uz v va -value various vc ve @@ -1310,7 +1289,6 @@ wish with within without -won won't wont would @@ -1334,9 +1312,6 @@ you're you've youd youll -young -younger -youngest your youre yours From 1fba261b25c1c08a0388b6a2f4f423b43db9a701 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Thu, 1 Apr 2021 15:57:57 -0400 Subject: [PATCH 044/175] further trip eng stopwords --- .../mediawords/languages/en/en_stop_words.txt | 94 ------------------- 1 file changed, 94 deletions(-) diff --git a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt index bafefdb294..e2869511c4 100644 --- a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt @@ -33,7 +33,6 @@ B Bart Bldg Brig -Bros C Capt Cmdr @@ -166,8 +165,6 @@ ain't aint al all -allow -allows almost alone along @@ -229,12 +226,6 @@ awfully az b ba -back -backed -backing -backs -backward -backwards bb bd be @@ -246,12 +237,6 @@ becoming been before beforehand -began -begin -beginning -beginnings -begins -behind being beings believe @@ -266,8 +251,6 @@ bf bg bh bi -big -bill billion biol bj @@ -275,10 +258,7 @@ bm bn bo both -bottom br -brief -briefly bs bt but @@ -309,13 +289,9 @@ certainly cf cg ch -changes ci ck cl -clear -clearly -click cm cmon cn @@ -332,14 +308,12 @@ considering contain 
containing contains -copy corresponding could could've couldn couldn't couldnt -course cr cry cs @@ -357,10 +331,7 @@ date de dear definitely -describe -described despite -detail did didn didn't @@ -382,19 +353,12 @@ don don't done dont -doubtful -down -downed -downing -downs -downwards due during dz e e.g each -early ec ed edu @@ -408,11 +372,6 @@ either eleven else elsewhere -empty -end -ended -ending -ends enough entirely er @@ -436,14 +395,6 @@ exactly example except f -face -faces -fairly -far -farther -felt -few -fewer ff fi fifteen @@ -451,7 +402,6 @@ fifth fifty fify find -finds first five fix @@ -459,21 +409,14 @@ fj fk fm fo -followed -following -follows for forever -former -formerly forth forty forward -found four fr from -front full fully further @@ -509,18 +452,12 @@ go goes going gone -good -goods got gotten gov gp gq gr -group -grouped -grouping -groups gs gt gu @@ -547,9 +484,7 @@ he'd he'll he's hed -hell hello -help hence her here @@ -566,10 +501,6 @@ herself herse” hes hi -hid -high -higher -highest him himse himse" @@ -610,8 +541,6 @@ ii il ill im -immediate -immediately in inasmuch inc @@ -620,8 +549,6 @@ indeed indicate indicated indicates -inner -inside insofar instead int @@ -673,18 +600,14 @@ ky kz l la -large largely last lately later -latest latter latterly lb lc -least -length less lest let @@ -715,7 +638,6 @@ mainly make makes making -man many may maybe @@ -788,12 +710,8 @@ near nearly necessarily necessary -need -needed -needing needn't neednt -needs neither never neverf @@ -814,7 +732,6 @@ none nonetheless noone nor -normally nos not noted @@ -863,7 +780,6 @@ ours ourselves ourselves out -outside over overall owing @@ -897,7 +813,6 @@ point pointed pointing points -poorly possible possibly potentially @@ -935,17 +850,12 @@ really reasonably recent recently -ref -refs regarding regardless regards related relatively respectively -rev -right -ring ro ru run @@ -1067,8 +977,6 @@ td tell ten tends -test -text tf tg th @@ -1284,8 +1192,6 @@ 
why's widely width will -willing -wish with within without From 3d9415141703b12457548c142337a9e723036b2a Mon Sep 17 00:00:00 2001 From: jtotoole Date: Fri, 2 Apr 2021 16:04:37 -0400 Subject: [PATCH 045/175] update en + tr lists --- .../mediawords/languages/en/en_stop_words.txt | 236 +----------------- .../mediawords/languages/tr/tr_stop_words.txt | 1 - 2 files changed, 1 insertion(+), 236 deletions(-) diff --git a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt index e2869511c4..b7db4e6c12 100644 --- a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt @@ -146,21 +146,16 @@ actually ad added adj -adopted -ae -af affected affecting affects after afterwards -ag again against ago ah ahead -ai ain't aint al @@ -193,13 +188,10 @@ anything anyway anyways anywhere -ao apart apparently appear approximately -aq -ar are area areas @@ -217,17 +209,11 @@ asking asks associated at -au auth available -aw away awfully -az b -ba -bb -bd be became because @@ -247,26 +233,10 @@ best better between beyond -bf -bg -bh -bi -billion -biol -bj -bm -bn -bo both -br -bs -bt but buy -bv -bw by -bz c c'mon c's @@ -283,24 +253,14 @@ cases cause causes cc -cd certain certainly -cf -cg -ch -ci -ck -cl -cm cmon -cn co co. 
com come comes -con concerning consequently consider @@ -314,21 +274,13 @@ could've couldn couldn't couldnt -cr cry -cs -cu currently -cv -cx -cy -cz d dare daren't darent date -de dear definitely despite @@ -340,9 +292,6 @@ differ different differently directly -dj -dk -dm do does doesn @@ -355,17 +304,12 @@ done dont due during -dz e e.g each -ec -ed edu -ee effect eg -eh eight eighty either @@ -374,9 +318,6 @@ else elsewhere enough entirely -er -es -esp especially et et-al @@ -390,12 +331,10 @@ everybody everyone everything everywhere -ex exactly example except f -ff fi fifteen fifth @@ -405,17 +344,12 @@ find first five fix -fj -fk -fm -fo for forever forth forty forward four -fr from full fully @@ -424,30 +358,18 @@ furthered furthering furthermore furthers -fx g -ga gave -gb -gd -ge general generally get gets getting -gf -gg -gh -gi give given gives giving -gl -gm gmt -gn go goes going @@ -455,14 +377,6 @@ gone got gotten gov -gp -gq -gr -gs -gt -gu -gw -gy h had hadn't @@ -508,9 +422,6 @@ himself himse” his hither -hk -hm -hn home homepage hopefully @@ -553,9 +464,6 @@ insofar instead int into -io -iq -ir is isn isn't @@ -572,67 +480,41 @@ itself itse” ive j -je -jm -jo join -jp just k ke keep keeps kept -kg -kh -ki kind km -kn knew know known knows -kp -kr -kw -ky -kz l -la largely last lately later latter latterly -lb -lc less lest let let's lets -li like liked likely likewise line -lk -ll look looking looks -lr -ls -lt ltd -lu -lv -ly m -ma made mainly make @@ -643,8 +525,6 @@ may maybe mayn't maynt -mc -md me mean means @@ -654,58 +534,37 @@ member members men merely -mg -mh might might've mightn't mightnt mil mill -million mine minus miss -mk -ml -mm -mn -mo more moreover most mostly move -mp -mq mr mrs -ms -msie -mt -mu much mug must must've mustn't mustnt -mv -mw -mx my myse" myself myse” mz n -na name namely nay -nc -nd -ne near nearly necessarily @@ -718,12 +577,8 @@ neverf neverless nevertheless next -nf -ng -ni nine ninety -nl no no-one nobody @@ 
-739,13 +594,9 @@ nothing notwithstanding now nowhere -np -nr -nu null number numbers -nz o obtain obtained @@ -756,7 +607,6 @@ often oh ok okay -om on once one @@ -785,7 +635,6 @@ overall owing own p -pa part parted particular @@ -793,22 +642,13 @@ particularly parting parts past -pe per perhaps -pf -pg -ph -pk -pl place placed places please plus -pm -pmid -pn point pointed pointing @@ -816,8 +656,6 @@ points possible possibly potentially -pp -pr predominantly present presented @@ -830,21 +668,14 @@ probably promptly provided provides -pt put puts -pw -py q -qa que quite -qv r ran rather -rd -re readily really reasonably @@ -856,23 +687,14 @@ regards related relatively respectively -ro -ru run -rw s -sa said same saw say saying says -sb -sc -sd -se -sec second secondly seconds @@ -891,8 +713,6 @@ sent seven seventy several -sg -sh shall shan't shant @@ -914,7 +734,6 @@ showing shown showns shows -si side sides significant @@ -925,13 +744,7 @@ since site six sixty -sj -sk -sl slightly -sm -sn -so some somebody someday @@ -948,14 +761,11 @@ specifically specified specify specifying -sr -st state states still stop strongly -su sub substantially successfully @@ -964,22 +774,14 @@ sufficiently suggest sup sure -sv -sy -sz t t's take taken taking -tc -td tell ten tends -tf -tg -th than that that'll @@ -1046,10 +848,6 @@ til till tip tis -tj -tk -tm -tn to today together @@ -1058,29 +856,16 @@ took top toward towards -tp -tr tried tries -trillion try trying -ts -tt -tv -tw twas twelve twenty twice two -tz u -ua -ug -uk -um -un under underneath unless @@ -1096,25 +881,12 @@ used uses using usually -uucp -uy -uz v -va various -vc -ve versus very -vg -vi via viz -vn -vol -vols -vs -vu w want wanted @@ -1139,7 +911,6 @@ weren weren't werent weve -wf what what'd what'll @@ -1224,10 +995,5 @@ yours yourself yourselves youve -yt -yu z -za -zero -zm -zr \ No newline at end of file +zero \ No newline at end of file diff --git 
a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt index 03ff70f6d6..75c94f0913 100644 --- a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt @@ -7,7 +7,6 @@ # https://github.com/stopwords-iso/stopwords-tr/blob/master/stopwords-tr.txt # -INSERmi a acep açıkçası From c005fa86a674829f3fe171d88462ab63f2130caf Mon Sep 17 00:00:00 2001 From: jtotoole Date: Mon, 5 Apr 2021 13:25:31 -0400 Subject: [PATCH 046/175] sort lists --- .../mediawords/languages/ca/ca_stop_words.txt | 2 +- .../mediawords/languages/da/da_stop_words.txt | 1 - .../mediawords/languages/de/de_stop_words.txt | 3 +- .../mediawords/languages/en/en_stop_words.txt | 4 - .../mediawords/languages/es/es_stop_words.txt | 3 +- .../mediawords/languages/fi/fi_stop_words.txt | 4 +- .../mediawords/languages/fr/fr_stop_words.txt | 4 +- .../mediawords/languages/ha/ha_stop_words.txt | 4 +- .../mediawords/languages/hi/hi_stop_words.txt | 4 +- .../mediawords/languages/hu/hu_stop_words.txt | 4 +- .../mediawords/languages/it/it_stop_words.txt | 4 +- .../mediawords/languages/ja/ja_stop_words.txt | 4 +- .../mediawords/languages/lt/lt_stop_words.txt | 442 +++++++++--------- .../mediawords/languages/nl/nl_stop_words.txt | 4 +- .../mediawords/languages/no/no_stop_words.txt | 4 +- .../mediawords/languages/pt/pt_stop_words.txt | 10 - .../mediawords/languages/ro/ro_stop_words.txt | 120 ----- .../mediawords/languages/ru/ru_stop_words.txt | 2 - .../mediawords/languages/sv/sv_stop_words.txt | 3 +- .../mediawords/languages/tr/tr_stop_words.txt | 1 - .../mediawords/languages/zh/zh_stop_words.txt | 19 +- 21 files changed, 241 insertions(+), 405 deletions(-) diff --git a/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt b/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt index 8aa8988fd8..40abbeb608 100644 --- 
a/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ca/ca_stop_words.txt @@ -776,4 +776,4 @@ vuitenes vuitens xano-xano xau-xau -xec +xec \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/da/da_stop_words.txt b/apps/common/src/python/mediawords/languages/da/da_stop_words.txt index adea1f4031..ea271bda3a 100644 --- a/apps/common/src/python/mediawords/languages/da/da_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/da/da_stop_words.txt @@ -3,7 +3,6 @@ # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-da/blob/master/stopwords-da.txt # (Lightly edited to remove words in the original lists that are actually meaningful) -# ad af diff --git a/apps/common/src/python/mediawords/languages/de/de_stop_words.txt b/apps/common/src/python/mediawords/languages/de/de_stop_words.txt index ec3f32fd30..57a23fd1e9 100644 --- a/apps/common/src/python/mediawords/languages/de/de_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/de/de_stop_words.txt @@ -3,7 +3,6 @@ # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-de/blob/master/stopwords-de.txt # (Lightly edited to remove words in the original lists that are actually meaningful) -# a ab @@ -611,4 +610,4 @@ zweiten zweiter zweites zwischen -zwölf +zwölf \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt index b7db4e6c12..318b310ec3 100644 --- a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt @@ -1,7 +1,6 @@ # This is a "long" stop word list for the English language. 
# # Sources: -# # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # http://www.lextek.com/manuals/onix/stopwords1.html # http://xpo6.com/list-of-english-stop-words/ @@ -12,7 +11,6 @@ # https://github.com/stopwords-iso/stopwords-en/blob/master/stopwords-en.txt # https://www.link-assistant.com/seo-stop-words.html # https://www.ranks.nl/stopwords -# # (Lightly edited to remove words in the original lists that are actually meaningful) 'll @@ -626,8 +624,6 @@ oughtn't oughtnt our ours -ours -ourselves ourselves out over diff --git a/apps/common/src/python/mediawords/languages/es/es_stop_words.txt b/apps/common/src/python/mediawords/languages/es/es_stop_words.txt index 38af73f9af..91e465d8f7 100644 --- a/apps/common/src/python/mediawords/languages/es/es_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/es/es_stop_words.txt @@ -3,7 +3,6 @@ # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-es/blob/master/stopwords-es.txt # (Lightly edited to remove words in the original lists that are actually meaningful) -# a acerca @@ -724,4 +723,4 @@ x y ya yo -z +z \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt index 961a260907..aa2aa23aec 100644 --- a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt @@ -2,9 +2,7 @@ # # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt -# # (Lightly edited to remove words in the original lists that are actually meaningful) -# aiemmin aika @@ -701,4 +699,4 @@ tuotä vaan vai vaikka -yli +yli \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt b/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt index c9ec02754b..291bd4a78d 
100644 --- a/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/fr/fr_stop_words.txt @@ -2,9 +2,7 @@ # # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-fr/blob/master/stopwords-fr.txt -# # (Lightly edited to remove words in the original lists that are actually meaningful) -# a à @@ -670,4 +668,4 @@ w x y z -zut +zut \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt b/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt index 9aebb2f755..cc7896d042 100644 --- a/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ha/ha_stop_words.txt @@ -1,10 +1,8 @@ # # This is a stop word list for the Hausa language. -# # Sources: # https://github.com/stopwords-iso/stopwords-ha/blob/master/raw/gh-stopwords-json-ha.txt # (Lightly edited to remove words in the original lists that are actually meaningful) -# a amma @@ -44,4 +42,4 @@ ya yake yana yi -za +za \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt index 63f9448b7a..328ec01437 100644 --- a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt @@ -5,9 +5,7 @@ # http://www.ranks.nl/stopwords/hindi # https://github.com/stopwords-iso/stopwords-hi/blob/master/stopwords-hi.txt # https://sites.google.com/site/kevinbouge/stopwords-lists -# # (Lightly edited to remove words in the original lists that are actually meaningful) -# न व @@ -270,4 +268,4 @@ जिन्हें जिन्हों तिन्हें -तिन्हों +तिन्हों \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt index b6f4db9c6c..eb4446fcca 
100644 --- a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt @@ -2,9 +2,7 @@ # # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-hu/blob/master/stopwords-hu.txt -# # (Lightly edited to remove words in the original lists that are actually meaningful) -# a abba @@ -791,4 +789,4 @@ voltunk ő ők őket -őt +őt \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/it/it_stop_words.txt b/apps/common/src/python/mediawords/languages/it/it_stop_words.txt index b0d8b6a12d..85e66e3ba8 100644 --- a/apps/common/src/python/mediawords/languages/it/it_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/it/it_stop_words.txt @@ -2,9 +2,7 @@ # # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-it/blob/master/stopwords-it.txt -# # (Lightly edited to remove words in the original lists that are actually meaningful) -# a abbastanza @@ -624,4 +622,4 @@ volte vostra vostre vostri -vostro +vostro \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt index 1f5e21f4f3..944e44788e 100755 --- a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt @@ -1,11 +1,9 @@ # # This is a stop word list for the Japanese language. 
-# # (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: # https://github.com/stopwords/japanese-stopwords/blob/master/data/japanese-stopwords.txt # Lucene's stopwords_ja.txt -# $ % @@ -564,4 +562,4 @@ url → ↓ ■ -○ +○ \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt index 766f17d86f..8c8079e262 100644 --- a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt @@ -1,38 +1,23 @@ -# Sources: -# -# http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html -# https://github.com/stopwords-iso/stopwords-lt/blob/master/stopwords-lt.txt -# auto-generated sources -# -# - -a -á abi abidvi -abiejø abiejose abiejuose +abiejĆø abiem abigaliai abipus abu abudu ai -ákypai ana anaiptol anaisiais -anàja -anàjá -anàjà anajai anajam anajame anapus anas anasai -anàsias anasis anei aniedvi @@ -40,7 +25,6 @@ anieji aniesiems anoji anojo -anøjø anojoje anokia anoks @@ -55,56 +39,42 @@ anuodu anuoju anuosiuose anuosius +anĆ ja +anĆ jĆ +anĆ jĆ” +anĆ sias +anĆøjĆø apie aplink ar arba argi arti -ástriþai -aukðèiau -að -aš +aukĆ°ĆØiau +aĆ° be bei beje -bemaþ +bemaĆ¾ bent bet betgi beveik -bus -buvo -būti -būtų -d -dabar dar -darbo dargi -daryti -daug -daugiau -daugiausia -daugmaþ -dažnai +daugmaĆ¾ deja -dëka -dël -dëlei -dëlto -dieną -dėl +dĆ«ka +dĆ«l +dĆ«lei +dĆ«lto ech et gal -galbût +galbĆ»t galgi -gali gan gana -gauna -gauti gi greta idant @@ -113,24 +83,16 @@ ir irgi it itin -ið -iðilgai -iðvis -iš +iĆ° +iĆ°ilgai +iĆ°vis jaisiais -jájá -jàja -jàjà jajai jajam jajame -jam -jàsias -jau jei jeigu ji -jie jiedu jiedvi jieji @@ -138,13 +100,12 @@ jiesiems jinai jis jisai -jo jog joji jojo -jøjø jojoje -jos +jokia +joks josiomis josioms josios @@ -160,29 +121,34 @@ juoju juosiuose juosius jus -jûs -jûsiðkë 
-jûsiðkis -jûsø -jį -jų +jĆ ja +jĆ jĆ +jĆ sias +jĆ”jĆ” +jĆøjĆø +jĆ»s +jĆ»siĆ°kis +jĆ»siĆ°kĆ« +jĆ»sĆø kad kada kadangi kai kaip kaipgi -kam kas katra katras katriedvi -kaþin -kaþkas -kaþkatra -kaþkokia -kaþkuri -kaþkuris +katruodu +kaĆ¾in +kaĆ¾kas +kaĆ¾katra +kaĆ¾katras +kaĆ¾kokia +kaĆ¾koks +kaĆ¾kuri +kaĆ¾kuris kiaurai kiek kiekvienas @@ -191,56 +157,42 @@ kita kitas kitokia kitoks -klausimas -klausti -kodël +kodĆ«l kokia koks kol kolei -kovo +kone kuomet kur +kurgi kuri -kurie kuriedvi -kurios kuris kuriuodu -kurių -labai lai -lietuva -lietuvoje -lietuvos lig ligi link lyg -m man manaisiais -manàja -manàjá -manàjà manajai manajam manajame manas -manæs manasai -manàsias manasis mane manieji maniesiems manim manimi -maniðkis +maniĆ°kis +maniĆ°kĆ« mano manoji manojo -manøjø manojoje manosiomis manosioms @@ -250,116 +202,102 @@ manuoju manuosiuose manuosius manyje +manĆ ja +manĆ jĆ +manĆ jĆ” +manĆ sias +manƦs +manĆøjĆø mat -maþdaug -maþne -mažai -mažas -mažiau +maĆ¾daug +maĆ¾ne mes -metais -metu -metus -metų mudu mudvi mumis mums mumyse mus -mûsiðkë -mûsiðkis -mûsø -mūsų +mĆ»siĆ°kis +mĆ»siĆ°kĆ« +mĆ»sĆø na nagi ne -në nebe nebent -negali negi negu nei nejau +nejaugi +nekaip +nelyginant nes net netgi netoli neva -niekada -niekas nors nuo -nėra +nĆ« o ogi oi -paèiais -paèiam -paèiame paeiliui -paèiø -paèiu -paèiuose -paèius pagal pakeliui palaipsniui palei pas pasak -pasakė paskos paskui paskum pat -patá pati patiems paties pats patys +patĆ” +paĆØiais +paĆØiam +paĆØiame +paĆØiu +paĆØiuose +paĆØius +paĆØiĆø per pernelyg pirm pirma pirmiau po -prašau prie -prieð -prieðais -prieš +prieĆ° +prieĆ°ais pro pusiau -r rasi -reikia rodos -sakyti -sakė sau savaisiais -savàja -savàjá -savàjà savajai savajam savajame savas -savæs savasai -savàsias savasis save savieji saviesiems savimi +saviĆ°kis +saviĆ°kĆ« savo savoji savojo -savøjø savojoje savosiomis savosioms @@ -369,22 +307,24 @@ savuoju savuosiuose savuosius savyje +savĆ ja +savĆ jĆ +savĆ jĆ” +savĆ sias +savƦs +savĆøjĆø skersai 
-skradþiai -staèiai +skradĆ¾iai +staĆØiai su sulig ta tad -taèiau tai taigi taip -taip pat taipogi -tàja -tàjá -tàjà +taisiais tajai tajam tajame @@ -395,29 +335,25 @@ tartum tarytum tas tasai -tàsias tau tavaisiais -tavàja -tavàjà tavajai tavajam tavajame tavas -tavæs -tavàsias +tavasai tavasis tave tavieji taviesiems tavimi -taviðkë -taviðkis +taviĆ°kis +taviĆ°kĆ« tavo tavoji tavojo -tavøjø tavojoje +tavosiomis tavosioms tavosios tavosiose @@ -425,7 +361,13 @@ tavuoju tavuosiuose tavuosius tavyje -tačiau +tavĆ ja +tavĆ jĆ +tavĆ jĆ” +tavĆ sias +tavƦs +tavĆøjĆø +taĆØiau te tegu tegul @@ -435,14 +377,10 @@ ties tiesiems tiesiog tik -tikrai tikriausiai tiktai -to -todėl toji tojo -tøjø tojoje tokia toks @@ -454,24 +392,22 @@ tosioms tosios tosiose tu -tûlas -tuo tuodu tuoju tuosiuose tuosius -turbût -turi -turėjo -uþ -uþtat -uþvis -už +turbĆ»t +tĆ ja +tĆ jĆ +tĆ jĆ” +tĆ sias +tĆøjĆø +tĆ»las +uĆ¾ +uĆ¾tat +uĆ¾vis va vai -val -vël -vëlgi viduj vidury vien @@ -479,62 +415,138 @@ vienas vienokia vienoks vietoj -virð -virðuj -virðum +virĆ° +virĆ°uj +virĆ°um vis -vis dëlto +vis dĆ«lto visa visas visgi -visi visokia visoks vos -ypaè +vĆ«l +vĆ«lgi +ypaĆØ +Ć” +Ć”kypai +Ć”striĆ¾ai +Ć°alia +Ć°e +Ć°i +Ć°iaisiais +Ć°iajai +Ć°iajam +Ć°iajame +Ć°iapus +Ć°iedvi +Ć°ieji +Ć°iesiems +Ć°ioji +Ć°iojo +Ć°iojoje +Ć°iokia +Ć°ioks +Ć°iosiomis +Ć°iosioms +Ć°iosios +Ć°iosiose +Ć°is +Ć°isai +Ć°it +Ć°ita +Ć°itas +Ć°itiedvi +Ć°itokia +Ć°itoks +Ć°ituodu +Ć°iuodu +Ć°iuoju +Ć°iuosiuose +Ć°iuosius +Ć°iĆ ja +Ć°iĆ jĆ +Ć°iĆ sias +Ć°iĆøjĆø +Ć°tai +Ć°Ć”jĆ” +Ć¾emiau +a +aÅ” +bus +buvo +bÅ«ti +bÅ«tų +d +dabar +darbo +daryti +daug +daugiau +daugiausia +dažnai +dienÄ + +dėl +gali +gauna +gauti +iÅ” +jam +jau +jie +jo +jos +jÄÆ +jų +kam +kartÄ +klausimas +klausti +kovo +kurie +kurios +kurių +labai +lietuva +lietuvoje +lietuvos +m +mažai +mažas +mažiau +metais +metu +metus +metų +mÅ«sų +negali +niekada +niekas +nėra +pasakė +praÅ”au +prieÅ” +r +reikia +sakyti +sakė +taip pat +tačiau +tikrai +to +todėl +tuo +turi 
+turėjo +už +val +visi yra -ðájá -ðalia -ðe -ði -ðiaisiais -ðiàja -ðiàjà -ðiajai -ðiajam -ðiajame -ðiapus -ðiàsias -ðiedvi -ðieji -ðiesiems -ðioji -ðiojo -ðiøjø -ðiojoje -ðiokia -ðioks -ðiosiomis -ðiosioms -ðiosios -ðiosiose -ðis -ðisai -ðit -ðita -ðitas -ðitiedvi -ðitokia -ðitoks -ðituodu -ðiuodu -ðiuoju -ðiuosiuose -ðiuosius -ðtai -þemiau -į -šalia -čia -šios -žmonių +čia +ÄÆ +Å”alia +Å”alies +Å”ios +žmonių \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt b/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt index 97e6741d4f..6ef3790c11 100644 --- a/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/nl/nl_stop_words.txt @@ -2,9 +2,7 @@ # # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-nl/blob/master/stopwords-nl.txt -# # (Lightly edited to remove words in the original lists that are actually meaningful) -# aan aangaande @@ -414,4 +412,4 @@ zowat zulk zulke zullen -zult +zult \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/no/no_stop_words.txt b/apps/common/src/python/mediawords/languages/no/no_stop_words.txt index f29adcd41e..5949a9c321 100644 --- a/apps/common/src/python/mediawords/languages/no/no_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/no/no_stop_words.txt @@ -2,9 +2,7 @@ # # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-no/blob/master/stopwords-no.txt -# # (Lightly edited to remove words in the original lists that are actually meaningful) -# å alle @@ -221,4 +219,4 @@ vore vöre vors vort -vört +vört \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt b/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt index c518b2c5eb..964c0d13d1 100644 --- 
a/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/pt/pt_stop_words.txt @@ -3,8 +3,6 @@ # # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-pt/blob/master/stopwords-pt.txt -# -# a à @@ -395,14 +393,6 @@ diária diariamente diárias diário -diária -diariamente -diárias -diário -diária -diariamente -diárias -diário dias dica dicas diff --git a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt index 5522f31479..cae4ee0cf7 100644 --- a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt @@ -4,8 +4,6 @@ # # http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) # https://github.com/stopwords-iso/stopwords-ro/blob/master/stopwords-ro.txt -# -# a abia @@ -20,7 +18,6 @@ aceia aceiaşi acel acela -acela acelasi acelaşi acele @@ -35,7 +32,6 @@ acelui aceluiaşi acest acesta -acesta aceste acestea acestei @@ -50,7 +46,6 @@ aceştia acolo acord acum -acum adica ai aia @@ -58,7 +53,6 @@ aibă aici aiurea al -al ala alaturi ale @@ -66,8 +60,6 @@ alea alt alta altceva -altceva -altcineva altcineva altcuiva alte @@ -82,12 +74,10 @@ altora altui altuia altul -altul altă alţi alţii am -am amândoi amândouă amânduror @@ -108,10 +98,8 @@ asta astazi astea astfel -astfel astăzi asupra -asupra atare atat atât @@ -130,7 +118,6 @@ atita atitea atitia atunci -atunci au avea aveai @@ -140,7 +127,6 @@ aveaţi avem aveţi avut -avut azi aş aşa @@ -149,25 +135,20 @@ aţi b ba bine -bine bucur bună c ca -ca cam cand când -când capat care -care careia carora caruia cat cât -cât câte câteva câtor @@ -175,13 +156,11 @@ câtora câtorva catre câtva -câtva câtă caut câţi câţiva ce -ce cea cealaltă ceea @@ -200,18 +179,12 @@ celui celuilalt celălalt ceva -ceva -chiar chiar ci -ci -cinci cinci cind cînd cine -cine 
-cineva cineva cit cît @@ -226,42 +199,32 @@ cîţi conform contra cu -cu -cui cui cuiva cum -cum cumva curând curînd cutare că -că căci cărei căror cărui către -către d da daca dacă -dacă -dar dar dat -dat datorită dată dau de -de deasupra decât deci -deci decit degraba deja @@ -269,24 +232,18 @@ deoarece departe desi despre -despre destui destul destule destulă deşi -deşi -din din dinaintea dintr dintr- dintre -dintre -doar doar doi -doi doilea doime doua @@ -295,34 +252,23 @@ drept dumneavoastră dupa după -după dă e ea -ea ei -ei -el el ele -ele -era era erai eram -eram erau este -este -eu eu exact există eşti -eşti f face -face fara fata faţă @@ -330,7 +276,6 @@ fel fi fie fiecare -fiecare fiecărei fiecăreia fiecărui @@ -339,18 +284,14 @@ fii fiind fim fiu -fiu fiţi foarte -foarte făcut g h i ia iar -iar -ieri ieri ii îi @@ -361,14 +302,12 @@ imi împotriva in în -în inainte înainte înaintea inapoi inca încât -încât incit încît încotro @@ -407,37 +346,28 @@ j k l la -la -lângă lângă le li lîngă lor -lor -lui lui m ma mai -mai mâine mare -mare mea mei mele mereu meu -meu mi mie -mie mîine mine mod mult -mult multa multe multi @@ -450,49 +380,35 @@ ne nevoie ni nici -nici niciodata nicăieri nimeni -nimeni nimeri nimic -nimic nimănui niste nişte -nişte noastre noastră noi -noi noroc nostri nostru -nostru nou noua -noua -nouă nouă noştri nu -nu numai -numai -o o oarecare oi om opt -opt or ori oricând oricare -oricare -oricât oricât oricâte oricâtor @@ -500,10 +416,8 @@ oricâtora oricâtă oricâţi orice -orice oricînd oricine -oricine oricît oricui oricum @@ -518,25 +432,18 @@ oţi p pai până -până parte patra patru -patru patrulea pe -pe -pentru pentru peste -peste pic pina pînă plus poate -poate -pot pot prea prima @@ -546,16 +453,13 @@ primelor primii primilor primul -primul primului prin -prin printr- printre putea putini puţin -puţin puţina puţine puţini @@ -569,8 +473,6 @@ sa-ti sai sale sau -sau -se se si sie @@ -583,24 +485,18 @@ sîntem sînteţi spate spre -spre 
spune spus sub -sub sunt -sunt -suntem suntem sunteţi -sunteţi sus sutime sută să săi său -său t ta tale @@ -613,31 +509,25 @@ toate toată tocmai tot -tot toti totul totusi totuşi -totuşi toţi trebuie trei -trei treia treilea treime tu -tu tuturor tăi tău -tău u ul un una unde -unde undeva unei uneia @@ -647,11 +537,9 @@ unii unor unora unu -unu unui unuia unul -unul v va vei @@ -660,12 +548,9 @@ vi voastre voastră voi -voi vom vor vostru -vostru -vouă vouă voştri vreme @@ -685,20 +570,15 @@ vă x z zece -zece zero zi zice şapte -şapte -şase şase şi ţi -şi ăia ţie -ţie ăla ălea ăleia diff --git a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt index 99e38c779c..f3195998bb 100644 --- a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt @@ -3,8 +3,6 @@ # # Source: # https://github.com/stopwords-iso/stopwords-ru/blob/master/stopwords-ru.txt -# -# adriver amp diff --git a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt index 9758857e62..4252bf2627 100644 --- a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt @@ -5,7 +5,6 @@ # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-sv/blob/master/stopwords-sv.txt # that one Swedish journalist -# aderton adertonde @@ -355,4 +354,4 @@ vilka vilkas vilken vilket -vill +vill \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt index 75c94f0913..0ead125f7d 100644 --- a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt @@ -5,7 +5,6 @@ # 
http://nlp.ceng.fatih.edu.tr/blog/?p=101 # http://www.ranks.nl/stopwords/turkish.html # https://github.com/stopwords-iso/stopwords-tr/blob/master/stopwords-tr.txt -# a acep diff --git a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt index 533e638f96..c2324e184a 100644 --- a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt @@ -3,6 +3,7 @@ # Sources: # http://blog.csdn.net/shijiebei2009/article/details/39696571 # http://github.com/stopwords-iso/stopwords-zh + ! " # @@ -178,7 +179,6 @@ sup 上来 上述 上面 -下 下來 下列 下去 @@ -306,7 +306,6 @@ sup 並無 並肩 並非 -个 个人 个别 中 @@ -374,7 +373,6 @@ sup 互 互相 五 -些 亦 产生 亲口 @@ -473,7 +471,6 @@ sup 但凡 但是 但願 -何 何乐而不为 何以 何况 @@ -504,7 +501,6 @@ sup 使 使得 使用 -來 來不及 來得及 來看 @@ -525,7 +521,6 @@ sup 俺 俺们 俺們 -個 個人 個別 倍加 @@ -665,12 +660,10 @@ sup 分期 分期分批 分頭 -切 切不可 切切 切勿 切莫 -则 则甚 刚 刚好 @@ -704,7 +697,6 @@ sup 到處 到頭 到頭來 -則 則甚 前后 前後 @@ -1033,7 +1025,6 @@ sup 它是 它的 完成 -定 实际 密切 實現 @@ -1370,7 +1361,6 @@ sup 方 方便 方才 -方面 於 於是 於是乎 @@ -1391,7 +1381,6 @@ sup 日漸 日益 日臻 -时 时候 明显 明确 @@ -1404,7 +1393,6 @@ sup 是的 显然 显著 -時 時候 暗中 暗地裡 @@ -1455,7 +1443,6 @@ sup 本著 本身 权时 -来 来不及 来得及 来看 @@ -1878,7 +1865,6 @@ sup 至今 至於 至若 -致 與 與其 與其說 @@ -2002,7 +1988,6 @@ sup 赖以 赶快 赶早不赶晚 -起 起來 起先 起初 @@ -2033,7 +2018,6 @@ sup 较为 较之 较比 -边 达到 迄 过 @@ -2151,7 +2135,6 @@ sup 還是 還有 還要 -邊 那 那个 那么 From 7cb995ad7e1712f9b61bef48c845eaa520d5e527 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Mon, 5 Apr 2021 15:23:09 -0400 Subject: [PATCH 047/175] correct lt encoding --- .../mediawords/languages/lt/lt_stop_words.txt | 365 +++++++----------- 1 file changed, 148 insertions(+), 217 deletions(-) diff --git a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt index 8c8079e262..1cf21e2ae6 100644 --- a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt +++ 
b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt @@ -1,8 +1,17 @@ +# +# This is a stop word list for the Lithuanian language. +# +# Sources: +# http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html +# https://github.com/stopwords-iso/stopwords-lt/blob/master/stopwords-lt.txt +# auto-generated sources +# + abi abidvi abiejose abiejuose -abiejĆø +abiejų abiem abigaliai abipus @@ -39,39 +48,39 @@ anuodu anuoju anuosiuose anuosius -anĆ ja -anĆ jĆ -anĆ jĆ” -anĆ sias -anĆøjĆø +anąja +anąją +anąjį +anąsias +anųjų apie aplink ar arba argi arti -aukĆ°ĆØiau -aĆ° +aukščiau +aš be bei beje -bemaĆ¾ +bemaž bent bet betgi beveik dar dargi -daugmaĆ¾ +daugmaž deja -dĆ«ka -dĆ«l -dĆ«lei -dĆ«lto +dėka +dėl +dėlei +dėlto ech et gal -galbĆ»t +galbūt galgi gan gana @@ -83,9 +92,9 @@ ir irgi it itin -iĆ° -iĆ°ilgai -iĆ°vis +iš +išilgai +išvis jaisiais jajai jajam @@ -121,15 +130,15 @@ juoju juosiuose juosius jus -jĆ ja -jĆ jĆ -jĆ sias -jĆ”jĆ” -jĆøjĆø -jĆ»s -jĆ»siĆ°kis -jĆ»siĆ°kĆ« -jĆ»sĆø +jąja +jąją +jąsias +jįjį +jųjų +jūs +jūsiškis +jūsiškė +jūsų kad kada kadangi @@ -141,14 +150,14 @@ katra katras katriedvi katruodu -kaĆ¾in -kaĆ¾kas -kaĆ¾katra -kaĆ¾katras -kaĆ¾kokia -kaĆ¾koks -kaĆ¾kuri -kaĆ¾kuris +kažin +kažkas +kažkatra +kažkatras +kažkokia +kažkoks +kažkuri +kažkuris kiaurai kiek kiekvienas @@ -157,7 +166,7 @@ kita kitas kitokia kitoks -kodĆ«l +kodėl kokia koks kol @@ -188,8 +197,8 @@ manieji maniesiems manim manimi -maniĆ°kis -maniĆ°kĆ« +maniškis +maniškė mano manoji manojo @@ -202,15 +211,15 @@ manuoju manuosiuose manuosius manyje -manĆ ja -manĆ jĆ -manĆ jĆ” -manĆ sias -manƦs -manĆøjĆø +manąja +manąją +manąjį +manąsias +manęs +manųjų mat -maĆ¾daug -maĆ¾ne +maždaug +mažne mes mudu mudvi @@ -218,9 +227,9 @@ mumis mums mumyse mus -mĆ»siĆ°kis -mĆ»siĆ°kĆ« -mĆ»sĆø +mūsiškis +mūsiškė +mūsų na nagi ne @@ -240,7 +249,7 @@ netoli neva nors nuo -nĆ« +nė o ogi oi @@ -260,14 +269,14 @@ patiems paties pats patys -patĆ” -paĆØiais 
-paĆØiam -paĆØiame -paĆØiu -paĆØiuose -paĆØius -paĆØiĆø +patį +pačiais +pačiam +pačiame +pačiu +pačiuose +pačius +pačių per pernelyg pirm @@ -275,8 +284,8 @@ pirma pirmiau po prie -prieĆ° -prieĆ°ais +prieš +priešais pro pusiau rasi @@ -293,8 +302,8 @@ save savieji saviesiems savimi -saviĆ°kis -saviĆ°kĆ« +saviškis +saviškė savo savoji savojo @@ -307,15 +316,15 @@ savuoju savuosiuose savuosius savyje -savĆ ja -savĆ jĆ -savĆ jĆ” -savĆ sias -savƦs -savĆøjĆø +savąja +savąją +savąjį +savąsias +savęs +savųjų skersai -skradĆ¾iai -staĆØiai +skradžiai +stačiai su sulig ta @@ -347,8 +356,8 @@ tave tavieji taviesiems tavimi -taviĆ°kis -taviĆ°kĆ« +taviškis +taviškė tavo tavoji tavojo @@ -361,13 +370,13 @@ tavuoju tavuosiuose tavuosius tavyje -tavĆ ja -tavĆ jĆ -tavĆ jĆ” -tavĆ sias -tavƦs -tavĆøjĆø -taĆØiau +tavąja +tavąją +tavąjį +tavąsias +tavęs +tavųjų +tačiau te tegu tegul @@ -396,16 +405,16 @@ tuodu tuoju tuosiuose tuosius -turbĆ»t -tĆ ja -tĆ jĆ -tĆ jĆ” -tĆ sias -tĆøjĆø -tĆ»las -uĆ¾ -uĆ¾tat -uĆ¾vis +turbūt +tąja +tąją +tąjį +tąsias +tųjų +tūlas +už +užtat +užvis va vai viduj @@ -415,138 +424,60 @@ vienas vienokia vienoks vietoj -virĆ° -virĆ°uj -virĆ°um +virš +viršuj +viršum vis -vis dĆ«lto +vis dėlto visa visas visgi visokia visoks vos -vĆ«l -vĆ«lgi -ypaĆØ -Ć” -Ć”kypai -Ć”striĆ¾ai -Ć°alia -Ć°e -Ć°i -Ć°iaisiais -Ć°iajai -Ć°iajam -Ć°iajame -Ć°iapus -Ć°iedvi -Ć°ieji -Ć°iesiems -Ć°ioji -Ć°iojo -Ć°iojoje -Ć°iokia -Ć°ioks -Ć°iosiomis -Ć°iosioms -Ć°iosios -Ć°iosiose -Ć°is -Ć°isai -Ć°it -Ć°ita -Ć°itas -Ć°itiedvi -Ć°itokia -Ć°itoks -Ć°ituodu -Ć°iuodu -Ć°iuoju -Ć°iuosiuose -Ć°iuosius -Ć°iĆ ja -Ć°iĆ jĆ -Ć°iĆ sias -Ć°iĆøjĆø -Ć°tai -Ć°Ć”jĆ” -Ć¾emiau -a -aÅ” -bus -buvo -bÅ«ti -bÅ«tų -d -dabar -darbo -daryti -daug -daugiau -daugiausia -dažnai -dienÄ - -dėl -gali -gauna -gauti -iÅ” -jam -jau -jie -jo -jos -jÄÆ -jų -kam -kartÄ -klausimas -klausti -kovo -kurie -kurios -kurių -labai -lietuva -lietuvoje -lietuvos -m -mažai -mažas -mažiau -metais -metu -metus -metų -mÅ«sų -negali -niekada 
-niekas -nėra -pasakė -praÅ”au -prieÅ” -r -reikia -sakyti -sakė -taip pat -tačiau -tikrai -to -todėl -tuo -turi -turėjo -už -val -visi -yra -čia -ÄÆ -Å”alia -Å”alies -Å”ios -žmonių \ No newline at end of file +vėl +vėlgi +ypač +į +įkypai +įstrižai +šalia +še +ši +šiaisiais +šiajai +šiajam +šiajame +šiapus +šiedvi +šieji +šiesiems +šioji +šiojo +šiojoje +šiokia +šioks +šiosiomis +šiosioms +šiosios +šiosiose +šis +šisai +šit +šita +šitas +šitiedvi +šitokia +šitoks +šituodu +šiuodu +šiuoju +šiuosiuose +šiuosius +šiąja +šiąją +šiąsias +šiųjų +štai +šįjį +žemiau \ No newline at end of file From 96bfa436c0006e222975c75df4d82209e9914e24 Mon Sep 17 00:00:00 2001 From: jtotoole Date: Mon, 5 Apr 2021 17:33:10 -0400 Subject: [PATCH 048/175] fix tests --- apps/common/tests/python/mediawords/languages/test_lt.py | 2 +- apps/common/tests/python/mediawords/languages/test_pt.py | 2 +- apps/common/tests/python/mediawords/languages/test_sv.py | 2 +- apps/common/tests/python/mediawords/languages/test_zh.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/common/tests/python/mediawords/languages/test_lt.py b/apps/common/tests/python/mediawords/languages/test_lt.py index 87aa142519..5ecb4a916a 100644 --- a/apps/common/tests/python/mediawords/languages/test_lt.py +++ b/apps/common/tests/python/mediawords/languages/test_lt.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() - assert "buvo" in stop_words + assert "dargi" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): diff --git a/apps/common/tests/python/mediawords/languages/test_pt.py b/apps/common/tests/python/mediawords/languages/test_pt.py index b385cad73b..cec4e0c41a 100644 --- a/apps/common/tests/python/mediawords/languages/test_pt.py +++ b/apps/common/tests/python/mediawords/languages/test_pt.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def test_stop_words_map(self): stop_words = 
self.__tokenizer.stop_words_map() - assert "fãs" in stop_words + assert "abre" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): diff --git a/apps/common/tests/python/mediawords/languages/test_sv.py b/apps/common/tests/python/mediawords/languages/test_sv.py index 48c217e7e3..cd9f8a7fa4 100644 --- a/apps/common/tests/python/mediawords/languages/test_sv.py +++ b/apps/common/tests/python/mediawords/languages/test_sv.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() - assert "vår" in stop_words + assert "åttio" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): diff --git a/apps/common/tests/python/mediawords/languages/test_zh.py b/apps/common/tests/python/mediawords/languages/test_zh.py index 9e21730b36..ae0b55ee59 100644 --- a/apps/common/tests/python/mediawords/languages/test_zh.py +++ b/apps/common/tests/python/mediawords/languages/test_zh.py @@ -17,7 +17,7 @@ def test_sample_sentence(self): def test_stop_words_map(self): stop_words = self.__tokenizer.stop_words_map() - assert "不勝" in stop_words + assert "不起" in stop_words assert "not_a_stopword" not in stop_words def test_stem(self): From d05b0462957e0430411fb4bc00440d11ceb5562c Mon Sep 17 00:00:00 2001 From: jtotoole Date: Tue, 6 Apr 2021 09:51:05 -0400 Subject: [PATCH 049/175] another pass at sorting --- .../mediawords/languages/en/en_stop_words.txt | 28 - .../mediawords/languages/fi/fi_stop_words.txt | 170 +-- .../mediawords/languages/hi/hi_stop_words.txt | 346 +++--- .../mediawords/languages/hu/hu_stop_words.txt | 400 +++--- .../mediawords/languages/ja/ja_stop_words.txt | 4 +- .../mediawords/languages/lt/lt_stop_words.txt | 81 +- .../mediawords/languages/ro/ro_stop_words.txt | 170 +-- .../mediawords/languages/ru/ru_stop_words.txt | 1104 ++++++++--------- .../mediawords/languages/sv/sv_stop_words.txt | 144 +-- .../mediawords/languages/tr/tr_stop_words.txt | 133 +- 
.../mediawords/languages/zh/zh_stop_words.txt | 70 +- 11 files changed, 1344 insertions(+), 1306 deletions(-) diff --git a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt index 318b310ec3..b69d36a7f3 100644 --- a/apps/common/src/python/mediawords/languages/en/en_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/en/en_stop_words.txt @@ -141,7 +141,6 @@ accordingly across act actually -ad added adj affected @@ -156,7 +155,6 @@ ah ahead ain't aint -al all almost alone @@ -198,7 +196,6 @@ aren't arent arise around -arpa as aside ask @@ -238,7 +235,6 @@ by c c'mon c's -ca call came can @@ -250,7 +246,6 @@ case cases cause causes -cc certain certainly cmon @@ -305,7 +300,6 @@ during e e.g each -edu effect eg eight @@ -333,15 +327,12 @@ exactly example except f -fi fifteen fifth fifty -fify find first five -fix for forever forth @@ -374,7 +365,6 @@ going gone got gotten -gov h had hadn't @@ -429,12 +419,9 @@ how'll how's howbeit however -hr -ht htm html http -hu hundred i i'd @@ -443,13 +430,9 @@ i'm i've i.e i.e. 
-id ie if -ii -il ill -im in inasmuch inc @@ -460,7 +443,6 @@ indicated indicates insofar instead -int into is isn @@ -481,12 +463,10 @@ j join just k -ke keep keeps kept kind -km knew know known @@ -536,8 +516,6 @@ might might've mightn't mightnt -mil -mill mine minus miss @@ -549,7 +527,6 @@ move mr mrs much -mug must must've mustn't @@ -558,7 +535,6 @@ my myse" myself myse” -mz n name namely @@ -614,8 +590,6 @@ only onto opens or -ord -org other others otherwise @@ -762,7 +736,6 @@ states still stop strongly -sub substantially successfully such @@ -870,7 +843,6 @@ until unto up upon -ups us use used diff --git a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt index aa2aa23aec..d1457203fe 100644 --- a/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/fi/fi_stop_words.txt @@ -74,9 +74,9 @@ avuksi avulla avun avutta -edellä edelle edelleen +edellä edeltä edemmäs edes @@ -96,7 +96,6 @@ ellet ellette emme en -enää enemmän eniten ennen @@ -106,22 +105,23 @@ ensimmäiseksi ensimmäisen ensimmäisenä ensimmäiset -ensimmäisiä ensimmäisiksi ensimmäisinä +ensimmäisiä ensimmäistä ensin entinen entisen entisiä -entistä entisten -eräät -eräiden -eräs +entistä +enää eri erittäin erityisesti +eräiden +eräs +eräät esi esiin esillä @@ -130,9 +130,9 @@ et eteen etenkin etessa -että ette ettei +että haikki halua haluaa @@ -151,23 +151,13 @@ halusitte halusivat halutessa haluton -hän -häneen -hänellä -hänelle -häneltä -hänen -hänessä -hänestä -hänet -häntä he hei heidän heidät heihin -heillä heille +heillä heiltä heissä heistä @@ -180,10 +170,6 @@ hitaasti hoikein huolimatta huomenna -hyvä -hyvää -hyvät -hyviä hyvien hyviin hyviksi @@ -193,15 +179,26 @@ hyvin hyvinä hyvissä hyvistä +hyviä +hyvä +hyvät +hyvää +hän +häneen +hänelle +hänellä +häneltä +hänen +hänessä +hänestä +hänet +häntä ihan ilmeisesti itse -itseään itsensä +itseään ja -jää 
-jälkeen -jälleen jo johon joiden @@ -259,6 +256,9 @@ joutumaan joutuu joutuvat juuri +jälkeen +jälleen +jää kahdeksan kahdeksannen kahdella @@ -299,18 +299,18 @@ kehen keiden keihin keiksi -keillä keille +keillä keiltä keinä keissä keistä -keitä keitten +keitä keneen keneksi -kenellä kenelle +kenellä keneltä kenen kenenä @@ -325,8 +325,8 @@ kerta kertaa keskellä kesken -ketä ketkä +ketä kiitos kohti koko @@ -357,34 +357,34 @@ kuten kuuden kuusi kuutta -kyllä kylliksi +kyllä kymmenen kyse +liian +liki +lisäksi +lisää +lla +luo +luona lähekkäin -lähellä lähelle +lähellä läheltä lähemmäs lähes lähinnä lähtien läpi -liian -liki -lisää -lisäksi -lla -luo -luona mahdollisimman mahdollista me meidän meidät meihin -meillä meille +meillä meiltä meissä meistä @@ -407,17 +407,16 @@ mennessä mennyt menossa mihin -mikä -mikään -mikäli mikin miksi -millä +mikä +mikäli +mikään mille milloin milloinkan +millä miltä -minä minkä minne minua @@ -429,12 +428,13 @@ minussa minusta minut minuun +minä missä mistä -mitä -mitään miten mitkä +mitä +mitään moi molemmat mones @@ -464,34 +464,18 @@ muutaman muuten myöhemmin myös -myöskään myöskin +myöskään myötä -näiden -näihin -näiksi -näillä -näille -näiltä -näin -näinä -näissä -näissähin -näissälle -näissältä -näissästä -näistä -näitä -nämä ne neljä -neljää neljän +neljää niiden niihin niiksi -niillä niille +niillä niiltä niin niinä @@ -512,6 +496,22 @@ noita nro nuo nyt +näiden +näihin +näiksi +näille +näillä +näiltä +näin +näinä +näissä +näissähin +näissälle +näissältä +näissästä +näistä +näitä +nämä ohi oikea oikealla @@ -560,7 +560,6 @@ on onkin onko ovat -päälle paikoittain paitsi pakosti @@ -569,13 +568,13 @@ paremmin parempi parhaillaan parhaiten -peräti perusteella +peräti pian pieneen pieneksi -pienellä pienelle +pienellä pieneltä pienempi pienestä @@ -583,6 +582,7 @@ pienin poikki puolesta puolestaan +päälle saakka sadam sama @@ -610,12 +610,11 @@ siis siitä sijaan siksi -sillä sille silloin -siltä +sillä silti -sinä +siltä sinne 
sinua sinulla @@ -626,9 +625,10 @@ sinussa sinusta sinut sinuun -sitä +sinä siten sitten +sitä ssa sta suoraan @@ -641,33 +641,23 @@ suurten taa taas taemmas -tähän tahansa tai takaa takaisin takana takia -täksi tallä -tälle -tältä -tämä -tämän -tänä tapauksessa tarpeeksi -tässä -tästä -tätä tavalla tavoitteena te teidän teidät teihin -teillä teille +teillä teiltä teissä teistä @@ -696,6 +686,16 @@ tuona tuossa tuosta tuotä +tähän +täksi +tälle +tältä +tämä +tämän +tänä +tässä +tästä +tätä vaan vai vaikka diff --git a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt index 328ec01437..0682f8985c 100644 --- a/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/hi/hi_stop_words.txt @@ -7,265 +7,265 @@ # https://sites.google.com/site/kevinbouge/stopwords-lists # (Lightly edited to remove words in the original lists that are actually meaningful) -न -व -अत -अप -अब -आज -आप -इन -इस -उन -उस -एक -एस -ओर -और -कइ -कई -कम -कल -का -कि -की -के -को -गई -गए -जब -जा -जो -तक -तब -तो -था -थि -थी -थे -दो -ना -ने -पर -पे -भि -भी -मे -मै -यह -या -ये -वह -वे -से -सो -हि -ही -हे -है -हो -अदि -अभि -अभी -आदि -इसि -इसी -इसे -उसि -उसी -उसे -ऊपर -एवं -एसे -ऐसा -ऐसे -कभी -कहा -किए -कुछ -कुल -कोइ -कोई -कोन -कौन -गया -गयी -गये -जिन -जिस -तथा -तरह -तिन -तिस -तुम -दूर -फिर -बनि -बनी -बहि -बाद -बीच -मगर -में -यदि -यहि -यही -यिह -रहा -रहे -लिए -वुह -संग -सभि -सभी -समय -साथ -हुअ -हुआ -हुइ -हुई -हुए -हें -हैं -के अंदर +अत अथवा +अदि अन्य +अप अपना अपनि अपनी अपने +अब +अभि +अभी +आज +आदि +आप +इंहिं +इंहें +इंहों +इतयादि +इत्यादि +इन इनका इनके +इन्हीं +इन्हें +इन्हों +इस इसका इसकि इसकी इसके +इसमें +इसि +इसी +इसे +उंहिं +उंहें +उंहों उच्च +उत्तर +उन उनका उनकि उनकी उनके उनको +उन्हीं +उन्हें +उन्हों +उस उसकी उसके +उसि +उसी +उसे +ऊपर +एक +एवं +एस +एसे +ऐसा +ऐसे +ओर +और +कइ +कई +कभी +कम करता करते करना करने करें +कल कहते +कहा +का काफि काफ़ी +कि +किंहें +किंहों +किए +कितना 
+किन्हें +किन्हों किया किसी किसे +की +कुछ +कुल +के +को +कोइ +कोई +कोन +कोनसा +कौन +कौनसा +गई +गए +गया +गयी +गये +जब जहाँ जहां +जा जाता जाती जाते जाने +जिंहें +जिंहों +जितना जिधर +जिन +जिन्हें +जिन्हों +जिस +जिसमें +जिससे जिसे जीधर जेसा जेसे जैसा जैसे +जो +तक +तथा +तब +तरह +तिंहें +तिंहों +तिन +तिन्हें +तिन्हों +तिस तिसे +तुम +तो +था +थि +थी +थे +दबारा +दवारा दिया +दुसरा +दुसरे +दूर +दूसरे +दो +दोनों +द्वारा +न नहिं नहीं +ना निचे +निहायत नीचे +ने +पर +परंतु पहले पुरा पूरा पूरे +पे +प्रति +फिर बड़ा बड़े +बनि +बनी +बहि बहुत +बाद बाला बाहर +बिलकुल +बीच +भि भितर +भी भीतर +मगर मध्य मानो +मे +में +मै +यदि +यह यहाँ यहां +यहि +यही +या +यिह +ये रखें +रवासा रहती +रहा +रहे +ऱ्वासा +लिए लिया लिये लेकर +लेकिन +व +वगेरह वर्ग +वह वहाँ वहां वहिं वहीं वाले +वुह +वे +वग़ैरह +संग सकता सकती सकते सबसे +सभि +सभी +समय +साथ +साबुत सारा +से +सो +स्थान +हि +ही +हुअ +हुआ +हुइ +हुई +हुए हुये +हे +हें +है +हैं +हो होता होति होती होते होना होने -इंहिं -इंहें -इंहों -इसमें -उंहिं -उंहें -उंहों -उत्तर -कितना -कोनसा -कौनसा -जितना -जिससे -दबारा -दवारा -दुसरा -दुसरे -दूसरे -दोनों -परंतु -प्रति -रवासा -लेकिन -वगेरह -वग़ैरह -साबुत -स्थान -इतयादि -इन्हीं -इन्हें -इन्हों -उन्हीं -उन्हें -उन्हों -किंहें -किंहों -जिंहें -जिंहों -जिसमें -तिंहें -तिंहों -द्वारा -निहायत -बिलकुल -ऱ्वासा -इत्यादि -किन्हें -किन्हों -जिन्हें -जिन्हों -तिन्हें -तिन्हों \ No newline at end of file +के \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt index eb4446fcca..da87882c7a 100644 --- a/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/hu/hu_stop_words.txt @@ -12,17 +12,11 @@ addig ahhoz ahogy ahol -akár aki akik akkor -alá -alád -alájuk -alám -alánk +akár alapján -alátok alatt alatta alattad @@ -30,6 +24,12 @@ alattam alattatok alattuk alattunk +alá +alád +alájuk +alám +alánk +alátok alól alóla alólad @@ -37,9 +37,6 @@ alólam alólatok alóluk 
alólunk -által -általában -ám amely amelybol amelyek @@ -49,50 +46,46 @@ amelyet amelyik amelynek ami -amíg amikor amit amolyan amott +amíg annak annál arra arról -át attól az -azért aznap azok azokat azokba azokban azokból -azokért azokhoz azokig -azokká azokkal +azokká azoknak azoknál azokon azokra azokról azoktól +azokért azon azonban azonnal azt aztán azután -azzá azzal +azzá +azért bal balra ban -bár -bárcsak -bármilyen be belé beléd @@ -114,6 +107,9 @@ bennem bennetek bennük bennünk +bár +bárcsak +bármilyen búcsú cikk cikkek @@ -128,10 +124,7 @@ ebbe ebben ebből eddig -egész -egészen egy -egyéb egyebek egyebet egyedül @@ -143,32 +136,34 @@ egyik egymás egyre egyszerre +egyéb együtt +egész +egészen ehhez ekkor el -elé -eléd -elég eleinte -eléjük -elém -elénk -elétek -éljen ellen -ellenére ellenes elleni +ellenére elmondta -elõ -elõször -elõtt elsõ első elsők elsősorban elsőt +elé +eléd +elég +eléjük +elém +elénk +elétek +elõ +elõször +elõtt elő előbb elől @@ -188,96 +183,72 @@ előttük előttünk előző emilyen -én engem ennek -ennél ennyi +ennél enyém -éppen erre erről -érte -érted -értem -értetek -értük -értünk -és esetben ettől -év -évben -éve -évek -éves -évi -évvel ez ezek ezekbe ezekben ezekből ezeken -ezekért ezeket ezekhez ezekig -ezekké ezekkel +ezekké ezeknek ezeknél ezekre ezekről ezektől +ezekért ezen ezentúl ezer -ezért ezret ezt ezután -ezzé ezzel +ezzé +ezért fel -fél fele -felé felek felet felett +felé fent fenti +fél fölé gyakran ha halló hamar hanem -hány -hányszor harmadik harmadikat -hármat harminc -három hat -hát -hátha hatodik hatodikat hatot -hátulsó hatvan helyett -hét hetedik hetediket hetet hetven -hiába hirtelen hiszen +hiába hogy hogyan hol @@ -292,14 +263,21 @@ hozzám hozzánk hozzátok hurrá -húsz huszadik +hány +hányszor +hármat +három +hát +hátha +hátulsó +hét +húsz ide ide-оda idén igazán igen -így ill ill. 
illetve @@ -312,9 +290,9 @@ ismét ison itt jelenleg -jó jobban jobbra +jó jól jólesik jóval @@ -323,17 +301,8 @@ kell kellene kellett kelljen -képest -kérem keressünk keresztül -kérlek -kész -késő -később -későn -két -kétszer ketten kettő kettőt @@ -343,24 +312,22 @@ kiben kiből kicsit kicsoda -kié -kiért kihez kik kikbe kikben kikből kiken -kikért kiket kikhez -kikké kikkel +kikké kiknek kiknél kikre kikről kiktől +kikért kilenc kilencedik kilencediket @@ -373,18 +340,29 @@ kire kiről kit kitől -kivé kivel -kívül +kivé +kié +kiért korábban +képest +kérem +kérlek +kész +késő +később +későn +két +kétszer +kívül körül köszönhetően köszönöm közben -közé közel -közepén közepesen +közepén +közé között közül külön @@ -402,9 +380,9 @@ lehetett lehetőleg lehetőség lenne +lenni lennék lennének -lenni lesz leszek lesznek @@ -418,37 +396,19 @@ ma maga magad magam -magát magatokat magukat magunkat +magát mai majd majdnem manapság -már -más -másik -másikat -másnap -második -másodszor -mások -másokat -mást meg -még megcsinál megcsinálnak megint -mégis megvan -mellé -melléd -melléjük -mellém -mellénk -mellétek mellett mellette melletted @@ -456,6 +416,12 @@ mellettem mellettetek mellettük mellettünk +mellé +melléd +melléjük +mellém +mellénk +mellétek mellől mellőle mellőled @@ -479,25 +445,23 @@ miattunk mibe miben miből -miért -míg mihez mik mikbe mikben mikből miken -mikért miket mikhez -mikké mikkel +mikké miknek miknél mikor mikre mikről miktől +mikért milyen min mind @@ -511,15 +475,32 @@ mindenütt mindig mindketten minek -minél mint mintha +minél mire miről mit mitől -mivé mivel +mivé +miért +mondta +most +mostanáig +már +más +másik +másikat +másnap +második +másodszor +mások +másokat +mást +még +mégis +míg mögé mögéd mögéjük @@ -539,38 +520,24 @@ mögülem mögületek mögülük mögülünk -mondta -most -mostanáig múltkor múlva na nagy nagyobb nagyon -nála -nálad -nálam -nálatok -náluk -nálunk naponta napot ne -négy negyedik negyediket -négyet negyven -néha -néhány neked nekem neki 
nekik nektek nekünk -nélkül nem nemcsak nemrég @@ -580,78 +547,51 @@ nyolcadik nyolcadikat nyolcat nyolcvan +nála +nálad +nálam +nálatok +náluk +nálunk +négy +négyet +néha +néhány +nélkül o -õ oda ok -õk -õket olyan -ön -önbe -önben -önből -önért -önhöz onnan -önnek -önnel -önnél -önök -önökbe -önökben -önökből -önökért -önöket -önökhöz -önökkel -önöknek -önöknél -önökön -önökre -önökről -önöktől -önön -önre -önről -önt -öntől -össze -öt -óta -ötödik -ötödiket -ötöt ott -ötven -pár pedig -például persze -rá -rád +pár +például rajta rajtad rajtam rajtatok rajtuk rajtunk +rendben +rosszul +rá +rád rájuk rám ránk rátok régen régóta -rendben részére -rögtön róla rólad rólam rólatok róluk rólunk -rosszul +rögtön s saját se @@ -662,19 +602,14 @@ semmiség senki soha sok -sokáig sokan sokat sokkal sokszor +sokáig során stb. -számára -száz -századik -százat szemben -szépen szerint szerinte szerinted @@ -683,49 +618,54 @@ szerintetek szerintük szerintünk szervusz -szét szinte +számára +száz +századik +százat +szépen +szét szíves szívesen szíveskedjék sőt talán tavaly -távol te -téged tegnap tegnapelőtt tehát tele teljes -tényleg tessék ti tied titeket -tíz tizedik tizediket tizenegy tizenegyedik -tizenhárom tizenhat +tizenhárom tizenhét -tizenkét tizenkettedik tizenkettő tizenkilenc -tizennégy +tizenkét tizennyolc +tizennégy tizenöt tizet +tovább +további +továbbá +távol +téged +tényleg +tíz több többi többször -tovább -továbbá -további túl tőle tőled @@ -733,24 +673,17 @@ tőlem tőletek tőlük tőlünk -úgy ugyanakkor ugyanez ugyani ugye -úgyis -úgynevezett -új -újabb -újra -úr urak uram urat -után -utána utoljára utolsó +után +utána vagy vagyis vagyok @@ -766,9 +699,6 @@ valamint való van vannak -végén -végre -végül vele veled velem @@ -786,6 +716,76 @@ volt voltak voltam voltunk +végre +végén +végül +által +általában +ám +át +éljen +én +éppen +érte +érted +értem +értetek +értük +értünk +és +év +évben +éve +évek +éves +évi +évvel +így +óta +õ +õk +õket +ön +önbe +önben +önből 
+önhöz +önnek +önnel +önnél +önre +önről +önt +öntől +önért +önök +önökbe +önökben +önökből +önöket +önökhöz +önökkel +önöknek +önöknél +önökre +önökről +önöktől +önökért +önökön +önön +össze +öt +ötven +ötödik +ötödiket +ötöt +úgy +úgyis +úgynevezett +új +újabb +újra +úr ő ők őket diff --git a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt index 944e44788e..6ec40c9b08 100755 --- a/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ja/ja_stop_words.txt @@ -1,9 +1,9 @@ -# # This is a stop word list for the Japanese language. -# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: +# # https://github.com/stopwords/japanese-stopwords/blob/master/data/japanese-stopwords.txt # Lucene's stopwords_ja.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) $ % diff --git a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt index 1cf21e2ae6..5db1a5f6ef 100644 --- a/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/lt/lt_stop_words.txt @@ -1,12 +1,12 @@ -# # This is a stop word list for the Lithuanian language. 
-# # Sources: +# +# auto-generated sources # http://www.filewatcher.com/p/punbb-1.2.16.tbz.620109/www/punbb/lang/Lithuanian/stopwords.txt.html # https://github.com/stopwords-iso/stopwords-lt/blob/master/stopwords-lt.txt -# auto-generated sources -# + +a abi abidvi abiejose @@ -69,10 +69,23 @@ bent bet betgi beveik +bus +buvo +būti +būtų +d +dabar dar +darbo dargi +daryti +daug +daugiau +daugiausia daugmaž +dažnai deja +dieną dėka dėl dėlei @@ -82,8 +95,11 @@ et gal galbūt galgi +gali gan gana +gauna +gauti gi greta idant @@ -99,9 +115,12 @@ jaisiais jajai jajam jajame +jam +jau jei jeigu ji +jie jiedu jiedvi jieji @@ -109,12 +128,14 @@ jiesiems jinai jis jisai +jo jog joji jojo jojoje jokia joks +jos josiomis josioms josios @@ -133,18 +154,22 @@ jus jąja jąją jąsias +jį jįjį -jųjų jūs jūsiškis jūsiškė jūsų +jų +jųjų kad kada kadangi kai kaip kaipgi +kam +kartą kas katra katras @@ -166,24 +191,35 @@ kita kitas kitokia kitoks +klausimas +klausti kodėl kokia koks kol kolei kone +kovo kuomet kur kurgi kuri +kurie kuriedvi +kurios kuris kuriuodu +kurių +labai lai +lietuva +lietuvoje +lietuvos lig ligi link lyg +m man manaisiais manajai @@ -218,9 +254,16 @@ manąsias manęs manųjų mat +mažai +mažas maždaug +mažiau mažne mes +metais +metu +metus +metų mudu mudvi mumis @@ -235,6 +278,7 @@ nagi ne nebe nebent +negali negi negu nei @@ -247,9 +291,12 @@ net netgi netoli neva +niekada +niekas nors nuo nė +nėra o ogi oi @@ -260,6 +307,7 @@ palaipsniui palei pas pasak +pasakė paskos paskui paskum @@ -283,13 +331,18 @@ pirm pirma pirmiau po +prašau prie prieš priešais pro pusiau +r rasi +reikia rodos +sakyti +sakė sau savaisiais savajai @@ -332,6 +385,7 @@ tad tai taigi taip +taip pat taipogi taisiais tajai @@ -386,8 +440,11 @@ ties tiesiems tiesiog tik +tikrai tikriausiai tiktai +to +todėl toji tojo tojoje @@ -401,22 +458,26 @@ tosioms tosios tosiose tu +tuo tuodu tuoju tuosiuose tuosius turbūt +turi +turėjo tąja tąją tąjį tąsias -tųjų tūlas +tųjų už užtat užvis va vai +val viduj 
vidury vien @@ -432,16 +493,20 @@ vis dėlto visa visas visgi +visi visokia visoks vos vėl vėlgi ypač +yra +čia į įkypai įstrižai šalia +šalies še ši šiaisiais @@ -457,6 +522,7 @@ ypač šiojoje šiokia šioks +šios šiosiomis šiosioms šiosios @@ -480,4 +546,5 @@ ypač šiųjų štai šįjį -žemiau \ No newline at end of file +žemiau +žmonių \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt index cae4ee0cf7..a426c284e6 100644 --- a/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/ro/ro_stop_words.txt @@ -1,9 +1,9 @@ # A Romanian stop word list. -# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: # # http://snowball.tartarus.org/otherapps/romanian/intro.html (romanian2.tgz) # https://github.com/stopwords-iso/stopwords-ro/blob/master/stopwords-ro.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) a abia @@ -78,14 +78,14 @@ altă alţi alţii am -amândoi -amândouă -amânduror -amândurora ambele ambelor ambii ambilor +amândoi +amândouă +amânduror +amândurora anume apoi aproape @@ -102,22 +102,22 @@ astăzi asupra atare atat -atât atata -atâta atatea -atâtea atatia -atâtor -atâtora -atâţi -atâţia ati atit atita atitea atitia atunci +atât +atâta +atâtea +atâtor +atâtora +atâţi +atâţia au avea aveai @@ -141,25 +141,14 @@ c ca cam cand -când capat care careia carora caruia cat -cât -câte -câteva -câtor -câtora -câtorva catre -câtva -câtă caut -câţi -câţiva ce cea cealaltă @@ -183,19 +172,14 @@ chiar ci cinci cind -cînd cine cineva cit -cît cita cite -cîte citeva citi citiva -cîtva -cîţi conform contra cu @@ -206,6 +190,22 @@ cumva curând curînd cutare +când +cât +câte +câteva +câtor +câtora +câtorva +câtva +câtă +câţi +câţiva +cînd +cît +cîte +cîtva +cîţi că căci cărei @@ -223,9 +223,9 @@ dată dau de deasupra -decât deci 
decit +decât degraba deja deoarece @@ -294,68 +294,31 @@ ia iar ieri ii -îi il -îl imi -îmi -împotriva in -în inainte -înainte -înaintea inapoi inca -încât incit -încît -încotro -încă -îndoit insa -însele -însene -însevă -înseşi -înspre -însumi -însutit -însuşi -însuţi -însă -însămi -însăţi -însăşi -întâia -întâiul intr intre -între -întreit -întrucât -întrucît -înşine -înşivă -înşişi isi iti -îşi -îţi j k l la -lângă le li -lîngă lor lui +lângă +lîngă m ma mai -mâine mare mea mei @@ -364,7 +327,6 @@ mereu meu mi mie -mîine mine mod mult @@ -374,6 +336,8 @@ multi multă mulţi mulţumesc +mâine +mîine mă n ne @@ -407,20 +371,20 @@ om opt or ori -oricând oricare +orice +oricine +oricui +oricum +oricând oricât oricâte oricâtor oricâtora oricâtă oricâţi -orice oricînd -oricine oricît -oricui -oricum oricărei oricăreia oricăror @@ -431,7 +395,6 @@ oriunde oţi p pai -până parte patra patru @@ -441,7 +404,6 @@ pentru peste pic pina -pînă plus poate pot @@ -464,6 +426,8 @@ puţina puţine puţini puţină +până +pînă r rog s @@ -479,10 +443,7 @@ sie sieşi sine sint -sînt sintem -sîntem -sînteţi spate spre spune @@ -494,6 +455,9 @@ sunteţi sus sutime sută +sînt +sîntem +sînteţi să săi său @@ -573,12 +537,43 @@ zece zero zi zice -şapte -şase -şi -ţi +îi +îl +îmi +împotriva +în +înainte +înaintea +încotro +încât +încît +încă +îndoit +însele +însene +însevă +înseşi +înspre +însumi +însutit +însuşi +însuţi +însă +însămi +însăşi +însăţi +între +întreit +întrucât +întrucît +întâia +întâiul +înşine +înşivă +înşişi +îşi +îţi ăia -ţie ăla ălea ăleia @@ -587,5 +582,10 @@ zice ăsta ăstea ăstuia +ăştia +şapte +şase +şi ştiu -ăştia \ No newline at end of file +ţi +ţie \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt index f3195998bb..f4721e80e3 100644 --- a/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt +++ 
b/apps/common/src/python/mediawords/languages/ru/ru_stop_words.txt @@ -1,8 +1,8 @@ # This is a stop word list for the Russian language. -# (Lightly edited to remove words in the original lists that are actually meaningful) # -# Source: # https://github.com/stopwords-iso/stopwords-ru/blob/master/stopwords-ru.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) +# Source: adriver amp @@ -20,690 +20,690 @@ sid style www А -Б -В -Г -Д -Е -И -К -Л -М -Н -О -П -С -У -Ю -Я а -в -г -д -е -ж -и -й -к -м -о -с -т -у -х -я -с кем +августа +алло АО -Во -Вы -Да -До -За -Из -Их -Мы -НА -НЕ -На -Не -Ни -Но -Ну -Об -Он -От -По -То -бы -во -вы -го -да -до -ее -ей -ею -её -же -за -из -им -их -ли -мы -на -не -ни -но -ну -нх -об -он -от -по -со -та -те -то -ту -ты -уж +апрель +апреля +Б Без -Вот -Все -Для -Его -Еще -Как -Кто -Мне -Нет -Нью -Она -Они -При -Так -Там -Тем -Уже -Что -Эта -Эти -Это без -был -вам -вас -ваш -вид -вон -вот -все -всю -вся -всё -где -год -дал -два -две -для -дни -дня -его -ему -еще -ещё -иди -или -ими -имя -как -кем -кто -лет -мая -мне -мог -мож -мои -мой -моя -моё -над -нам -нас -наш -нее -ней -нем -нет -нею -неё -ним -них -оба -она -они -оно -под -пол -пор -при -про -раз -ряд -сам -сих -сто -так -там -тем -тех -той -том -тот -тою -три -тут -уже -час -чем -что -эта -эти -это -эту -все еще -Ведь -Даже -Если -Есть -ИТАР -Один -Пока -ТАСС -Хотя -Этот -алло +близко блог +Более +более +больше +будем +будет +будете +будешь +будто буду +будут будь +бы +бывает +бывший бывь +был была были было +быстро быть +В +в +важная +важное +важные +важный +вам вами +вас +ваш ваша ваше ваши +вверх +вдали +вдруг +Ведь ведь +везде весь +весьма +взгляд +взять +вид виде +видел +видеть +видимо +включая +власть +вместе +вместо вниз -вряд -всей -всем -всех -всею -выше -глаз -года -году -годы -дает -даже -дать -двух -день -дней -днях -едва -если -есть -жить -зато -знаю -идет -идти -имел -июля -июня -кого -кому -куда -либо -лицо -лишь -люди -мало -мене 
-меня -мимо -мира -мире -мной -мною -могу -мочь -нами -наша -наше -наши -него -нему -ниже -ними -ныне -один -одна -одно -одну -отец -пока -пора -пути -путь -пяти -пять -раза -сама -сами -само -саму -свое -свои -свой -свою -себе -себя -семь -срок -стал -таки -твои -твой -твоя -твоё -тебе -тебя -теми -того -тоже -тому -тонн -трех -туда -хоть -хотя -часа -чаще -чего -чему -чтоб -чуть -этим -этих -этой -этом -этот -явно -Более -Здесь -Когда -Кроме -Между -Может -Можно -После -Потом -Среди -Таким -Тогда -Через -Чтобы -более -будем -будет -будто -будут -вверх -вдали -вдруг -везде -взять -видел внизу вновь +Во +во вовсе -время -вроде -всеми -всему -всюду -даром -делал -делаю -друго -ждать -занят -затем -зачем -здесь -знает -знать -знают -имеет -имени -иметь -имеют -иначе -итоге -какая -какие -каким -каких -какой -когда -конец -конца -конце -кроме -любая -любой -людей -между -менее -места -место -месяц -метра -минут -много -могла -могли -могут -может -можно -можхо -назад -найти -нашей -наших -никак -никто -ничто -нужно -одним -одной -одном -около -опять -ответ -очень -перед -позже -пойти -после -потом -почти -прямо -пятая -пятый -разве -ранее -решил -рядом -самим -самих -самое -самой -самом -самые -самый -самым -самых -своей -своем -своим -своих -снова -собой -собою -сразу -среди -стала -стали -стало -стате -стать -стоит -столь -сумму -такая -также -такие -таким -таких -такое -такой -тобой -тобою -тогда -тысяч -уметь -часов -части -часто -часть -через -числе -число -чтобы -шесть -этими -этого -этому -якобы -хотел бы -может быть -Именно -Кстати -Многие -Однако -Почему -Правда -Причем -Рейтер -Сейчас -Теперь -Только -апрель -апреля -близко -больше -будете -будешь -бывает -бывший -быстро -важная -важное -важные -важный -весьма -взгляд -видеть -видимо -власть -вместе -вместо вокруг +вон вообще вопрос восемь +восьмой +Вот +вот +впервые вполне +Впрочем +впрочем +времена +времени +время +вроде +вряд +Все +все +все еще всегда +всей +всем +всеми +всему +всех +всею +встречи +всю 
+всюду +вся +всё вторая второй +Вы +вы +выше +Г +г +где +главное +главный +главным +глаз +го +говорил +говорит говоря +говорят +год +года +году +годы голова города городе группа группы +Д +д +Да +да давать +дает +Даже +даже +дал +далекий далеко дальше +даром +дать +два +две +двух +девятый девять +декабря делаем +делал делать +делаю +день +десятый десять +Для +для +дней +дни +дня +днях +До +до должен должна должно должны +должный дорога другая другие другим других +друго другое другой другом думать +Е +е +Его +его +едва +ее +ей +ему +Если +если +Есть +есть +Еще +еще +ещё +ею +её +ж +ждать +же +жить +За +за +занят +затем +зато +зачем заявил +Здесь +здесь +знает +знать значит +знаю +знают +И +и +идет +иди +идти +Из +из +или +им +имеет +имел +имени +Именно именно +иметь +имеют +ими +имя +иначе иногда +ИТАР +итоге +Их +их +июля +июня +й +К +к каждая +каждого каждое каждые каждый +кажется +Как +как +какая +какие +каким +каких +какой +кем +Когда +когда +кого +команда +команды +комната +кому +конец +Конечно +конечно +конца +конце +которая +которое +которой +котором +которую +которые +который +которым +которых +Кроме +кроме кругом кстате +Кстати кстати +Кто +кто +куда +Л +лет +ли +либо +лицо +лишь +любая +любой +людей +люди +М +м +мало +мая +Между +между +мене +менее меньше +меня +места +место +месяц месяца +месяцев +метра метров +миллион +мимо +минут минута +мира +мире +Мне +мне +Многие многие многих +много +мной +мною +мог +могла +могли +могу +могут +мож +Может +может +может быть +Можно +можно +можхо +мои +мой момент +мочь +моя +моё +Мы +мы +Н +НА +На +на +наверху +над +назад +найти +наконец +нам +нами +нас начала начале начать +наш +наша +наше нашего +нашей +наши +наших +НЕ +Не +не +него +недавно недели неделю +нее +ней +нем немало +нему +Нет +нет +нею +неё +Ни +ни нибудь +ниже +никак +никаких +никакой +никогда +никто никуда +ним +ними +них ничего +ничто +Но +но ноября +Ну +ну +нужно нужный +нх +ныне +Нью +О +о +Об +об +оба +области +образом обычно +Один +один +одна 
+однажды +Однако однако +одним +одно одного +одной +одном +одну +около +октября +Он +он +Она +она +Они +они +оно +опять +От +от +ответ +отец откуда отсюда +очень +очередь +П первая +первого первой первую первые первый первым первых +перед период +По +по поводу +под +подойди +позже +пойти +Пока +пока +пол +пор +пора +После +после +Потом +потом потому похоже +Почему почему +почти +Поэтому +Правда прежде +При +при +Причем +про просто +прямо +пути +путь +пятая +пяти +пятый +пять +раз +раза +разве +ранее раньше +Рейтер +решил решили решить +ряд +рядом +С +с +с кем +сам +сама +сами +самим самими +самих +само самого +самое +самой +самом самому +саму +самые +самый +самым +самых +свое своего +своей +своем своему +свои +своим своими +своих +свой +свою сделал +себе +себя +Сегодня +Сейчас сейчас +семь сидеть +сих скажем сказал скорее +снова +со +собой +собою совсем сообща +сразу +Среди +среди +срок +стал +стала +стали +стало станет +стате +стать +сто +стоит +столь +сумму +т +та +Так +так +такая +также +таки +такие +Таким +таким +таких такого +такое +такой +Там +там +ТАСС +твои +твой +твоя +твоё +те +тебе +тебя +Тем +тем +теми +Теперь теперь +тех +То +то +тобой +тобою +Тогда +тогда +того +тоже +той +Только только +том +тому +тонн +тот +тою третий +трех +три трудно +ту +туда +тут +ты +тысяч тысячи +У +у +уж +Уже +уже +уметь уровне +х хорошо +хотел бы хотеть +хоть +Хотя +хотя хочешь +час +часа +часов +части +часто +часть +чаще +чего +чем +чему +Через +через четыре +числе +число членов +Что +что +чтоб +Чтобы +чтобы +чуть шестой -января -Впрочем -Конечно -Поэтому -Сегодня -августа -включая -восьмой -впервые -впрочем -времена -времени -встречи -главное -главный -главным -говорил -говорит -говорят -далекий -девятый -декабря -десятый -должный -каждого -кажется -команда -команды -комната -конечно -которая -которое -которой -котором -которую -которые -который -которым -которых -месяцев -миллион -наверху -наконец -недавно -никаких -никакой -никогда -области -образом -однажды -октября 
-очередь -первого -подойди \ No newline at end of file +шесть +Эта +эта +Эти +эти +этим +этими +этих +Это +это +этого +этой +этом +этому +Этот +этот +эту +Ю +Я +я +явно +якобы +января \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt index 4252bf2627..5ad5904ccd 100644 --- a/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/sv/sv_stop_words.txt @@ -1,10 +1,10 @@ # This is a stop word list for the Swedish language. # -# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: # http://search.cpan.org/~creamyg/Lingua-StopWords-0.09/ # https://github.com/stopwords-iso/stopwords-sv/blob/master/stopwords-sv.txt # that one Swedish journalist +# (Lightly edited to remove words in the original lists that are actually meaningful) aderton adertonde @@ -15,38 +15,25 @@ allas allt alltid alltså -än andra andras annan annat -ännu -är artonde artonn -åt -åtminstone att -åtta -åttio -åttionde -åttonde av -även -båda -bådas bara bland borde bort borta -då +båda +bådas dag dagar dagarna dagen -där -därför de del delen @@ -66,6 +53,9 @@ dit ditt dock du +där +därför +då e efter eftersom @@ -84,10 +74,7 @@ ers ert ett ettusen -få fanns -får -fått fem femte femtio @@ -96,26 +83,22 @@ femton femtonde fick finnas -fjärde fjorton fjortonde +fjärde fler flera flesta -följande -för -före från fyra fyrtio fyrtionde -gå -gälla -gäller -gällt -går -gärna -gått +få +får +fått +följande +för +före genom gick gjorde @@ -124,6 +107,13 @@ god goda godare godast +gälla +gäller +gällt +gärna +gå +går +gått gör göra ha @@ -132,26 +122,26 @@ haft han hans har -här heller hellre hen henne hennes hit -högst hon honom hundra hundraen hundraett hur +här +högst i ibland icke idag -igår igen +igår imorgon in inför @@ -166,10 +156,10 @@ inte inuti ja jag -jämfört jo ju just +jämfört kan 
kanske knappast @@ -186,8 +176,6 @@ legat ligga ligger man -många -måste med mej mellan @@ -200,21 +188,14 @@ min mina mitt mittemot +mot +mycket +många +måste möjlig möjligen möjligt möjligtvis -mot -mycket -någon -någonting -något -några -nån -nånting -när -nästa -nåt nederst nej ner @@ -226,15 +207,24 @@ nittio nittionde nitton nittonde -nödvändig -nödvändiga -nödvändigt -nödvändigtvis nog noll nr nu nummer +när +nästa +någon +någonting +något +några +nån +nånting +nåt +nödvändig +nödvändiga +nödvändigt +nödvändigtvis och också ofta @@ -242,25 +232,14 @@ oftast olika olikt om -över -övermorgon -överst -övre på rakt -rätt redan +rätt sa -så -sådan -sådana -sådant sade -säga -säger sagt samma -sån sedan sen senare @@ -279,20 +258,27 @@ sista siste sitt sitta -själv -sjätte sju sjunde sjuttio sjuttionde sjutton sjuttonde +själv +sjätte ska skall skulle slutligen snart som +säga +säger +så +sådan +sådana +sådant +sån ta tack tar @@ -326,13 +312,8 @@ utanför ute va vad -väl -vänster -vänstra var vara -våra -vårat varför varifrån varit @@ -341,7 +322,6 @@ varken vars varsågod vart -vårt vem vems verkligen @@ -354,4 +334,24 @@ vilka vilkas vilken vilket -vill \ No newline at end of file +vill +väl +vänster +vänstra +våra +vårat +vårt +än +ännu +är +även +åt +åtminstone +åtta +åttio +åttionde +åttonde +över +övermorgon +överst +övre \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt index 0ead125f7d..4736179ead 100644 --- a/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/tr/tr_stop_words.txt @@ -1,14 +1,12 @@ -# This is a stop word list for the Turkish language. 
-# (Lightly edited to remove words in the original lists that are actually meaningful) -# # Sources: +# # http://nlp.ceng.fatih.edu.tr/blog/?p=101 # http://www.ranks.nl/stopwords/turkish.html # https://github.com/stopwords-iso/stopwords-tr/blob/master/stopwords-tr.txt +# (Lightly edited to remove words in the original lists that are actually meaningful) a acep -açıkçası adamakıllı adeta ait @@ -28,6 +26,7 @@ aslında aynen ayrıca az +açıkçası b bana bari @@ -36,8 +35,8 @@ bazý bazı bazıları bazısı -baţka başkası +baţka belki ben benden @@ -57,9 +56,6 @@ bir biraz birazdan birbiri -birçok -birçokları -birçoğu birden birdenbire biri @@ -70,11 +66,14 @@ birkaç birkaçı birkez birlikte +birçok +birçokları +birçoğu birþey birþeyi birşey -birţey birşeyi +birţey bitevi biteviye bittabi @@ -88,12 +87,6 @@ bizi bizim bizimki bizzat -böyle -böylece -böylecene -böylelikle -böylemesine -böylesine bu buna bunda @@ -107,27 +100,16 @@ buracıkta burada buradan burası +böyle +böylece +böylecene +böylelikle +böylemesine +böylesine büsbütün bütün c -ç -çabuk -çabukça -çeşitli -çok -çokça -çokları -çoklarınca -çokluk -çoklukla -çoğu -çoğun -çoğuna -çoğunca -çoğunlukla -çoğunu cümlesi -çünkü d da daha @@ -160,8 +142,8 @@ doksan dokuz dolayı dolayısıyla -dört doğru +dört e edecek eden @@ -206,19 +188,18 @@ gayet gayetle gayri gayrı -geçende -geçenlerde gelgelelim gene -gerçi gerek +gerçi +geçende +geçenlerde gibi gibilerden gibisinden göre h hakeza -hâlâ halbuki halen halihazırda @@ -250,9 +231,8 @@ hiçbir hiçbiri hiçbirine hiçbirini +hâlâ i -için -içinde iken iki ila @@ -272,10 +252,11 @@ itibarıyla iyi iyice iyicene +için +içinde iţte j k -kaçı kadar kah kala @@ -283,6 +264,7 @@ kanýmca karşın katrilyon kaynak +kaçı kendi kendilerine kendine @@ -358,20 +340,17 @@ neye neyi neyse nice -niçin nihayet nihayetinde nitekim niye +niçin o -ö -öbürkü -öbürü olan olarak oldu -oldukça olduklarını +oldukça olduğu olduğunu olmadı @@ -387,13 +366,8 @@ olur olursa oluyor on -ön ona onca -önce 
-önceden -önceleri -öncelikle onculayın onda ondan @@ -414,17 +388,9 @@ oradan oranca oranla oraya -öteki -ötekisi -ötürü otuz -öyle -öylece -öylelikle -öylemesine oysa oysaki -öz p pek pekala @@ -467,9 +433,6 @@ trilyon tüm tümü u -ü -üç -üzere v var vardı @@ -519,6 +482,42 @@ zarfında zaten zati zira +ç +çabuk +çabukça +çeşitli +çok +çokları +çoklarınca +çokluk +çoklukla +çokça +çoğu +çoğun +çoğuna +çoğunca +çoğunlukla +çoğunu +çünkü +ö +öbürkü +öbürü +ön +önce +önceden +önceleri +öncelikle +öteki +ötekisi +ötürü +öyle +öylece +öylelikle +öylemesine +öz +ü +üzere +üç þey þeyden þeyi @@ -532,18 +531,13 @@ zira ı ş şayet -ţayet şey şeyden şeye şeyi şeyler şimdi -ţimdi -şöyle -ţöyle şu -ţu şuna şuncacık şunda @@ -554,4 +548,9 @@ zira şunun şura şuracıkta -şurası \ No newline at end of file +şurası +şöyle +ţayet +ţimdi +ţu +ţöyle \ No newline at end of file diff --git a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt index c2324e184a..f80f970b44 100644 --- a/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt +++ b/apps/common/src/python/mediawords/languages/zh/zh_stop_words.txt @@ -1,12 +1,12 @@ -# Appended Traditional Chinese characters (Note: This does not include all stopwords in Cantonese or Taiwanese Mandarin) -# (Lightly edited to remove words in the original lists that are actually meaningful) # Sources: # http://blog.csdn.net/shijiebei2009/article/details/39696571 # http://github.com/stopwords-iso/stopwords-zh +# +# (Lightly edited to remove words in the original lists that are actually meaningful) +# Appended Traditional Chinese characters (Note: This does not include all stopwords in Cantonese or Taiwanese Mandarin ! " -# $ % & @@ -131,38 +131,6 @@ sup 一 一. 
一一 -下 -个 -些 -何 -來 -個 -切 -则 -则通过 -則 -則通過 -天 -定 -方面 -旦 -时 -時 -来 -样 -樣 -次 -片 -番 -直 -致 -般 -起 -轉眼 -转眼 -边 -邊 -面 七 万一 三 @@ -179,6 +147,7 @@ sup 上来 上述 上面 +下 下來 下列 下去 @@ -306,6 +275,7 @@ sup 並無 並肩 並非 +个 个人 个别 中 @@ -373,6 +343,7 @@ sup 互 互相 五 +些 亦 产生 亲口 @@ -471,6 +442,7 @@ sup 但凡 但是 但願 +何 何乐而不为 何以 何况 @@ -501,6 +473,7 @@ sup 使 使得 使用 +來 來不及 來得及 來看 @@ -521,6 +494,7 @@ sup 俺 俺们 俺們 +個 個人 個別 倍加 @@ -660,11 +634,14 @@ sup 分期 分期分批 分頭 +切 切不可 切切 切勿 切莫 +则 则甚 +则通过 刚 刚好 刚巧 @@ -697,7 +674,9 @@ sup 到處 到頭 到頭來 +則 則甚 +則通過 前后 前後 前此 @@ -981,6 +960,7 @@ sup 大量 大體 大體上 +天 她 她们 她們 @@ -1025,6 +1005,7 @@ sup 它是 它的 完成 +定 实际 密切 實現 @@ -1361,6 +1342,7 @@ sup 方 方便 方才 +方面 於 於是 於是乎 @@ -1381,6 +1363,8 @@ sup 日漸 日益 日臻 +旦 +时 时候 明显 明确 @@ -1393,6 +1377,7 @@ sup 是的 显然 显著 +時 時候 暗中 暗地裡 @@ -1443,6 +1428,7 @@ sup 本著 本身 权时 +来 来不及 来得及 来看 @@ -1462,6 +1448,7 @@ sup 某些 某個 某某 +样 根据 根據 根本 @@ -1474,7 +1461,9 @@ sup 極端 概 構成 +樣 權時 +次 次第 欢迎 欤 @@ -1584,6 +1573,7 @@ sup 爾後 爾爾 爾等 +片 牢牢 特別是 特别是 @@ -1641,6 +1631,7 @@ sup 略加 略微 略為 +番 當 當下 當中 @@ -1670,6 +1661,7 @@ sup 盡然 盡量 目前 +直 直到 直接 相似 @@ -1865,12 +1857,14 @@ sup 至今 至於 至若 +致 與 與其 與其說 與否 與此同時 舉行 +般 般的 良好 若 @@ -1988,6 +1982,7 @@ sup 赖以 赶快 赶早不赶晚 +起 起來 起先 起初 @@ -2014,10 +2009,13 @@ sup 較之 較比 較為 +轉眼 +转眼 较 较为 较之 较比 +边 达到 迄 过 @@ -2135,6 +2133,7 @@ sup 還是 還有 還要 +邊 那 那个 那么 @@ -2244,6 +2243,7 @@ sup 非独 非獨 靠 +面 頂多 頃 頃刻 From fe09c78d5c53399e4aa268e090eaf1f6891f3d00 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 6 Apr 2021 19:58:38 +0300 Subject: [PATCH 050/175] Fix up some configuration --- apps/temporal-grafana/grafana.ini | 12 ++--- apps/temporal-prometheus/prometheus.yml | 20 +++++--- apps/temporal-server/Dockerfile | 2 +- .../config/mediacloud_template.yaml | 16 +++++-- apps/temporal-server/docker-compose.tests.yml | 46 +++++++++++++++---- 5 files changed, 69 insertions(+), 27 deletions(-) diff --git a/apps/temporal-grafana/grafana.ini b/apps/temporal-grafana/grafana.ini index fcae717238..9b9d4ca5c8 100644 --- a/apps/temporal-grafana/grafana.ini +++ 
b/apps/temporal-grafana/grafana.ini @@ -39,7 +39,7 @@ check_for_updates = false #################################### Security #################################### [security] # disable creation of admin user on first start of grafana -disable_initial_admin_creation = true +disable_initial_admin_creation = false # default admin user, created on startup admin_user = mediacloud @@ -91,13 +91,7 @@ disable_signout_menu = true #################################### Anonymous Auth ###################### [auth.anonymous] # enable anonymous access -enabled = true - -# specify organization name that should be used for unauthenticated users -org_name = Media Cloud - -# specify role for unauthenticated users -org_role = Viewer +enabled = false #################################### Logging ########################## [log] @@ -105,6 +99,8 @@ org_role = Viewer # Use space to separate multiple modes, e.g. "console file" mode = console +format = text + #################################### Alerting ############################ [alerting] # Disable alerting engine & UI features diff --git a/apps/temporal-prometheus/prometheus.yml b/apps/temporal-prometheus/prometheus.yml index 1ca6e96f22..0a62dfbacb 100644 --- a/apps/temporal-prometheus/prometheus.yml +++ b/apps/temporal-prometheus/prometheus.yml @@ -1,14 +1,22 @@ global: scrape_interval: 5s - external_labels: - monitor: 'temporal-monitor' + scrape_timeout: 5s scrape_configs: + - job_name: 'prometheus' static_configs: - targets: - 'localhost:9090' - - 'temporal-server:8000' - - 'temporal-server:8001' - - 'temporal-server:8002' - - 'temporal-server:8003' + + - job_name: 'services' + static_configs: + - targets: + # frontend + - 'temporal-server:9091' + # matching + - 'temporal-server:9092' + # history + - 'temporal-server:9093' + # worker + - 'temporal-server:9094' diff --git a/apps/temporal-server/Dockerfile b/apps/temporal-server/Dockerfile index 448d7ff5da..65b1b580ec 100644 --- a/apps/temporal-server/Dockerfile +++ 
b/apps/temporal-server/Dockerfile @@ -73,7 +73,7 @@ EXPOSE \ # Port descriptions: https://docs.temporal.io/docs/server-architecture/ 6933 6934 6935 6939 7233 7234 7235 7239 \ # Prometheus endpoints - 8000 8001 8002 8003 + 9091 9092 9093 9094 USER temporal diff --git a/apps/temporal-server/config/mediacloud_template.yaml b/apps/temporal-server/config/mediacloud_template.yaml index 2f36bd1621..42b05213ee 100644 --- a/apps/temporal-server/config/mediacloud_template.yaml +++ b/apps/temporal-server/config/mediacloud_template.yaml @@ -117,9 +117,11 @@ services: membershipPort: 6933 bindOnIP: "${MC_TEMPORAL_HOST_IP}" metrics: + tags: + type: frontend prometheus: timerType: "histogram" - listenAddress: "temporal-prometheus:8000" + listenAddress: "0.0.0.0:9091" matching: rpc: @@ -127,9 +129,11 @@ services: membershipPort: 6935 bindOnIP: "${MC_TEMPORAL_HOST_IP}" metrics: + tags: + type: matching prometheus: timerType: "histogram" - listenAddress: "temporal-prometheus:8001" + listenAddress: "0.0.0.0:9092" history: rpc: @@ -137,9 +141,11 @@ services: membershipPort: 6934 bindOnIP: "${MC_TEMPORAL_HOST_IP}" metrics: + tags: + type: history prometheus: timerType: "histogram" - listenAddress: "temporal-prometheus:8002" + listenAddress: "0.0.0.0:9093" worker: rpc: @@ -147,9 +153,11 @@ services: membershipPort: 6939 bindOnIP: "${MC_TEMPORAL_HOST_IP}" metrics: + tags: + type: worker prometheus: timerType: "histogram" - listenAddress: "temporal-prometheus:8003" + listenAddress: "0.0.0.0:9094" clusterMetadata: enableGlobalNamespace: false diff --git a/apps/temporal-server/docker-compose.tests.yml b/apps/temporal-server/docker-compose.tests.yml index 3704a92302..69934ddd68 100644 --- a/apps/temporal-server/docker-compose.tests.yml +++ b/apps/temporal-server/docker-compose.tests.yml @@ -41,10 +41,10 @@ services: - 7234 - 7235 - 7239 - - 8000 - - 8001 - - 8002 - - 8003 + - 9091 + - 9092 + - 9093 + - 9094 ports: # Expose to host for debugging - "6933:6933" @@ -55,10 +55,10 @@ services: - 
"7234:7234" - "7235:7235" - "7239:7239" - - "8000:8000" - - "8001:8001" - - "8002:8002" - - "8003:8003" + - "9091:9091" + - "9092:9092" + - "9093:9093" + - "9094:9094" volumes: - type: bind source: ./bin/ @@ -118,6 +118,8 @@ services: image: gcr.io/mcback/temporal-prometheus:latest init: true stop_signal: SIGKILL + depends_on: + - temporal-grafana networks: - default expose: @@ -136,6 +138,34 @@ services: cpus: "2" memory: "2G" + temporal-grafana: + image: gcr.io/mcback/temporal-grafana:latest + init: true + stop_signal: SIGKILL + networks: + - default + expose: + - "3000" + ports: + # Expose to host for debugging + - "3000:3000" + volumes: + - type: bind + source: ./../temporal-grafana/grafana.ini + target: /opt/grafana/conf/grafana.ini + - type: bind + source: ./../temporal-grafana/provisioning/ + target: /opt/grafana/provisioning/ + - type: bind + source: ./../temporal-grafana/dashboards/dashboards/ + target: /opt/grafana/dashboards/ + # Limit CPUs and RAM for the process to not get too greedy + deploy: + resources: + limits: + cpus: "2" + memory: "2G" + temporal-webapp: image: gcr.io/mcback/temporal-webapp:latest init: true From e41f8bf13caf5cd95f6033c8a9d3a009bd3d5c5d Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 8 Apr 2021 01:59:55 +0300 Subject: [PATCH 051/175] Unfinished podcast demo --- .gitmodules | 7 +- apps/podcast-fetch-episode/.idea/misc.xml | 7 - .../.idea/podcast-fetch-episode.iml | 17 -- .../.idea/runConfigurations/Dockerfile.xml | 17 -- .../.idea/sqldialects.xml | 7 - .../.idea/webResources.xml | 14 -- .../docker-compose.tests.yml | 59 ------ .../src/requirements.txt | 2 - apps/podcast-fetch-transcript/.dockerignore | 92 --------- .../.idea/externalDependencies.xml | 6 - .../inspectionProfiles/profiles_settings.xml | 6 - apps/podcast-fetch-transcript/.idea/misc.xml | 7 - .../.idea/modules.xml | 8 - .../.idea/sqldialects.xml | 7 - apps/podcast-fetch-transcript/.idea/vcs.xml | 7 - apps/podcast-fetch-transcript/Dockerfile | 26 --- 
.../bin/podcast_fetch_transcript_worker.py | 57 ------ .../docker-compose.tests.yml | 117 ----------- .../python/podcast_fetch_transcript/config.py | 12 -- .../src/requirements.txt | 1 - .../tests/data/media-samples | 1 - .../tests/python/setup_fetch.py | 182 ----------------- .../tests/python/setup_mock_fetch_store.py | 57 ------ .../tests/python/test_fetch_long_audio.py | 68 ------- .../python/test_fetch_store_full_chain.py | 70 ------- .../tests/python/test_mock_error.py | 45 ----- .../tests/python/test_mock_not_done.py | 53 ----- .../tests/python/test_mock_success.py | 55 ----- .../podcast-poll-due-operations/.dockerignore | 92 --------- .../.idea/externalDependencies.xml | 6 - .../inspectionProfiles/profiles_settings.xml | 6 - .../.idea/misc.xml | 7 - .../.idea/modules.xml | 8 - .../.idea/podcast-poll-due-operations.iml | 11 - .../.idea/runConfigurations/Dockerfile.xml | 17 -- .../podcast-poll-due-operations/.idea/vcs.xml | 6 - apps/podcast-poll-due-operations/Dockerfile | 17 -- .../bin/podcast_poll_due_operations_worker.py | 24 --- .../docker-compose.tests.yml | 54 ----- .../due_operations.py | 112 ----------- .../podcast_poll_due_operations/exceptions.py | 13 -- .../tests/python/__init__.py | 0 .../tests/python/setup_due_operation.py | 55 ----- .../tests/python/test_due_operations.py | 40 ---- .../tests/python/test_failing_job_broker.py | 36 ---- apps/podcast-submit-operation/.dockerignore | 92 --------- .../.idea/externalDependencies.xml | 6 - .../inspectionProfiles/profiles_settings.xml | 6 - apps/podcast-submit-operation/.idea/misc.xml | 7 - .../.idea/modules.xml | 8 - .../.idea/podcast-submit-operation.iml | 14 -- .../.idea/runConfigurations/Dockefile.xml | 17 -- .../.idea/sqldialects.xml | 7 - apps/podcast-submit-operation/.idea/vcs.xml | 6 - apps/podcast-submit-operation/Dockerfile | 26 --- .../bin/podcast_submit_operation_worker.py | 75 ------- .../docker-compose.tests.yml | 56 ------ .../podcast_submit_operation/__init__.py | 0 
.../python/podcast_submit_operation/config.py | 12 -- .../src/requirements.txt | 1 - .../tests/python/test_submit_operation.py | 40 ---- .../.dockerignore | 0 .../.idea/.gitignore | 8 + .../.idea/externalDependencies.xml | 0 .../inspectionProfiles/Project_Default.xml | 15 ++ .../inspectionProfiles/profiles_settings.xml | 0 .../.idea/mediawords.sql | 1 + .../podcast-transcribe-episode/.idea/misc.xml | 4 + .../.idea/modules.xml | 2 +- .../.idea/podcast-transcribe-episode.iml} | 2 +- .../.idea/runConfigurations/Dockerfile.xml | 4 +- .../.idea/sqldialects.xml | 2 +- .../.idea/vcs.xml | 0 .../Dockerfile | 10 +- .../bin/podcast_transcribe_episode_worker.py} | 4 +- .../docker-compose.tests.yml | 124 ++++++++++++ .../podcast_transcribe_episode}/__init__.py | 0 .../fetch_episode}/__init__.py | 0 .../fetch_episode}/audio_codecs.py | 0 .../fetch_episode}/bcp47_lang.py | 0 .../fetch_episode}/config.py | 0 .../fetch_episode}/enclosure.py | 53 ----- .../fetch_episode}/exceptions.py | 0 .../fetch_episode}/fetch_and_store.py | 14 +- .../fetch_episode}/fetch_url.py | 3 +- .../fetch_episode}/gcs_store.py | 4 +- .../fetch_episode}/media_file.py | 5 +- .../fetch_transcript}/__init__.py | 0 .../fetch_transcript}/exceptions.py | 0 .../fetch_transcript}/fetch_store.py | 4 +- .../fetch_transcript}/handler.py | 6 +- .../fetch_transcript}/transcript.py | 0 .../podcast_transcribe_episode/shared.py | 190 ++++++++++++++++++ .../submit_operation}/__init__.py | 0 .../submit_operation}/exceptions.py | 0 .../submit_operation}/submit_operation.py | 4 +- .../podcast_transcribe_episode/workflow.py | 56 ++++++ .../src/requirements.txt | 3 + .../tests/data/media-samples | 0 .../tests/python}/__init__.py | 0 .../tests/python/config_random_gcs_prefix.py | 0 .../tests/python/test_bcp47_lang.py | 0 .../tests/python/test_enclosure.py | 0 .../tests/python/test_fetch_and_store.py | 0 .../tests/python/test_fetch_url.py | 0 .../tests/python/test_gcs_store.py | 0 .../tests/python/test_media_file.py | 0 
apps/postgresql-server/schema/mediawords.sql | 92 +-------- 108 files changed, 438 insertions(+), 2056 deletions(-) delete mode 100644 apps/podcast-fetch-episode/.idea/misc.xml delete mode 100644 apps/podcast-fetch-episode/.idea/podcast-fetch-episode.iml delete mode 100644 apps/podcast-fetch-episode/.idea/runConfigurations/Dockerfile.xml delete mode 100644 apps/podcast-fetch-episode/.idea/sqldialects.xml delete mode 100644 apps/podcast-fetch-episode/.idea/webResources.xml delete mode 100644 apps/podcast-fetch-episode/docker-compose.tests.yml delete mode 100644 apps/podcast-fetch-episode/src/requirements.txt delete mode 100644 apps/podcast-fetch-transcript/.dockerignore delete mode 100644 apps/podcast-fetch-transcript/.idea/externalDependencies.xml delete mode 100644 apps/podcast-fetch-transcript/.idea/inspectionProfiles/profiles_settings.xml delete mode 100644 apps/podcast-fetch-transcript/.idea/misc.xml delete mode 100644 apps/podcast-fetch-transcript/.idea/modules.xml delete mode 100644 apps/podcast-fetch-transcript/.idea/sqldialects.xml delete mode 100644 apps/podcast-fetch-transcript/.idea/vcs.xml delete mode 100644 apps/podcast-fetch-transcript/Dockerfile delete mode 100755 apps/podcast-fetch-transcript/bin/podcast_fetch_transcript_worker.py delete mode 100644 apps/podcast-fetch-transcript/docker-compose.tests.yml delete mode 100644 apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/config.py delete mode 100644 apps/podcast-fetch-transcript/src/requirements.txt delete mode 160000 apps/podcast-fetch-transcript/tests/data/media-samples delete mode 100644 apps/podcast-fetch-transcript/tests/python/setup_fetch.py delete mode 100644 apps/podcast-fetch-transcript/tests/python/setup_mock_fetch_store.py delete mode 100644 apps/podcast-fetch-transcript/tests/python/test_fetch_long_audio.py delete mode 100644 apps/podcast-fetch-transcript/tests/python/test_fetch_store_full_chain.py delete mode 100644 
apps/podcast-fetch-transcript/tests/python/test_mock_error.py delete mode 100644 apps/podcast-fetch-transcript/tests/python/test_mock_not_done.py delete mode 100644 apps/podcast-fetch-transcript/tests/python/test_mock_success.py delete mode 100644 apps/podcast-poll-due-operations/.dockerignore delete mode 100644 apps/podcast-poll-due-operations/.idea/externalDependencies.xml delete mode 100644 apps/podcast-poll-due-operations/.idea/inspectionProfiles/profiles_settings.xml delete mode 100644 apps/podcast-poll-due-operations/.idea/misc.xml delete mode 100644 apps/podcast-poll-due-operations/.idea/modules.xml delete mode 100644 apps/podcast-poll-due-operations/.idea/podcast-poll-due-operations.iml delete mode 100644 apps/podcast-poll-due-operations/.idea/runConfigurations/Dockerfile.xml delete mode 100644 apps/podcast-poll-due-operations/.idea/vcs.xml delete mode 100644 apps/podcast-poll-due-operations/Dockerfile delete mode 100755 apps/podcast-poll-due-operations/bin/podcast_poll_due_operations_worker.py delete mode 100644 apps/podcast-poll-due-operations/docker-compose.tests.yml delete mode 100644 apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/due_operations.py delete mode 100644 apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/exceptions.py delete mode 100644 apps/podcast-poll-due-operations/tests/python/__init__.py delete mode 100644 apps/podcast-poll-due-operations/tests/python/setup_due_operation.py delete mode 100644 apps/podcast-poll-due-operations/tests/python/test_due_operations.py delete mode 100644 apps/podcast-poll-due-operations/tests/python/test_failing_job_broker.py delete mode 100644 apps/podcast-submit-operation/.dockerignore delete mode 100644 apps/podcast-submit-operation/.idea/externalDependencies.xml delete mode 100644 apps/podcast-submit-operation/.idea/inspectionProfiles/profiles_settings.xml delete mode 100644 apps/podcast-submit-operation/.idea/misc.xml delete mode 100644 
apps/podcast-submit-operation/.idea/modules.xml delete mode 100644 apps/podcast-submit-operation/.idea/podcast-submit-operation.iml delete mode 100644 apps/podcast-submit-operation/.idea/runConfigurations/Dockefile.xml delete mode 100644 apps/podcast-submit-operation/.idea/sqldialects.xml delete mode 100644 apps/podcast-submit-operation/.idea/vcs.xml delete mode 100644 apps/podcast-submit-operation/Dockerfile delete mode 100755 apps/podcast-submit-operation/bin/podcast_submit_operation_worker.py delete mode 100644 apps/podcast-submit-operation/docker-compose.tests.yml delete mode 100644 apps/podcast-submit-operation/src/python/podcast_submit_operation/__init__.py delete mode 100644 apps/podcast-submit-operation/src/python/podcast_submit_operation/config.py delete mode 100644 apps/podcast-submit-operation/src/requirements.txt delete mode 100644 apps/podcast-submit-operation/tests/python/test_submit_operation.py rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/.dockerignore (100%) create mode 100644 apps/podcast-transcribe-episode/.idea/.gitignore rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/.idea/externalDependencies.xml (100%) create mode 100644 apps/podcast-transcribe-episode/.idea/inspectionProfiles/Project_Default.xml rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/.idea/inspectionProfiles/profiles_settings.xml (100%) create mode 120000 apps/podcast-transcribe-episode/.idea/mediawords.sql create mode 100644 apps/podcast-transcribe-episode/.idea/misc.xml rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/.idea/modules.xml (51%) rename apps/{podcast-fetch-transcript/.idea/podcast-fetch-transcript.iml => podcast-transcribe-episode/.idea/podcast-transcribe-episode.iml} (80%) rename apps/{podcast-fetch-transcript => podcast-transcribe-episode}/.idea/runConfigurations/Dockerfile.xml (83%) rename apps/{podcast-poll-due-operations => podcast-transcribe-episode}/.idea/sqldialects.xml (62%) rename 
apps/{podcast-fetch-episode => podcast-transcribe-episode}/.idea/vcs.xml (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/Dockerfile (54%) rename apps/{podcast-fetch-episode/bin/podcast_fetch_episode_worker.py => podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py} (88%) create mode 100644 apps/podcast-transcribe-episode/docker-compose.tests.yml rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode}/__init__.py (100%) rename apps/{podcast-fetch-episode/tests/python => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/__init__.py (100%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/audio_codecs.py (100%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/bcp47_lang.py (100%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/config.py (100%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/enclosure.py (62%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/exceptions.py (100%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/fetch_and_store.py (93%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/fetch_url.py (95%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => 
podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/gcs_store.py (98%) rename apps/{podcast-fetch-episode/src/python/podcast_fetch_episode => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode}/media_file.py (99%) rename apps/{podcast-fetch-transcript/src/python/podcast_fetch_transcript => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript}/__init__.py (100%) rename apps/{podcast-fetch-transcript/src/python/podcast_fetch_transcript => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript}/exceptions.py (100%) rename apps/{podcast-fetch-transcript/src/python/podcast_fetch_transcript => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript}/fetch_store.py (97%) rename apps/{podcast-fetch-transcript/src/python/podcast_fetch_transcript => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript}/handler.py (97%) rename apps/{podcast-fetch-transcript/src/python/podcast_fetch_transcript => podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript}/transcript.py (100%) create mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py rename apps/{podcast-fetch-transcript/tests/python => podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation}/__init__.py (100%) rename apps/{podcast-submit-operation/src/python/podcast_submit_operation => podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation}/exceptions.py (100%) rename apps/{podcast-submit-operation/src/python/podcast_submit_operation => podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation}/submit_operation.py (98%) create mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py create mode 100644 apps/podcast-transcribe-episode/src/requirements.txt rename apps/{podcast-fetch-episode 
=> podcast-transcribe-episode}/tests/data/media-samples (100%) rename apps/{podcast-poll-due-operations/src/python/podcast_poll_due_operations => podcast-transcribe-episode/tests/python}/__init__.py (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/tests/python/config_random_gcs_prefix.py (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/tests/python/test_bcp47_lang.py (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/tests/python/test_enclosure.py (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/tests/python/test_fetch_and_store.py (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/tests/python/test_fetch_url.py (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/tests/python/test_gcs_store.py (100%) rename apps/{podcast-fetch-episode => podcast-transcribe-episode}/tests/python/test_media_file.py (100%) diff --git a/.gitmodules b/.gitmodules index 73a4a82248..1078ac0f76 100644 --- a/.gitmodules +++ b/.gitmodules @@ -25,11 +25,8 @@ [submodule "dev/quieter-docker-compose"] path = dev/quieter-docker-compose url = https://github.com/mediacloud/docker-compose-just-quieter.git -[submodule "apps/podcast-fetch-episode/tests/data/media-samples"] - path = apps/podcast-fetch-episode/tests/data/media-samples - url = https://github.com/mediacloud/podcast-media-samples.git -[submodule "apps/podcast-fetch-transcript/tests/data/media-samples"] - path = apps/podcast-fetch-transcript/tests/data/media-samples +[submodule "apps/podcast-transcribe-episode/tests/data/media-samples"] + path = apps/podcast-transcribe-episode/tests/data/media-samples url = https://github.com/mediacloud/podcast-media-samples.git [submodule "apps/elk-journalbeat/journald-log-sample"] path = apps/elk-journalbeat/journald-log-sample diff --git a/apps/podcast-fetch-episode/.idea/misc.xml b/apps/podcast-fetch-episode/.idea/misc.xml deleted file mode 100644 index 
64bb3a0baa..0000000000 --- a/apps/podcast-fetch-episode/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/podcast-fetch-episode.iml b/apps/podcast-fetch-episode/.idea/podcast-fetch-episode.iml deleted file mode 100644 index 526ab95d93..0000000000 --- a/apps/podcast-fetch-episode/.idea/podcast-fetch-episode.iml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/runConfigurations/Dockerfile.xml b/apps/podcast-fetch-episode/.idea/runConfigurations/Dockerfile.xml deleted file mode 100644 index 83b1a58573..0000000000 --- a/apps/podcast-fetch-episode/.idea/runConfigurations/Dockerfile.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/sqldialects.xml b/apps/podcast-fetch-episode/.idea/sqldialects.xml deleted file mode 100644 index 790b3f37f8..0000000000 --- a/apps/podcast-fetch-episode/.idea/sqldialects.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/webResources.xml b/apps/podcast-fetch-episode/.idea/webResources.xml deleted file mode 100644 index c30bda4153..0000000000 --- a/apps/podcast-fetch-episode/.idea/webResources.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-episode/docker-compose.tests.yml b/apps/podcast-fetch-episode/docker-compose.tests.yml deleted file mode 100644 index fd1eeaa610..0000000000 --- a/apps/podcast-fetch-episode/docker-compose.tests.yml +++ /dev/null @@ -1,59 +0,0 @@ -version: "3.7" - -services: - - podcast-fetch-episode: - image: gcr.io/mcback/podcast-fetch-episode:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: 
"${MC_PODCAST_FETCH_EPISODE_BUCKET_NAME}" - # Dev/test environments don't use "MC_PODCAST_FETCH_EPISODE_PATH_PREFIX" environment - # variable as they create a different, timestamped prefix for every test run. - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-fetch-episode/ - - type: bind - source: ./tests/ - target: /opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - # We don't need "rabbitmq-server" to run tests - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-base/conf/ - target: /etc/postgresql/11/main/ diff --git a/apps/podcast-fetch-episode/src/requirements.txt b/apps/podcast-fetch-episode/src/requirements.txt deleted file mode 100644 index 061d400634..0000000000 --- a/apps/podcast-fetch-episode/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -ffmpeg-python==0.2.0 -google-cloud-storage==1.35.0 diff --git a/apps/podcast-fetch-transcript/.dockerignore b/apps/podcast-fetch-transcript/.dockerignore deleted file mode 100644 index 9b2c362a80..0000000000 --- a/apps/podcast-fetch-transcript/.dockerignore +++ /dev/null @@ -1,92 +0,0 @@ -# -# Files from the build context to be ignored by "docker build". 
-# -# You might want to add as many of constantly changing files here as possible -# to prevent container's image from getting rebuilt every full moon. -# -# Unfortunately, we can't just symlink this file to every app's directory: -# -# https://github.com/moby/moby/issues/12886 -# -# so for the time being you have to manually copy this file to every app -# subdirectory: -# -# cd apps/ -# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; -# - -*$py.class -*.cover -*.DS_Store -*.egg -*.egg-info/ -*.log -*.manifest -*.mo -*.pot -*.py[cod] -*.sage.py -*.so -*.spec -*.swp -*/*.py[cod] -*/*.swp -*/*/*.py[cod] -*/*/*.swp -*/*/*/*.py[cod] -*/*/*/*.swp -*/*/*/__pycache__/ -*/*/__pycache__/ -*/__pycache__/ -._* -.apdisk -.AppleDB -.AppleDesktop -.AppleDouble -.cache -.com.apple.timemachine.donotpresent -.coverage -.coverage.* -.dockerignore -.DocumentRevisions-V100 -.DS_Store -.eggs -.env -.fseventsd -.git -.gitignore -.hypothesis -.idea -.installed.cfg -.ipynb_checkpoints -.LSOverride -.mypy_cache -.pytest_cache -.Python -.python-version -.ropeproject -.scrapy -.Spotlight-V100 -.spyderproject -.spyproject -.TemporaryItems -.tox -.Trashes -.venv -.VolumeIcon.icns -.webassets-cache -__pycache__ -celerybeat-schedule -coverage.xml -Icon -local_settings.py -Network Trash Folder -nosetests.xml -parts -pip-delete-this-directory.txt -pip-log.txt -sdist -Temporary Items -wheels -_Inline - diff --git a/apps/podcast-fetch-transcript/.idea/externalDependencies.xml b/apps/podcast-fetch-transcript/.idea/externalDependencies.xml deleted file mode 100644 index 7872ffbcf2..0000000000 --- a/apps/podcast-fetch-transcript/.idea/externalDependencies.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/inspectionProfiles/profiles_settings.xml b/apps/podcast-fetch-transcript/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 
105ce2da2d..0000000000 --- a/apps/podcast-fetch-transcript/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/misc.xml b/apps/podcast-fetch-transcript/.idea/misc.xml deleted file mode 100644 index b31733e855..0000000000 --- a/apps/podcast-fetch-transcript/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/modules.xml b/apps/podcast-fetch-transcript/.idea/modules.xml deleted file mode 100644 index 4ff9c4812f..0000000000 --- a/apps/podcast-fetch-transcript/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/sqldialects.xml b/apps/podcast-fetch-transcript/.idea/sqldialects.xml deleted file mode 100644 index 790b3f37f8..0000000000 --- a/apps/podcast-fetch-transcript/.idea/sqldialects.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/vcs.xml b/apps/podcast-fetch-transcript/.idea/vcs.xml deleted file mode 100644 index a4647a1c0e..0000000000 --- a/apps/podcast-fetch-transcript/.idea/vcs.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/Dockerfile b/apps/podcast-fetch-transcript/Dockerfile deleted file mode 100644 index 0a7acb7f8f..0000000000 --- a/apps/podcast-fetch-transcript/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# -# Collect due transcripts from Google Speech API, store them locally as both raw JSON and download text -# - -FROM gcr.io/mcback/common:latest - -# Install Python dependencies -COPY src/requirements.txt /var/tmp/ -RUN \ - cd /var/tmp/ && \ - pip3 install -r requirements.txt && \ - rm requirements.txt && \ - rm -rf /root/.cache/ && \ - true - -# Copy sources -COPY src/ /opt/mediacloud/src/podcast-fetch-transcript/ -ENV 
PERL5LIB="/opt/mediacloud/src/podcast-fetch-transcript/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-fetch-transcript/python:${PYTHONPATH}" - -# Copy worker script -COPY bin /opt/mediacloud/bin - -USER mediacloud - -CMD ["podcast_fetch_transcript_worker.py"] diff --git a/apps/podcast-fetch-transcript/bin/podcast_fetch_transcript_worker.py b/apps/podcast-fetch-transcript/bin/podcast_fetch_transcript_worker.py deleted file mode 100755 index ae25385834..0000000000 --- a/apps/podcast-fetch-transcript/bin/podcast_fetch_transcript_worker.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 - -from mediawords.db import connect_to_db -from mediawords.job import JobBroker -from mediawords.util.log import create_logger -from mediawords.util.perl import decode_object_from_bytes_if_needed -from mediawords.util.process import fatal_error - -from podcast_fetch_transcript.exceptions import McPodcastFetchTranscriptSoftException - -from podcast_fetch_transcript.fetch_store import fetch_store_transcript - -log = create_logger(__name__) - - -def run_podcast_fetch_transcript(podcast_episode_transcript_fetches_id: int) -> None: - """Fetch a completed episode transcripts from Speech API for story.""" - - if isinstance(podcast_episode_transcript_fetches_id, bytes): - podcast_episode_transcript_fetches_id = decode_object_from_bytes_if_needed( - podcast_episode_transcript_fetches_id) - podcast_episode_transcript_fetches_id = int(podcast_episode_transcript_fetches_id) - - if not podcast_episode_transcript_fetches_id: - fatal_error("'podcast_episode_transcript_fetches_id' is unset.") - - db = connect_to_db() - - log.info(f"Fetching transcript for fetch ID {podcast_episode_transcript_fetches_id}...") - - try: - stories_id = fetch_store_transcript( - db=db, - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id, - ) - - if stories_id: - JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(stories_id=stories_id) - - except 
McPodcastFetchTranscriptSoftException as ex: - # Soft exceptions - log.error(f"Unable to fetch transcript for fetch ID {podcast_episode_transcript_fetches_id}: {ex}") - raise ex - - except Exception as ex: - # Hard and other exceptions - fatal_error(( - f"Fatal / unknown error while fetching transcript " - f"for ID {podcast_episode_transcript_fetches_id}: {ex}" - )) - - log.info(f"Done fetching transcript for ID {podcast_episode_transcript_fetches_id}") - - -if __name__ == '__main__': - app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript') - app.start_worker(handler=run_podcast_fetch_transcript) diff --git a/apps/podcast-fetch-transcript/docker-compose.tests.yml b/apps/podcast-fetch-transcript/docker-compose.tests.yml deleted file mode 100644 index 3865e9d91f..0000000000 --- a/apps/podcast-fetch-transcript/docker-compose.tests.yml +++ /dev/null @@ -1,117 +0,0 @@ -version: "3.7" - -services: - - podcast-fetch-transcript: - image: gcr.io/mcback/podcast-fetch-transcript:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST: "${MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST}" - expose: - # "test_full_chain.py" test server's port - - 8080 - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-fetch-transcript/ - - type: bind - source: ./tests/ - target: /opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - podcast-fetch-episode - - podcast-submit-operation - # No "podcast-poll-due-operations" as we'll just go ahead and fetch it ourselves - - postgresql-pgbouncer - - rabbitmq-server - - podcast-fetch-episode: - image: gcr.io/mcback/podcast-fetch-episode:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - 
MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: "${MC_PODCAST_FETCH_EPISODE_BUCKET_NAME}" - MC_PODCAST_FETCH_EPISODE_PATH_PREFIX: "audio-files/" - volumes: - - type: bind - source: ./../podcast-fetch-episode/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../podcast-fetch-episode/src/ - target: /opt/mediacloud/src/podcast-fetch-episode/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - - rabbitmq-server - - podcast-submit-operation: - image: gcr.io/mcback/podcast-submit-operation:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - volumes: - - type: bind - source: ./../podcast-submit-operation/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../podcast-submit-operation/src/ - target: /opt/mediacloud/src/podcast-submit-operation/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - - rabbitmq-server - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-base/conf/ - target: /etc/postgresql/11/main/ - - rabbitmq-server: - image: gcr.io/mcback/rabbitmq-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5672 - - 15672 - volumes: - - type: bind - source: ./../rabbitmq-server/conf/ - target: /etc/rabbitmq/ diff --git 
a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/config.py b/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/config.py deleted file mode 100644 index 782ed619c3..0000000000 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/config.py +++ /dev/null @@ -1,12 +0,0 @@ -from mediawords.util.config import file_with_env_value - - -class PodcastFetchTranscriptConfig(object): - """ - Podcast transcript fetcher configuration. - """ - - @staticmethod - def gc_auth_json_file() -> str: - """Return path to Google Cloud authentication JSON file.""" - return file_with_env_value(name='MC_PODCAST_GC_AUTH_JSON_BASE64', encoded_with_base64=True) diff --git a/apps/podcast-fetch-transcript/src/requirements.txt b/apps/podcast-fetch-transcript/src/requirements.txt deleted file mode 100644 index 59e80a7b73..0000000000 --- a/apps/podcast-fetch-transcript/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -google-cloud-speech==2.0.1 diff --git a/apps/podcast-fetch-transcript/tests/data/media-samples b/apps/podcast-fetch-transcript/tests/data/media-samples deleted file mode 160000 index 45b179fd86..0000000000 --- a/apps/podcast-fetch-transcript/tests/data/media-samples +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 45b179fd867b6031c803cdbb7eddafa7e204d5bd diff --git a/apps/podcast-fetch-transcript/tests/python/setup_fetch.py b/apps/podcast-fetch-transcript/tests/python/setup_fetch.py deleted file mode 100644 index 9569d89587..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/setup_fetch.py +++ /dev/null @@ -1,182 +0,0 @@ -import abc -import os -import random -import socket -import time -from typing import Union -from unittest import TestCase - -from mediawords.db import connect_to_db -from mediawords.job import JobBroker -from mediawords.test.db.create import create_test_medium, create_test_feed -from mediawords.test.hash_server import HashServer -from mediawords.util.log import create_logger - -log = create_logger(__name__) - 
- -class AbstractFetchTranscriptTestCase(TestCase, metaclass=abc.ABCMeta): - __slots__ = [ - 'db', - 'hs', - 'stories_id', - 'transcript_fetches', - ] - - @classmethod - @abc.abstractmethod - def input_media_path(cls) -> str: - """Return full path to input media file.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def input_media_mime_type(cls) -> str: - """Return input media file's MIME type.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def story_title_description(cls) -> str: - """Return a string to store as both story title and description.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def retries_per_step(cls) -> int: - """How many retries to do per each local step.""" - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def seconds_between_retries(cls) -> float: - """How many seconds to wait between retries.""" - raise NotImplemented("Abstract method") - - def setUp(self) -> None: - super().setUp() - - self.db = connect_to_db() - - test_medium = create_test_medium(db=self.db, label='test') - test_feed = create_test_feed(db=self.db, label='test', medium=test_medium) - - # Add a story with a random ID to decrease the chance that object in GCS will collide with another test running - # at the same time - self.stories_id = random.randint(1, 1000000) - - self.db.query(""" - INSERT INTO stories ( - stories_id, - media_id, - url, - guid, - title, - description, - publish_date, - collect_date, - full_text_rss - ) VALUES ( - %(stories_id)s, - %(media_id)s, - 'http://story.test/', - 'guid://story.test/', - 'story', - 'description', - '2016-10-15 08:00:00', - '2016-10-15 10:00:00', - true - ) - """, { - 'stories_id': self.stories_id, - 'media_id': test_feed['media_id'], - }) - - # Create missing partitions for "feeds_stories_map" - self.db.query('SELECT create_missing_partitions()') - - self.db.create( - 
table='feeds_stories_map', - insert_hash={ - 'feeds_id': int(test_feed['feeds_id']), - 'stories_id': self.stories_id, - } - ) - - assert os.path.isfile(self.input_media_path()), f"Test media file '{self.input_media_path()}' should exist." - - with open(self.input_media_path(), mode='rb') as f: - test_data = f.read() - - # noinspection PyUnusedLocal - def __media_callback(request: HashServer.Request) -> Union[str, bytes]: - response = "".encode('utf-8') - response += "HTTP/1.0 200 OK\r\n".encode('utf-8') - response += f"Content-Type: {self.input_media_mime_type()}\r\n".encode('utf-8') - response += f"Content-Length: {len(test_data)}\r\n".encode('utf-8') - response += "\r\n".encode('utf-8') - response += test_data - return response - - port = 8080 # Port exposed on docker-compose.tests.yml - media_path = '/test_media_file' - pages = { - media_path: { - 'callback': __media_callback, - } - } - - self.hs = HashServer(port=port, pages=pages) - self.hs.start() - - # Using our hostname as it will be another container that will be connecting to us - media_url = f'http://{socket.gethostname()}:{port}{media_path}' - - self.db.insert(table='story_enclosures', insert_hash={ - 'stories_id': self.stories_id, - 'url': media_url, - 'mime_type': self.input_media_mime_type(), - 'length': len(test_data), - }) - - # Add a "podcast-fetch-episode" job - JobBroker(queue_name='MediaWords::Job::Podcast::FetchEpisode').add_to_queue(stories_id=self.stories_id) - - total_time = int(self.retries_per_step() * self.seconds_between_retries()) - - # Wait for "podcast-fetch-episode" to transcode, upload to Google Storage, and write it to "podcast_episodes" - episodes = None - for x in range(1, self.retries_per_step() + 1): - log.info(f"Waiting for episode to appear (#{x})...") - - episodes = self.db.select(table='podcast_episodes', what_to_select='*').hashes() - if episodes: - log.info(f"Episode is here!") - break - - time.sleep(self.seconds_between_retries()) - - assert episodes, f"Episode didn't 
show up in {total_time} seconds." - - # Wait for "podcast-submit-operation" to submit Speech API operation - self.transcript_fetches = None - for x in range(1, self.retries_per_step() + 1): - log.info(f"Waiting for transcript fetch to appear (#{x})...") - - self.transcript_fetches = self.db.select( - table='podcast_episode_transcript_fetches', - what_to_select='*' - ).hashes() - - if self.transcript_fetches: - log.info(f"Transcript fetch is here!") - break - - time.sleep(self.seconds_between_retries()) - - assert self.transcript_fetches, f"Operation didn't show up in {total_time} seconds." - - def tearDown(self) -> None: - super().tearDown() - - self.hs.stop() diff --git a/apps/podcast-fetch-transcript/tests/python/setup_mock_fetch_store.py b/apps/podcast-fetch-transcript/tests/python/setup_mock_fetch_store.py deleted file mode 100644 index bf8065f670..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/setup_mock_fetch_store.py +++ /dev/null @@ -1,57 +0,0 @@ -import abc -from unittest import TestCase - -from mediawords.db import connect_to_db -from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story -from mediawords.util.log import create_logger - -log = create_logger(__name__) - - -class AbstractMockFetchStoreTestCase(TestCase, metaclass=abc.ABCMeta): - MOCK_SPEECH_OPERATION_ID = 'foo' - - __slots__ = [ - 'db', - 'enclosure', - 'episode', - 'transcript_fetch', - 'podcast_episode_transcript_fetches_id', - ] - - def setUp(self) -> None: - super().setUp() - - self.db = connect_to_db() - - test_medium = create_test_medium(db=self.db, label='test') - test_feed = create_test_feed(db=self.db, label='test', medium=test_medium) - test_story = create_test_story(db=self.db, feed=test_feed, label='test') - - self.enclosure = self.db.insert(table='story_enclosures', insert_hash={ - 'stories_id': test_story['stories_id'], - 'url': 'foo', - 'mime_type': 'foo', - 'length': 3, - }) - - self.episode = 
self.db.insert(table='podcast_episodes', insert_hash={ - 'stories_id': test_story['stories_id'], - 'story_enclosures_id': self.enclosure['story_enclosures_id'], - 'gcs_uri': 'gs://test', - 'duration': 3, - 'codec': 'FLAC', - 'sample_rate': 44100, - 'bcp47_language_code': 'en-US', - 'speech_operation_id': self.MOCK_SPEECH_OPERATION_ID, - }) - - self.transcript_fetch = self.db.query(""" - INSERT INTO podcast_episode_transcript_fetches (podcast_episodes_id, add_to_queue_at) - VALUES (%(podcast_episodes_id)s, NOW()) - RETURNING * - """, { - 'podcast_episodes_id': self.episode['podcast_episodes_id'], - }).hash() - - self.podcast_episode_transcript_fetches_id = self.transcript_fetch['podcast_episode_transcript_fetches_id'] diff --git a/apps/podcast-fetch-transcript/tests/python/test_fetch_long_audio.py b/apps/podcast-fetch-transcript/tests/python/test_fetch_long_audio.py deleted file mode 100644 index ef246d5bf8..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_fetch_long_audio.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import time - -import pytest - -from mediawords.util.log import create_logger - -from podcast_fetch_transcript.handler import DefaultHandler - -from .setup_fetch import AbstractFetchTranscriptTestCase - -log = create_logger(__name__) - - -@pytest.mark.skipif('MC_PODCAST_FETCH_TRANSCRIPT_RUN_COSTLY_TEST' not in os.environ, - reason="Costly; each run costs about 60 / 4 * 0.009 = $0.04") -class LongAudioTestCase(AbstractFetchTranscriptTestCase): - """Test the full chain against a long audio file to try out whether podcast-fetch-transcript manages to back off.""" - - @classmethod - def input_media_path(cls) -> str: - return '/opt/mediacloud/tests/data/media-samples/samples/nixon_speech-vorbis-1m.ogg' - - @classmethod - def input_media_mime_type(cls) -> str: - return 'audio/ogg' - - @classmethod - def story_title_description(cls) -> str: - return 'Resignation speech of United States President Richard Nixon' - - @classmethod - def 
retries_per_step(cls) -> int: - # Try more often and wait for longer as this is a bigger file - return 60 - - @classmethod - def seconds_between_retries(cls) -> float: - return 1.0 - - def test_long_audio(self): - transcript = None - - handler = DefaultHandler() - - # Input audio file is 1m0s, so wait for at least two minutes - for x in range(1, 12 + 1): - log.info(f"Waiting for transcript to be finished (#{x})...") - - podcast_episode_transcript_fetches_id = self.transcript_fetches[0]['podcast_episode_transcript_fetches_id'] - transcript = handler.fetch_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id - ) - - if transcript: - log.info("Transcript is here!") - break - - time.sleep(5) - - print(transcript) - - assert transcript - assert transcript.stories_id - assert len(transcript.utterances) > 0 - assert len(transcript.utterances[0].alternatives) > 0 - assert 'evening' in transcript.utterances[0].alternatives[0].text.lower() diff --git a/apps/podcast-fetch-transcript/tests/python/test_fetch_store_full_chain.py b/apps/podcast-fetch-transcript/tests/python/test_fetch_store_full_chain.py deleted file mode 100644 index 1e6933a36d..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_fetch_store_full_chain.py +++ /dev/null @@ -1,70 +0,0 @@ -import time - -from mediawords.dbi.downloads.store import fetch_content -from mediawords.util.log import create_logger -from podcast_fetch_transcript.handler import DefaultHandler - -from .setup_fetch import AbstractFetchTranscriptTestCase - -log = create_logger(__name__) - - -class FullChainTestCase(AbstractFetchTranscriptTestCase): - """Test the full chain against a small audio file.""" - - @classmethod - def input_media_path(cls) -> str: - # Run the test with AAC file to test out both transcoding to FLAC and whether Speech API can transcribe audio - # files after lossy -> lossless transcoding - return 
'/opt/mediacloud/tests/data/media-samples/samples/kim_kardashian-aac.m4a' - - @classmethod - def input_media_mime_type(cls) -> str: - return 'audio/mp4' - - @classmethod - def story_title_description(cls) -> str: - # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be - # used to guess the probable language of the podcast episode - return 'keeping up with Kardashians' - - @classmethod - def retries_per_step(cls) -> int: - return 120 - - @classmethod - def seconds_between_retries(cls) -> float: - return 0.5 - - def test_full_chain(self): - transcript = None - - handler = DefaultHandler() - - for x in range(1, 60 + 1): - log.info(f"Waiting for transcript to be finished (#{x})...") - - podcast_episode_transcript_fetches_id = self.transcript_fetches[0]['podcast_episode_transcript_fetches_id'] - transcript = handler.fetch_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id - ) - if transcript: - log.info("Transcript is here!") - break - - time.sleep(2) - - assert transcript - assert transcript.stories_id - assert len(transcript.utterances) == 1 - assert len(transcript.utterances[0].alternatives) == 1 - assert 'kim kardashian' in transcript.utterances[0].alternatives[0].text.lower() - - downloads_id = handler.store_transcript(db=self.db, transcript=transcript) - - download = self.db.find_by_id(table='downloads', object_id=downloads_id) - - raw_download = fetch_content(db=self.db, download=download) - assert raw_download - assert 'kim kardashian' in raw_download.lower() diff --git a/apps/podcast-fetch-transcript/tests/python/test_mock_error.py b/apps/podcast-fetch-transcript/tests/python/test_mock_error.py deleted file mode 100644 index 31033cbf86..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_mock_error.py +++ /dev/null @@ -1,45 +0,0 @@ -from typing import Optional - -import pytest - -from mediawords.db import DatabaseHandler - -from 
podcast_fetch_transcript.exceptions import McPodcastFetchTranscriptHardException -from podcast_fetch_transcript.fetch_store import fetch_store_transcript -from podcast_fetch_transcript.handler import AbstractHandler -from podcast_fetch_transcript.transcript import Transcript - -from .setup_mock_fetch_store import AbstractMockFetchStoreTestCase - - -class MockTranscriptErrorWithExceptionHandler(AbstractHandler): - """Mock handler that fails the transcription with soft error.""" - - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - raise McPodcastFetchTranscriptHardException("Some sort of a permanent problem") - - @classmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - raise NotImplemented("Shouldn't be called.") - - -class MockErrorTestCase(AbstractMockFetchStoreTestCase): - - def test_error(self): - handler = MockTranscriptErrorWithExceptionHandler() - - with pytest.raises(McPodcastFetchTranscriptHardException): - fetch_store_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=self.podcast_episode_transcript_fetches_id, - handler=handler, - ) - - transcript_fetches = self.db.select(table='podcast_episode_transcript_fetches', what_to_select='*').hashes() - assert len(transcript_fetches) == 1 - - transcript_fetch = transcript_fetches[0] - assert transcript_fetch['fetched_at'] - assert transcript_fetch['result'] == 'error' - assert 'permanent problem' in transcript_fetch['error_message'] diff --git a/apps/podcast-fetch-transcript/tests/python/test_mock_not_done.py b/apps/podcast-fetch-transcript/tests/python/test_mock_not_done.py deleted file mode 100644 index 1d4e7cf1f9..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_mock_not_done.py +++ /dev/null @@ -1,53 +0,0 @@ -from typing import Optional - -from mediawords.db import DatabaseHandler - -from podcast_fetch_transcript.fetch_store import fetch_store_transcript 
-from podcast_fetch_transcript.handler import AbstractHandler -from podcast_fetch_transcript.transcript import Transcript - -from .setup_mock_fetch_store import AbstractMockFetchStoreTestCase - - -class MockTranscriptNotDoneHandler(AbstractHandler): - """Mock handler that reports that the transcript is not yet done.""" - - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - return None - - @classmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - raise NotImplemented("Shouldn't be called.") - - -class MockFailedTestCase(AbstractMockFetchStoreTestCase): - - def test_not_done(self): - handler = MockTranscriptNotDoneHandler() - - stories_id = fetch_store_transcript( - db=self.db, - podcast_episode_transcript_fetches_id=self.podcast_episode_transcript_fetches_id, - handler=handler, - ) - assert stories_id is None - - transcript_fetches = self.db.query(""" - SELECT * - FROM podcast_episode_transcript_fetches - ORDER BY podcast_episode_transcript_fetches_id - """).hashes() - assert len(transcript_fetches) == 2, "One fetch that's still in progress, another one added for the future." 
- - transcript_fetch_in_progress = transcript_fetches[0] - assert transcript_fetch_in_progress['fetched_at'] - assert transcript_fetch_in_progress['result'] == 'in_progress' - assert not transcript_fetch_in_progress['error_message'] - - transcript_fetch_readded = transcript_fetches[1] - assert transcript_fetch_readded['add_to_queue_at'] - assert not transcript_fetch_readded['added_to_queue_at'] - assert not transcript_fetch_readded['fetched_at'] - assert not transcript_fetch_readded['result'] - assert not transcript_fetch_readded['error_message'] diff --git a/apps/podcast-fetch-transcript/tests/python/test_mock_success.py b/apps/podcast-fetch-transcript/tests/python/test_mock_success.py deleted file mode 100644 index d388b58f0b..0000000000 --- a/apps/podcast-fetch-transcript/tests/python/test_mock_success.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Optional - -from mediawords.db import DatabaseHandler -from podcast_fetch_transcript.fetch_store import fetch_store_transcript - -from podcast_fetch_transcript.handler import AbstractHandler -from podcast_fetch_transcript.transcript import Transcript, Utterance, UtteranceAlternative - -from .setup_mock_fetch_store import AbstractMockFetchStoreTestCase - - -class MockTranscriptSuccessHandler(AbstractHandler): - """Mock handler that fetches the transcription successfully.""" - - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - return Transcript( - stories_id=42, - utterances=[ - Utterance( - alternatives=[ - UtteranceAlternative( - text='Kim Kardashian.', - confidence=1.00, - ) - ], - bcp47_language_code='en-US', - ), - ] - ) - - @classmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - return transcript.stories_id - - -class MockSuccessTestCase(AbstractMockFetchStoreTestCase): - - def test_success(self): - handler = MockTranscriptSuccessHandler() - - stories_id = fetch_store_transcript( - 
db=self.db, - podcast_episode_transcript_fetches_id=self.podcast_episode_transcript_fetches_id, - handler=handler, - ) - assert stories_id - - transcript_fetches = self.db.select(table='podcast_episode_transcript_fetches', what_to_select='*').hashes() - assert len(transcript_fetches) == 1 - - transcript_fetch = transcript_fetches[0] - assert transcript_fetch['fetched_at'] - assert transcript_fetch['result'] == 'success' - assert not transcript_fetch['error_message'] diff --git a/apps/podcast-poll-due-operations/.dockerignore b/apps/podcast-poll-due-operations/.dockerignore deleted file mode 100644 index 9b2c362a80..0000000000 --- a/apps/podcast-poll-due-operations/.dockerignore +++ /dev/null @@ -1,92 +0,0 @@ -# -# Files from the build context to be ignored by "docker build". -# -# You might want to add as many of constantly changing files here as possible -# to prevent container's image from getting rebuilt every full moon. -# -# Unfortunately, we can't just symlink this file to every app's directory: -# -# https://github.com/moby/moby/issues/12886 -# -# so for the time being you have to manually copy this file to every app -# subdirectory: -# -# cd apps/ -# find . -maxdepth 1 -type d \( ! -name . 
\) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; -# - -*$py.class -*.cover -*.DS_Store -*.egg -*.egg-info/ -*.log -*.manifest -*.mo -*.pot -*.py[cod] -*.sage.py -*.so -*.spec -*.swp -*/*.py[cod] -*/*.swp -*/*/*.py[cod] -*/*/*.swp -*/*/*/*.py[cod] -*/*/*/*.swp -*/*/*/__pycache__/ -*/*/__pycache__/ -*/__pycache__/ -._* -.apdisk -.AppleDB -.AppleDesktop -.AppleDouble -.cache -.com.apple.timemachine.donotpresent -.coverage -.coverage.* -.dockerignore -.DocumentRevisions-V100 -.DS_Store -.eggs -.env -.fseventsd -.git -.gitignore -.hypothesis -.idea -.installed.cfg -.ipynb_checkpoints -.LSOverride -.mypy_cache -.pytest_cache -.Python -.python-version -.ropeproject -.scrapy -.Spotlight-V100 -.spyderproject -.spyproject -.TemporaryItems -.tox -.Trashes -.venv -.VolumeIcon.icns -.webassets-cache -__pycache__ -celerybeat-schedule -coverage.xml -Icon -local_settings.py -Network Trash Folder -nosetests.xml -parts -pip-delete-this-directory.txt -pip-log.txt -sdist -Temporary Items -wheels -_Inline - diff --git a/apps/podcast-poll-due-operations/.idea/externalDependencies.xml b/apps/podcast-poll-due-operations/.idea/externalDependencies.xml deleted file mode 100644 index 7872ffbcf2..0000000000 --- a/apps/podcast-poll-due-operations/.idea/externalDependencies.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/inspectionProfiles/profiles_settings.xml b/apps/podcast-poll-due-operations/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2da2d..0000000000 --- a/apps/podcast-poll-due-operations/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/misc.xml b/apps/podcast-poll-due-operations/.idea/misc.xml deleted file mode 100644 index 46a8a5a238..0000000000 --- a/apps/podcast-poll-due-operations/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - 
- - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/modules.xml b/apps/podcast-poll-due-operations/.idea/modules.xml deleted file mode 100644 index d113be0932..0000000000 --- a/apps/podcast-poll-due-operations/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/podcast-poll-due-operations.iml b/apps/podcast-poll-due-operations/.idea/podcast-poll-due-operations.iml deleted file mode 100644 index 83a606a6bd..0000000000 --- a/apps/podcast-poll-due-operations/.idea/podcast-poll-due-operations.iml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/runConfigurations/Dockerfile.xml b/apps/podcast-poll-due-operations/.idea/runConfigurations/Dockerfile.xml deleted file mode 100644 index e6a39721ac..0000000000 --- a/apps/podcast-poll-due-operations/.idea/runConfigurations/Dockerfile.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/.idea/vcs.xml b/apps/podcast-poll-due-operations/.idea/vcs.xml deleted file mode 100644 index b2bdec2d71..0000000000 --- a/apps/podcast-poll-due-operations/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-poll-due-operations/Dockerfile b/apps/podcast-poll-due-operations/Dockerfile deleted file mode 100644 index 9de70b053f..0000000000 --- a/apps/podcast-poll-due-operations/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -# -# Poll database for operations which should be done by now, add transcription fetch for due operations -# - -FROM gcr.io/mcback/common:latest - -# Copy sources -COPY src/ /opt/mediacloud/src/podcast-poll-due-operations/ -ENV PERL5LIB="/opt/mediacloud/src/podcast-poll-due-operations/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-poll-due-operations/python:${PYTHONPATH}" - -# Copy 
worker script -COPY bin /opt/mediacloud/bin - -USER mediacloud - -CMD ["podcast_poll_due_operations_worker.py"] diff --git a/apps/podcast-poll-due-operations/bin/podcast_poll_due_operations_worker.py b/apps/podcast-poll-due-operations/bin/podcast_poll_due_operations_worker.py deleted file mode 100755 index a87a2763ad..0000000000 --- a/apps/podcast-poll-due-operations/bin/podcast_poll_due_operations_worker.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -from mediawords.job import JobBroker -from mediawords.util.process import fatal_error - -from podcast_poll_due_operations.due_operations import poll_for_due_operations, AbstractFetchTranscriptQueue - - -class JobBrokerFetchTranscriptQueue(AbstractFetchTranscriptQueue): - """Add fetch transcript jobs to job broker's queue.""" - - def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None: - JobBroker(queue_name='MediaWords::Job::Podcast::FetchTranscript').add_to_queue( - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id, - ) - - -if __name__ == '__main__': - try: - fetch_transcript_queue = JobBrokerFetchTranscriptQueue() - poll_for_due_operations(fetch_transcript_queue=fetch_transcript_queue) - except Exception as ex: - # Hard and unknown errors (no soft errors here) - fatal_error(f"Unable to poll for due operations: {ex}") diff --git a/apps/podcast-poll-due-operations/docker-compose.tests.yml b/apps/podcast-poll-due-operations/docker-compose.tests.yml deleted file mode 100644 index 584b501fef..0000000000 --- a/apps/podcast-poll-due-operations/docker-compose.tests.yml +++ /dev/null @@ -1,54 +0,0 @@ -version: "3.7" - -services: - - podcast-poll-due-operations: - image: gcr.io/mcback/podcast-poll-due-operations:latest - init: true - stop_signal: SIGKILL - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-poll-due-operations/ - - type: bind - source: ./tests/ - target: 
/opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - # We don't need "rabbitmq-server" to run tests - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-base/conf/ - target: /etc/postgresql/11/main/ diff --git a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/due_operations.py b/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/due_operations.py deleted file mode 100644 index ea82dc8949..0000000000 --- a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/due_operations.py +++ /dev/null @@ -1,112 +0,0 @@ -import abc -import time - -from mediawords.db import connect_to_db -from mediawords.util.log import create_logger - -from podcast_poll_due_operations.exceptions import McJobBrokerErrorException - -log = create_logger(__name__) - - -class AbstractFetchTranscriptQueue(object, metaclass=abc.ABCMeta): - """ - Abstract class for adding a story ID to the "podcast-fetch-transcript" queue. - - Useful for testing as having such a class can help us find out whether stories get added to the actual job queue. - """ - - @abc.abstractmethod - def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None: - """ - Add story ID to "podcast-fetch-transcript" job queue. - - :param podcast_episode_transcript_fetches_id: Transcript fetch ID. 
- """ - raise NotImplemented("Abstract method") - - -def poll_for_due_operations(fetch_transcript_queue: AbstractFetchTranscriptQueue, - stop_after_first_empty_chunk: bool = False, - wait_after_empty_poll: int = 30, - stories_chunk_size: int = 100) -> None: - """ - Continuously poll for due operations, add such operations to "podcast-fetch-transcript" queue. - - Never returns, unless 'stop_after_first_empty_chunk' is set. - - :param fetch_transcript_queue: Queue helper object to use for adding a story ID to "podcast-fetch-transcript" - queue (useful for testing). - :param stop_after_first_empty_chunk: If True, stop after the first attempt to fetch a chunk of due story IDs comes - out empty (useful for testing). - :param wait_after_empty_poll: Seconds to wait after there were no due story IDs found. - :param stories_chunk_size: Max. due story IDs to fetch in one go; the chunk will be deleted + returned in a - transaction, which will get reverted if RabbitMQ fails, so we don't want to - hold that transaction for too long. 
- """ - - if not fetch_transcript_queue: - raise McJobBrokerErrorException(f"Fetch transcript queue object is unset.") - - while True: - - db = connect_to_db() - - log.info("Polling...") - due_operations = db.query(""" - SELECT - podcast_episode_transcript_fetches_id, - add_to_queue_at - FROM podcast_episode_transcript_fetches - - -- Transcript fetch is due - WHERE add_to_queue_at <= NOW() - - -- Transcript fetch wasn't added to the job broker's queue yet - AND podcast_episode_transcript_was_added_to_queue(added_to_queue_at) = 'f' - - -- Get the oldest operations first - ORDER BY add_to_queue_at - - -- Don't fetch too much of stories at once - LIMIT %(stories_chunk_size)s - """, { - 'stories_chunk_size': stories_chunk_size, - }).hashes() - - if due_operations: - - try: - log.info(f"Adding {len(due_operations)} due operations to the transcription fetch queue...") - - for operation in due_operations: - podcast_episode_transcript_fetches_id = operation['podcast_episode_transcript_fetches_id'] - log.debug( - f"Adding fetch ID {podcast_episode_transcript_fetches_id} to the transcription fetch queue..." 
- ) - fetch_transcript_queue.add_to_queue( - podcast_episode_transcript_fetches_id=podcast_episode_transcript_fetches_id, - ) - - # Update "added_to_queue_at" individually in case RabbitMQ decides to fail on us - db.query(""" - UPDATE podcast_episode_transcript_fetches - SET added_to_queue_at = NOW() - WHERE podcast_episode_transcript_fetches_id = %(podcast_episode_transcript_fetches_id)s - """, { - 'podcast_episode_transcript_fetches_id': podcast_episode_transcript_fetches_id, - }) - - log.info(f"Done adding {len(due_operations)} due operations to the transcription fetch queue") - except Exception as ex: - - raise McJobBrokerErrorException(f"Unable to add one or more stories the the job queue: {ex}") - - else: - - if stop_after_first_empty_chunk: - log.info(f"No due story IDs found, stopping...") - break - else: - log.info(f"No due story IDs found, waiting for {wait_after_empty_poll} seconds...") - time.sleep(wait_after_empty_poll) diff --git a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/exceptions.py b/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/exceptions.py deleted file mode 100644 index 1bc47e477a..0000000000 --- a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/exceptions.py +++ /dev/null @@ -1,13 +0,0 @@ -class McPodcastPollDueOperationsHardException(Exception): - """Hard errors exception.""" - pass - - -class McDatabaseErrorException(McPodcastPollDueOperationsHardException): - """Exception thrown when we encounter a database error.""" - pass - - -class McJobBrokerErrorException(McPodcastPollDueOperationsHardException): - """Exception thrown when we encounter a job broker (RabbitMQ) error.""" - pass diff --git a/apps/podcast-poll-due-operations/tests/python/__init__.py b/apps/podcast-poll-due-operations/tests/python/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/podcast-poll-due-operations/tests/python/setup_due_operation.py 
b/apps/podcast-poll-due-operations/tests/python/setup_due_operation.py deleted file mode 100644 index 5fcc58e52d..0000000000 --- a/apps/podcast-poll-due-operations/tests/python/setup_due_operation.py +++ /dev/null @@ -1,55 +0,0 @@ -import abc -from unittest import TestCase - -from mediawords.db import connect_to_db -from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story - - -class SetupTestOperation(TestCase, metaclass=abc.ABCMeta): - __slots__ = [ - 'db', - 'test_medium', - 'test_feed', - 'story', - 'stories_id', - ] - - def setUp(self): - self.db = connect_to_db() - - self.test_medium = create_test_medium(db=self.db, label='test') - self.test_feed = create_test_feed(db=self.db, label='test', medium=self.test_medium) - self.story = create_test_story(db=self.db, label='test', feed=self.test_feed) - - stories_id = self.story['stories_id'] - - enclosure = self.db.insert(table='story_enclosures', insert_hash={ - 'stories_id': stories_id, - # URL doesn't really matter as we won't be fetching it - 'url': 'http://example.com/', - 'mime_type': 'audio/mpeg', - 'length': 100000, - }) - - episode = self.db.insert(table='podcast_episodes', insert_hash={ - 'stories_id': stories_id, - 'story_enclosures_id': enclosure['story_enclosures_id'], - 'gcs_uri': 'gs://whatever', - 'duration': 1, - 'codec': 'MP3', - 'sample_rate': 44100, - 'bcp47_language_code': 'en-US', - 'speech_operation_id': 'foo', - }) - - self.db.query(""" - INSERT INTO podcast_episode_transcript_fetches ( - podcast_episodes_id, - add_to_queue_at - ) VALUES ( - %(podcast_episodes_id)s, - NOW() - ) - """, { - 'podcast_episodes_id': episode['podcast_episodes_id'], - }) diff --git a/apps/podcast-poll-due-operations/tests/python/test_due_operations.py b/apps/podcast-poll-due-operations/tests/python/test_due_operations.py deleted file mode 100644 index d121178e03..0000000000 --- a/apps/podcast-poll-due-operations/tests/python/test_due_operations.py +++ /dev/null @@ -1,40 +0,0 @@ 
-from podcast_poll_due_operations.due_operations import poll_for_due_operations, AbstractFetchTranscriptQueue - -from .setup_due_operation import SetupTestOperation - - -class MockCounterFetchTranscriptQueue(AbstractFetchTranscriptQueue): - __slots__ = [ - 'story_count', - ] - - def __init__(self): - self.story_count = 0 - - def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None: - self.story_count += 1 - - -class TestPollForDueOperations(SetupTestOperation): - - def test_poll_for_due_operations(self): - """Simple test.""" - - fetch_transcript_queue = MockCounterFetchTranscriptQueue() - - poll_for_due_operations( - fetch_transcript_queue=fetch_transcript_queue, - stop_after_first_empty_chunk=True, - ) - - all_fetches = self.db.select( - table='podcast_episode_transcript_fetches', - what_to_select='*', - ).hashes() - - assert len(all_fetches) == 1, "The fetch should have been kept in the table." - fetch = all_fetches[0] - - assert fetch['added_to_queue_at'], "Timestamp for when the fetch as added to the queue should be set." - - assert fetch_transcript_queue.story_count == 1, "A single story should have been added to the fetch queue." 
diff --git a/apps/podcast-poll-due-operations/tests/python/test_failing_job_broker.py b/apps/podcast-poll-due-operations/tests/python/test_failing_job_broker.py deleted file mode 100644 index a0e6897f80..0000000000 --- a/apps/podcast-poll-due-operations/tests/python/test_failing_job_broker.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest - -from podcast_poll_due_operations.due_operations import poll_for_due_operations, AbstractFetchTranscriptQueue -from podcast_poll_due_operations.exceptions import McJobBrokerErrorException - -from .setup_due_operation import SetupTestOperation - - -class MockFailingFetchTranscriptQueue(AbstractFetchTranscriptQueue): - - def add_to_queue(self, podcast_episode_transcript_fetches_id: int) -> None: - raise Exception("Job broker is down") - - -class TestFailingJobBroker(SetupTestOperation): - - def test_failing_job_broker(self): - """Test what happens if the job broker fails.""" - - fetch_transcript_queue = MockFailingFetchTranscriptQueue() - - with pytest.raises(McJobBrokerErrorException): - poll_for_due_operations( - fetch_transcript_queue=fetch_transcript_queue, - stop_after_first_empty_chunk=True, - ) - - all_fetches = self.db.select( - table='podcast_episode_transcript_fetches', - what_to_select='*', - ).hashes() - - assert len(all_fetches) == 1, "The fetch should have been kept in the table." - fetch = all_fetches[0] - - assert not fetch['added_to_queue_at'], "Timestamp for when the fetch as added to the queue should be empty." diff --git a/apps/podcast-submit-operation/.dockerignore b/apps/podcast-submit-operation/.dockerignore deleted file mode 100644 index 9b2c362a80..0000000000 --- a/apps/podcast-submit-operation/.dockerignore +++ /dev/null @@ -1,92 +0,0 @@ -# -# Files from the build context to be ignored by "docker build". -# -# You might want to add as many of constantly changing files here as possible -# to prevent container's image from getting rebuilt every full moon. 
-# -# Unfortunately, we can't just symlink this file to every app's directory: -# -# https://github.com/moby/moby/issues/12886 -# -# so for the time being you have to manually copy this file to every app -# subdirectory: -# -# cd apps/ -# find . -maxdepth 1 -type d \( ! -name . \) -exec bash -c "cd '{}' && cp ../dockerignore.dist ./.dockerignore" \; -# - -*$py.class -*.cover -*.DS_Store -*.egg -*.egg-info/ -*.log -*.manifest -*.mo -*.pot -*.py[cod] -*.sage.py -*.so -*.spec -*.swp -*/*.py[cod] -*/*.swp -*/*/*.py[cod] -*/*/*.swp -*/*/*/*.py[cod] -*/*/*/*.swp -*/*/*/__pycache__/ -*/*/__pycache__/ -*/__pycache__/ -._* -.apdisk -.AppleDB -.AppleDesktop -.AppleDouble -.cache -.com.apple.timemachine.donotpresent -.coverage -.coverage.* -.dockerignore -.DocumentRevisions-V100 -.DS_Store -.eggs -.env -.fseventsd -.git -.gitignore -.hypothesis -.idea -.installed.cfg -.ipynb_checkpoints -.LSOverride -.mypy_cache -.pytest_cache -.Python -.python-version -.ropeproject -.scrapy -.Spotlight-V100 -.spyderproject -.spyproject -.TemporaryItems -.tox -.Trashes -.venv -.VolumeIcon.icns -.webassets-cache -__pycache__ -celerybeat-schedule -coverage.xml -Icon -local_settings.py -Network Trash Folder -nosetests.xml -parts -pip-delete-this-directory.txt -pip-log.txt -sdist -Temporary Items -wheels -_Inline - diff --git a/apps/podcast-submit-operation/.idea/externalDependencies.xml b/apps/podcast-submit-operation/.idea/externalDependencies.xml deleted file mode 100644 index 7872ffbcf2..0000000000 --- a/apps/podcast-submit-operation/.idea/externalDependencies.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/inspectionProfiles/profiles_settings.xml b/apps/podcast-submit-operation/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2da2d..0000000000 --- a/apps/podcast-submit-operation/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of 
file diff --git a/apps/podcast-submit-operation/.idea/misc.xml b/apps/podcast-submit-operation/.idea/misc.xml deleted file mode 100644 index 06b8bbff3f..0000000000 --- a/apps/podcast-submit-operation/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/modules.xml b/apps/podcast-submit-operation/.idea/modules.xml deleted file mode 100644 index 26bb21f27e..0000000000 --- a/apps/podcast-submit-operation/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/podcast-submit-operation.iml b/apps/podcast-submit-operation/.idea/podcast-submit-operation.iml deleted file mode 100644 index 1c3aa105bd..0000000000 --- a/apps/podcast-submit-operation/.idea/podcast-submit-operation.iml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/runConfigurations/Dockefile.xml b/apps/podcast-submit-operation/.idea/runConfigurations/Dockefile.xml deleted file mode 100644 index b5e047b7b0..0000000000 --- a/apps/podcast-submit-operation/.idea/runConfigurations/Dockefile.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/sqldialects.xml b/apps/podcast-submit-operation/.idea/sqldialects.xml deleted file mode 100644 index 790b3f37f8..0000000000 --- a/apps/podcast-submit-operation/.idea/sqldialects.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/.idea/vcs.xml b/apps/podcast-submit-operation/.idea/vcs.xml deleted file mode 100644 index b2bdec2d71..0000000000 --- a/apps/podcast-submit-operation/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/podcast-submit-operation/Dockerfile b/apps/podcast-submit-operation/Dockerfile deleted file mode 100644 index 
acf06f32e4..0000000000 --- a/apps/podcast-submit-operation/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# -# Submit a long running operation to Google Speech to Text API for it to transcribe the episode -# - -FROM gcr.io/mcback/common:latest - -# Install Python dependencies -COPY src/requirements.txt /var/tmp/ -RUN \ - cd /var/tmp/ && \ - pip3 install -r requirements.txt && \ - rm requirements.txt && \ - rm -rf /root/.cache/ && \ - true - -# Copy sources -COPY src/ /opt/mediacloud/src/podcast-submit-operation/ -ENV PERL5LIB="/opt/mediacloud/src/podcast-submit-operation/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-submit-operation/python:${PYTHONPATH}" - -# Copy worker script -COPY bin /opt/mediacloud/bin - -USER mediacloud - -CMD ["podcast_submit_operation_worker.py"] diff --git a/apps/podcast-submit-operation/bin/podcast_submit_operation_worker.py b/apps/podcast-submit-operation/bin/podcast_submit_operation_worker.py deleted file mode 100755 index ae3b712e80..0000000000 --- a/apps/podcast-submit-operation/bin/podcast_submit_operation_worker.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python3 - -from mediawords.db import connect_to_db -from mediawords.job import JobBroker -from mediawords.util.log import create_logger -from mediawords.util.perl import decode_object_from_bytes_if_needed -from mediawords.util.process import fatal_error - -from podcast_submit_operation.exceptions import McPodcastSubmitOperationSoftException -from podcast_submit_operation.submit_operation import get_podcast_episode, submit_transcribe_operation - -log = create_logger(__name__) - -ADD_TO_QUEUE_AT_DURATION_MULTIPLIER = 1.1 -""" -How soon to expect the transcription results to become available in relation to episode's duration. - -For example, if the episode's duration is 60 minutes, and the multiplier is 1.1, the transcription results fetch will -first be attempted after 60 * 1.1 = 66 minutes. 
-""" - - -def run_podcast_submit_operation(stories_id: int) -> None: - """Submit a podcast episode to the Speech API.""" - - if isinstance(stories_id, bytes): - stories_id = decode_object_from_bytes_if_needed(stories_id) - stories_id = int(stories_id) - - db = connect_to_db() - - log.info(f"Submitting story's {stories_id} podcast episode for transcription...") - - try: - episode = get_podcast_episode(db=db, stories_id=stories_id) - speech_operation_id = submit_transcribe_operation(episode=episode) - - db.query(""" - UPDATE podcast_episodes - SET speech_operation_id = %(speech_operation_id)s - WHERE podcast_episodes_id = %(podcast_episodes_id)s - """, { - 'podcast_episodes_id': episode.podcast_episodes_id, - 'speech_operation_id': speech_operation_id, - }) - - add_to_queue_interval = f"{int(episode.duration + ADD_TO_QUEUE_AT_DURATION_MULTIPLIER)} seconds" - db.query(""" - INSERT INTO podcast_episode_transcript_fetches ( - podcast_episodes_id, - add_to_queue_at - ) VALUES ( - %(podcast_episodes_id)s, - NOW() + INTERVAL %(add_to_queue_interval)s - ) - """, { - 'podcast_episodes_id': episode.podcast_episodes_id, - 'add_to_queue_interval': add_to_queue_interval, - }) - - except McPodcastSubmitOperationSoftException as ex: - # Soft exceptions - log.error(f"Unable to submit podcast episode for story {stories_id}: {ex}") - raise ex - - except Exception as ex: - # Hard and other exceptions - fatal_error(f"Fatal / unknown error while submitting podcast episode for story {stories_id}: {ex}") - - log.info(f"Done submitting story's {stories_id} podcast episode for transcription") - - -if __name__ == '__main__': - app = JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation') - app.start_worker(handler=run_podcast_submit_operation) diff --git a/apps/podcast-submit-operation/docker-compose.tests.yml b/apps/podcast-submit-operation/docker-compose.tests.yml deleted file mode 100644 index 85d255dad3..0000000000 --- a/apps/podcast-submit-operation/docker-compose.tests.yml 
+++ /dev/null @@ -1,56 +0,0 @@ -version: "3.7" - -services: - - podcast-submit-operation: - image: gcr.io/mcback/podcast-submit-operation:latest - init: true - stop_signal: SIGKILL - environment: - MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" - volumes: - - type: bind - source: ./bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./src/ - target: /opt/mediacloud/src/podcast-submit-operation/ - - type: bind - source: ./tests/ - target: /opt/mediacloud/tests/ - - type: bind - source: ./../common/src/ - target: /opt/mediacloud/src/common/ - depends_on: - - postgresql-pgbouncer - # We don't need "rabbitmq-server" to run tests - - postgresql-pgbouncer: - image: gcr.io/mcback/postgresql-pgbouncer:latest - init: true - stop_signal: SIGKILL - expose: - - 6432 - volumes: - - type: bind - source: ./../postgresql-pgbouncer/conf/ - target: /etc/pgbouncer/ - depends_on: - - postgresql-server - - postgresql-server: - image: gcr.io/mcback/postgresql-server:latest - init: true - stop_signal: SIGKILL - expose: - - 5432 - volumes: - - type: bind - source: ./../postgresql-server/bin/ - target: /opt/mediacloud/bin/ - - type: bind - source: ./../postgresql-server/schema/ - target: /opt/mediacloud/schema/ - - type: bind - source: ./../postgresql-base/conf/ - target: /etc/postgresql/11/main/ diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/__init__.py b/apps/podcast-submit-operation/src/python/podcast_submit_operation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/config.py b/apps/podcast-submit-operation/src/python/podcast_submit_operation/config.py deleted file mode 100644 index 0d20337c98..0000000000 --- a/apps/podcast-submit-operation/src/python/podcast_submit_operation/config.py +++ /dev/null @@ -1,12 +0,0 @@ -from mediawords.util.config import file_with_env_value - - -class PodcastSubmitOperationConfig(object): - """ - 
Podcast submit transcription operation configuration. - """ - - @staticmethod - def gc_auth_json_file() -> str: - """Return path to Google Cloud authentication JSON file.""" - return file_with_env_value(name='MC_PODCAST_GC_AUTH_JSON_BASE64', encoded_with_base64=True) diff --git a/apps/podcast-submit-operation/src/requirements.txt b/apps/podcast-submit-operation/src/requirements.txt deleted file mode 100644 index 59e80a7b73..0000000000 --- a/apps/podcast-submit-operation/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -google-cloud-speech==2.0.1 diff --git a/apps/podcast-submit-operation/tests/python/test_submit_operation.py b/apps/podcast-submit-operation/tests/python/test_submit_operation.py deleted file mode 100644 index 013037530c..0000000000 --- a/apps/podcast-submit-operation/tests/python/test_submit_operation.py +++ /dev/null @@ -1,40 +0,0 @@ -from mediawords.db import connect_to_db -from mediawords.test.db.create import create_test_medium, create_test_feed, create_test_story - -from podcast_submit_operation.submit_operation import get_podcast_episode, submit_transcribe_operation - - -def test_submit_transcribe_operation(): - test_gcs_uri = "gs://mc-podcast-sample-audio-files/samples/kim_kardashian-mp3.mp3" - - db = connect_to_db() - test_medium = create_test_medium(db=db, label='test') - test_feed = create_test_feed(db=db, label='test', medium=test_medium) - story = create_test_story(db=db, label='test', feed=test_feed) - - stories_id = story['stories_id'] - - enclosure = db.insert(table='story_enclosures', insert_hash={ - 'stories_id': stories_id, - # URL doesn't really matter as we won't be fetching it - 'url': 'http://example.com/', - 'mime_type': 'audio/mpeg', - 'length': 100000, - }) - - db.insert(table='podcast_episodes', insert_hash={ - 'stories_id': stories_id, - 'story_enclosures_id': enclosure['story_enclosures_id'], - 'gcs_uri': test_gcs_uri, - - # We lie about the duration because we want to test whether 'add_to_queue_at' will be set way into 
the future - 'duration': 60 * 60, - - 'codec': 'MP3', - 'sample_rate': 44100, - 'bcp47_language_code': 'en-US', - }) - - episode = get_podcast_episode(db=db, stories_id=stories_id) - speech_operation_id = submit_transcribe_operation(episode=episode) - assert speech_operation_id diff --git a/apps/podcast-fetch-episode/.dockerignore b/apps/podcast-transcribe-episode/.dockerignore similarity index 100% rename from apps/podcast-fetch-episode/.dockerignore rename to apps/podcast-transcribe-episode/.dockerignore diff --git a/apps/podcast-transcribe-episode/.idea/.gitignore b/apps/podcast-transcribe-episode/.idea/.gitignore new file mode 100644 index 0000000000..73f69e0958 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/apps/podcast-fetch-episode/.idea/externalDependencies.xml b/apps/podcast-transcribe-episode/.idea/externalDependencies.xml similarity index 100% rename from apps/podcast-fetch-episode/.idea/externalDependencies.xml rename to apps/podcast-transcribe-episode/.idea/externalDependencies.xml diff --git a/apps/podcast-transcribe-episode/.idea/inspectionProfiles/Project_Default.xml b/apps/podcast-transcribe-episode/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000000..fe9d3b7548 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,15 @@ + + + + \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/inspectionProfiles/profiles_settings.xml b/apps/podcast-transcribe-episode/.idea/inspectionProfiles/profiles_settings.xml similarity index 100% rename from apps/podcast-fetch-episode/.idea/inspectionProfiles/profiles_settings.xml rename to apps/podcast-transcribe-episode/.idea/inspectionProfiles/profiles_settings.xml diff --git 
a/apps/podcast-transcribe-episode/.idea/mediawords.sql b/apps/podcast-transcribe-episode/.idea/mediawords.sql new file mode 120000 index 0000000000..08fc9a64b9 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/mediawords.sql @@ -0,0 +1 @@ +../../postgresql-server/schema/mediawords.sql \ No newline at end of file diff --git a/apps/podcast-transcribe-episode/.idea/misc.xml b/apps/podcast-transcribe-episode/.idea/misc.xml new file mode 100644 index 0000000000..d89177f747 --- /dev/null +++ b/apps/podcast-transcribe-episode/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/modules.xml b/apps/podcast-transcribe-episode/.idea/modules.xml similarity index 51% rename from apps/podcast-fetch-episode/.idea/modules.xml rename to apps/podcast-transcribe-episode/.idea/modules.xml index 1f8ef01409..9023537213 100644 --- a/apps/podcast-fetch-episode/.idea/modules.xml +++ b/apps/podcast-transcribe-episode/.idea/modules.xml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/apps/podcast-fetch-transcript/.idea/podcast-fetch-transcript.iml b/apps/podcast-transcribe-episode/.idea/podcast-transcribe-episode.iml similarity index 80% rename from apps/podcast-fetch-transcript/.idea/podcast-fetch-transcript.iml rename to apps/podcast-transcribe-episode/.idea/podcast-transcribe-episode.iml index ffc8ff3cc9..16f0d9a079 100644 --- a/apps/podcast-fetch-transcript/.idea/podcast-fetch-transcript.iml +++ b/apps/podcast-transcribe-episode/.idea/podcast-transcribe-episode.iml @@ -2,7 +2,7 @@ - + diff --git a/apps/podcast-fetch-transcript/.idea/runConfigurations/Dockerfile.xml b/apps/podcast-transcribe-episode/.idea/runConfigurations/Dockerfile.xml similarity index 83% rename from apps/podcast-fetch-transcript/.idea/runConfigurations/Dockerfile.xml rename to apps/podcast-transcribe-episode/.idea/runConfigurations/Dockerfile.xml index 3f86f834cf..85f79e0693 100644 --- 
a/apps/podcast-fetch-transcript/.idea/runConfigurations/Dockerfile.xml +++ b/apps/podcast-transcribe-episode/.idea/runConfigurations/Dockerfile.xml @@ -2,12 +2,12 @@ - diff --git a/apps/podcast-poll-due-operations/.idea/sqldialects.xml b/apps/podcast-transcribe-episode/.idea/sqldialects.xml similarity index 62% rename from apps/podcast-poll-due-operations/.idea/sqldialects.xml rename to apps/podcast-transcribe-episode/.idea/sqldialects.xml index 790b3f37f8..f8c2c59528 100644 --- a/apps/podcast-poll-due-operations/.idea/sqldialects.xml +++ b/apps/podcast-transcribe-episode/.idea/sqldialects.xml @@ -1,7 +1,7 @@ - + \ No newline at end of file diff --git a/apps/podcast-fetch-episode/.idea/vcs.xml b/apps/podcast-transcribe-episode/.idea/vcs.xml similarity index 100% rename from apps/podcast-fetch-episode/.idea/vcs.xml rename to apps/podcast-transcribe-episode/.idea/vcs.xml diff --git a/apps/podcast-fetch-episode/Dockerfile b/apps/podcast-transcribe-episode/Dockerfile similarity index 54% rename from apps/podcast-fetch-episode/Dockerfile rename to apps/podcast-transcribe-episode/Dockerfile index 6bb28d4eb4..3dfec89ac1 100644 --- a/apps/podcast-fetch-episode/Dockerfile +++ b/apps/podcast-transcribe-episode/Dockerfile @@ -1,5 +1,5 @@ # -# Fetch podcast episode from story, read metadata, store it to GCS +# Fetch podcast episode, convert it (if needed), transcribe and store to the database # FROM gcr.io/mcback/common:latest @@ -17,13 +17,13 @@ RUN \ true # Copy sources -COPY src/ /opt/mediacloud/src/podcast-fetch-episode/ -ENV PERL5LIB="/opt/mediacloud/src/podcast-fetch-episode/perl:${PERL5LIB}" \ - PYTHONPATH="/opt/mediacloud/src/podcast-fetch-episode/python:${PYTHONPATH}" +COPY src/ /opt/mediacloud/src/podcast-transcribe-episode/ +ENV PERL5LIB="/opt/mediacloud/src/podcast-transcribe-episode/perl:${PERL5LIB}" \ + PYTHONPATH="/opt/mediacloud/src/podcast-transcribe-episode/python:${PYTHONPATH}" # Copy worker script COPY bin /opt/mediacloud/bin USER mediacloud -CMD 
["podcast_fetch_episode_worker.py"] +CMD ["podcast_transcribe_episode_worker.py"] diff --git a/apps/podcast-fetch-episode/bin/podcast_fetch_episode_worker.py b/apps/podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py similarity index 88% rename from apps/podcast-fetch-episode/bin/podcast_fetch_episode_worker.py rename to apps/podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py index e8fc7ec433..c2c0e80a9e 100755 --- a/apps/podcast-fetch-episode/bin/podcast_fetch_episode_worker.py +++ b/apps/podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py @@ -6,8 +6,8 @@ from mediawords.util.perl import decode_object_from_bytes_if_needed from mediawords.util.process import fatal_error -from podcast_fetch_episode.exceptions import McPodcastFetchEpisodeSoftException -from podcast_fetch_episode.fetch_and_store import fetch_and_store_episode +from podcast_transcribe_episode.fetch_episode.exceptions import McPodcastFetchEpisodeSoftException +from podcast_transcribe_episode.fetch_episode.fetch_and_store import fetch_and_store_episode log = create_logger(__name__) diff --git a/apps/podcast-transcribe-episode/docker-compose.tests.yml b/apps/podcast-transcribe-episode/docker-compose.tests.yml new file mode 100644 index 0000000000..cdd4bf8601 --- /dev/null +++ b/apps/podcast-transcribe-episode/docker-compose.tests.yml @@ -0,0 +1,124 @@ +version: "3.7" + +services: + + podcast-transcribe-episode: + image: gcr.io/mcback/podcast-transcribe-episode:latest + init: true + stop_signal: SIGKILL + environment: + MC_PODCAST_GC_AUTH_JSON_BASE64: "${MC_PODCAST_GC_AUTH_JSON_BASE64}" + MC_PODCAST_FETCH_EPISODE_BUCKET_NAME: "${MC_PODCAST_FETCH_EPISODE_BUCKET_NAME}" + # Dev/test environments don't use "MC_PODCAST_FETCH_EPISODE_PATH_PREFIX" environment + # variable as they create a different, timestamped prefix for every test run. 
+ volumes: + - type: bind + source: ./bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./src/ + target: /opt/mediacloud/src/podcast-transcribe-episode/ + - type: bind + source: ./tests/ + target: /opt/mediacloud/tests/ + - type: bind + source: ./../common/src/ + target: /opt/mediacloud/src/common/ + depends_on: + - postgresql-pgbouncer + - temporal-server + # We don't need "rabbitmq-server" to run tests + + postgresql-pgbouncer: + image: gcr.io/mcback/postgresql-pgbouncer:latest + init: true + stop_signal: SIGKILL + expose: + - 6432 + volumes: + - type: bind + source: ./../postgresql-pgbouncer/conf/ + target: /etc/pgbouncer/ + depends_on: + - postgresql-server + + postgresql-server: + image: gcr.io/mcback/postgresql-server:latest + init: true + stop_signal: SIGKILL + expose: + - 5432 + volumes: + - type: bind + source: ./../postgresql-server/bin/ + target: /opt/mediacloud/bin/ + - type: bind + source: ./../postgresql-server/schema/ + target: /opt/mediacloud/schema/ + - type: bind + source: ./../postgresql-base/conf/ + target: /etc/postgresql/11/main/ + + temporal-server: + image: gcr.io/mcback/temporal-server:latest + init: true + stop_signal: SIGKILL + depends_on: + - temporal-postgresql + - temporal-elasticsearch + expose: + - 6933 + - 6934 + - 6935 + - 6939 + - 7233 + - 7234 + - 7235 + - 7239 + volumes: + - type: bind + source: ./../temporal-server/bin/ + target: /opt/temporal-server/bin/ + - type: bind + source: ./../temporal-server/config/dynamicconfig.yaml + target: /opt/temporal-server/config/dynamicconfig.yaml + - type: bind + source: ./../temporal-server/config/mediacloud_template.yaml + target: /opt/temporal-server/config/mediacloud_template.yaml + + temporal-postgresql: + image: gcr.io/mcback/temporal-postgresql:latest + init: true + stop_signal: SIGKILL + expose: + - 5432 + volumes: + - type: bind + source: ./../temporal-postgresql/bin/ + target: /opt/temporal-postgresql/bin/ + - type: bind + source: ./../postgresql-base/conf/ + target: 
/etc/postgresql/11/main/ + + temporal-elasticsearch: + image: gcr.io/mcback/temporal-elasticsearch:latest + init: true + stop_signal: SIGKILL + expose: + - "9200" + - "9300" + volumes: + - type: bind + source: ./../elasticsearch-base/bin/elasticsearch.sh + target: /opt/elasticsearch/bin/elasticsearch.sh + # Not mounting config as it gets concatenated into a single file + + temporal-webapp: + image: gcr.io/mcback/temporal-webapp:latest + init: true + stop_signal: SIGKILL + expose: + - "8088" + ports: + # Expose to host for debugging + - "8088:8088" diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/__init__.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/__init__.py similarity index 100% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/__init__.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/__init__.py diff --git a/apps/podcast-fetch-episode/tests/python/__init__.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/__init__.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/__init__.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/__init__.py diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/audio_codecs.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/audio_codecs.py similarity index 100% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/audio_codecs.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/audio_codecs.py diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/bcp47_lang.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/bcp47_lang.py similarity index 100% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/bcp47_lang.py rename to 
apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/bcp47_lang.py diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/config.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/config.py similarity index 100% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/config.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/config.py diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/enclosure.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py similarity index 62% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/enclosure.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py index 6d734c4d8b..e3193ebe43 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/enclosure.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py @@ -1,68 +1,15 @@ import dataclasses from typing import Optional, Dict, Any -from furl import furl - from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger -from mediawords.util.url import is_http_url log = create_logger(__name__) -_MP3_MIME_TYPES = {'audio/mpeg', 'audio/mpeg3', 'audio/mp3', 'audio/x-mpeg-3'} -"""MIME types which MP3 files might have.""" - MAX_ENCLOSURE_SIZE = 1024 * 1024 * 500 """Max. 
enclosure size (in bytes) that we're willing to download.""" -@dataclasses.dataclass -class StoryEnclosure(object): - """Single story enclosure derived from feed's element.""" - story_enclosures_id: int - url: str - mime_type: Optional[str] - length: Optional[int] - - def mime_type_is_mp3(self) -> bool: - """Return True if declared MIME type is one of the MP3 ones.""" - if self.mime_type: - if self.mime_type.lower() in _MP3_MIME_TYPES: - return True - return False - - def mime_type_is_audio(self) -> bool: - """Return True if declared MIME type is an audio type.""" - if self.mime_type: - if self.mime_type.lower().startswith('audio/'): - return True - return False - - def mime_type_is_video(self) -> bool: - """Return True if declared MIME type is a video type.""" - if self.mime_type: - if self.mime_type.lower().startswith('video/'): - return True - return False - - def url_path_has_mp3_extension(self) -> bool: - """Return True if URL's path has .mp3 extension.""" - if is_http_url(self.url): - uri = furl(self.url) - if '.mp3' in str(uri.path).lower(): - return True - return False - - @classmethod - def from_db_row(cls, db_row: Dict[str, Any]) -> 'StoryEnclosure': - return cls( - story_enclosures_id=db_row['story_enclosures_id'], - url=db_row['url'], - mime_type=db_row['mime_type'], - length=db_row['length'], - ) - - def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> Optional[StoryEnclosure]: """Fetch all enclosures, find and return the one that looks like a podcast episode the most (or None).""" story_enclosures_dicts = db.query(""" diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/exceptions.py similarity index 100% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/exceptions.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/exceptions.py diff --git 
a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_and_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py similarity index 93% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_and_store.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py index 82836d1dbf..1719ecfdca 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_and_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py @@ -8,10 +8,10 @@ from mediawords.util.log import create_logger from mediawords.util.parse_html import html_strip -from podcast_fetch_episode.bcp47_lang import iso_639_1_code_to_bcp_47_identifier -from podcast_fetch_episode.config import PodcastFetchEpisodeConfig -from podcast_fetch_episode.enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE -from podcast_fetch_episode.exceptions import ( +from .bcp47_lang import iso_639_1_code_to_bcp_47_identifier +from .config import PodcastFetchEpisodeConfig +from .enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE +from .exceptions import ( McStoryNotFoundException, McPodcastNoViableStoryEnclosuresException, McPodcastEnclosureTooBigException, @@ -20,9 +20,9 @@ McPodcastGCSStoreFailureException, McPodcastPostgreSQLException, ) -from podcast_fetch_episode.fetch_url import fetch_big_file -from podcast_fetch_episode.gcs_store import GCSStore -from podcast_fetch_episode.media_file import TranscodeTempDirAndFile, transcode_media_file_if_needed, media_file_info +from .fetch_url import fetch_big_file +from .gcs_store import GCSStore +from .media_file import TranscodeTempDirAndFile, transcode_media_file_if_needed, media_file_info log = create_logger(__name__) diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_url.py 
b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_url.py similarity index 95% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_url.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_url.py index 7d7b6716e6..b605d792e0 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/fetch_url.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_url.py @@ -1,10 +1,11 @@ import os +# noinspection PyPackageRequirements import requests from mediawords.util.log import create_logger -from podcast_fetch_episode.exceptions import McPodcastFileFetchFailureException, McPodcastFileStoreFailureException +from .exceptions import McPodcastFileFetchFailureException, McPodcastFileStoreFailureException log = create_logger(__name__) diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/gcs_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py similarity index 98% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/gcs_store.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py index 579ceb3afb..90fbc88f55 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/gcs_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py @@ -10,8 +10,8 @@ from mediawords.util.log import create_logger -from podcast_fetch_episode.config import PodcastFetchEpisodeConfig -from podcast_fetch_episode.exceptions import ( +from .config import PodcastFetchEpisodeConfig +from .exceptions import ( McPodcastGCSStoreFailureException, McPodcastMisconfiguredGCSException, ) diff --git a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/media_file.py 
b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py similarity index 99% rename from apps/podcast-fetch-episode/src/python/podcast_fetch_episode/media_file.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py index debd695878..998a5a2f54 100644 --- a/apps/podcast-fetch-episode/src/python/podcast_fetch_episode/media_file.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py @@ -6,11 +6,12 @@ import tempfile from typing import Type, Optional, List +# noinspection PyPackageRequirements import ffmpeg from mediawords.util.log import create_logger -from podcast_fetch_episode.audio_codecs import ( +from .audio_codecs import ( AbstractAudioCodec, Linear16AudioCodec, FLACAudioCodec, @@ -18,7 +19,7 @@ OggOpusAudioCodec, MP3AudioCodec, ) -from podcast_fetch_episode.exceptions import ( +from .exceptions import ( McPodcastMisconfiguredTranscoderException, McPodcastFileIsInvalidException, McPodcastFileStoreFailureException, diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/__init__.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/__init__.py similarity index 100% rename from apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/__init__.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/__init__.py diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/exceptions.py similarity index 100% rename from apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/exceptions.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/exceptions.py diff --git 
a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/fetch_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py similarity index 97% rename from apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/fetch_store.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py index 72db28a80b..8aa6f086de 100644 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/fetch_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py @@ -3,11 +3,11 @@ from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger -from podcast_fetch_transcript.exceptions import ( +from .exceptions import ( McDatabaseErrorException, McDatabaseNotFoundException, ) -from podcast_fetch_transcript.handler import AbstractHandler, DefaultHandler +from .handler import AbstractHandler, DefaultHandler log = create_logger(__name__) diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/handler.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py similarity index 97% rename from apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/handler.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py index 8951f7c9fe..38a86a9c13 100644 --- a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/handler.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py @@ -15,14 +15,14 @@ from mediawords.dbi.downloads.store import store_content from mediawords.util.log import create_logger -from podcast_fetch_transcript.config import PodcastFetchTranscriptConfig -from podcast_fetch_transcript.exceptions import ( +from .config import PodcastFetchTranscriptConfig +from .exceptions 
import ( McDatabaseNotFoundException, McMisconfiguredSpeechAPIException, McOperationNotFoundException, McTranscriptionReturnedErrorException, ) -from podcast_fetch_transcript.transcript import UtteranceAlternative, Utterance, Transcript +from .transcript import UtteranceAlternative, Utterance, Transcript log = create_logger(__name__) diff --git a/apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/transcript.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/transcript.py similarity index 100% rename from apps/podcast-fetch-transcript/src/python/podcast_fetch_transcript/transcript.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/transcript.py diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py new file mode 100644 index 0000000000..5c02ea948a --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py @@ -0,0 +1,190 @@ +# FIXME remove unused tables +# FIXME post-init validation of dataclasses (https://docs.python.org/3/library/dataclasses.html#post-init-processing) +# FIXME workflow logger + +import dataclasses +import enum +from datetime import timedelta +from typing import Optional + +# noinspection PyPackageRequirements +from furl import furl +# noinspection PyPackageRequirements +from temporal.activity_method import activity_method, RetryParameters +# noinspection PyPackageRequirements +from temporal.workflow import workflow_method + +from mediawords.util.url import is_http_url + +TASK_QUEUE = "podcast-transcribe-episode" +"""Temporal task queue.""" + +NAMESPACE = "default" +"""Temporal namespace.""" + +# FIXME different retry parameters for various actions +RETRY_PARAMETERS = RetryParameters( + initial_interval=timedelta(seconds=1), + maximum_interval=timedelta(seconds=100), + 
backoff_coefficient=2, + maximum_attempts=500, +) + + +@enum.unique +class AudioCodec(enum.Enum): + """ + Audio file codec that's supported by Google Speech API. + + https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1 + """ + LINEAR16 = 'LINEAR16' + FLAC = 'FLAC' + MULAW = 'MULAW' + OGG_OPUS = 'OGG_OPUS' + MP3 = 'MP3' + + +@dataclasses.dataclass(frozen=True) +class EpisodeMetadata(object): + """Metadata about an episode to be transcribed.""" + + duration: int + """Episode's duration in seconds.""" + + codec: AudioCodec + """Episode's codec.""" + + sample_rate: int + """Episode's sample rate (Hz) as determined by transcoder, e.g. 44100.""" + + +@dataclasses.dataclass +class StoryEnclosure(object): + """Single story enclosure derived from feed's element.""" + + __MP3_MIME_TYPES = {'audio/mpeg', 'audio/mpeg3', 'audio/mp3', 'audio/x-mpeg-3'} + """MIME types which MP3 files might have.""" + + url: str + """Enclosure's URL, e.g. 'https://www.example.com/episode.mp3'.""" + + mime_type: Optional[str] + """Enclosure's reported MIME type, or None if it wasn't reported; e.g. 
'audio/mpeg'.""" + + length: Optional[int] + """Enclosure's reported length in bytes, or None if it wasn't reported.""" + + def mime_type_is_mp3(self) -> bool: + """Return True if declared MIME type is one of the MP3 ones.""" + if self.mime_type: + if self.mime_type.lower() in self.__MP3_MIME_TYPES: + return True + return False + + def mime_type_is_audio(self) -> bool: + """Return True if declared MIME type is an audio type.""" + if self.mime_type: + if self.mime_type.lower().startswith('audio/'): + return True + return False + + def mime_type_is_video(self) -> bool: + """Return True if declared MIME type is a video type.""" + if self.mime_type: + if self.mime_type.lower().startswith('video/'): + return True + return False + + def url_path_has_mp3_extension(self) -> bool: + """Return True if URL's path has .mp3 extension.""" + if is_http_url(self.url): + uri = furl(self.url) + if '.mp3' in str(uri.path).lower(): + return True + return False + + +class AbstractPodcastTranscribeActivities(object): + """Activities interface.""" + + # FIXME timeouts and retries of every action + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=5), + # schedule_to_close_timeout=timedelta(seconds=5), + retry_parameters=RETRY_PARAMETERS, + ) + async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]: + """ + Guess BCP 47 language code of a story, e.g. 'en-US'. 
+ + https://cloud.google.com/speech-to-text/docs/languages + """ + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=5), + # schedule_to_close_timeout=timedelta(seconds=5), + retry_parameters=RETRY_PARAMETERS, + ) + async def determine_best_enclosure(self, stories_id: int) -> Optional[StoryEnclosure]: + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=5), + # schedule_to_close_timeout=timedelta(seconds=5), + retry_parameters=RETRY_PARAMETERS, + ) + async def fetch_store_enclosure(self, stories_id: int, enclosure: StoryEnclosure) -> None: + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=5), + # schedule_to_close_timeout=timedelta(seconds=5), + retry_parameters=RETRY_PARAMETERS, + ) + async def fetch_transcode_store_episode(self, stories_id: int) -> EpisodeMetadata: + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=5), + # schedule_to_close_timeout=timedelta(seconds=5), + retry_parameters=RETRY_PARAMETERS, + ) + async def submit_transcribe_operation(self, + stories_id: int, + episode_metadata: EpisodeMetadata, + bcp47_language_code: str) -> str: + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=5), + # schedule_to_close_timeout=timedelta(seconds=5), + retry_parameters=RETRY_PARAMETERS, + ) + async def fetch_store_raw_transcript_json(self, stories_id: int, speech_operation_id: str) -> None: + raise NotImplementedError + + @activity_method( + task_queue=TASK_QUEUE, + start_to_close_timeout=timedelta(seconds=5), + # schedule_to_close_timeout=timedelta(seconds=5), + retry_parameters=RETRY_PARAMETERS, + ) + async def fetch_store_transcript(self, stories_id: int) -> None: + raise NotImplementedError + + +class 
AbstractPodcastTranscribeWorkflow(object): + """Workflow interface.""" + + @workflow_method(task_queue=TASK_QUEUE) + async def transcribe_episode(self, stories_id: int) -> None: + raise NotImplementedError diff --git a/apps/podcast-fetch-transcript/tests/python/__init__.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/__init__.py similarity index 100% rename from apps/podcast-fetch-transcript/tests/python/__init__.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/__init__.py diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/exceptions.py similarity index 100% rename from apps/podcast-submit-operation/src/python/podcast_submit_operation/exceptions.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/exceptions.py diff --git a/apps/podcast-submit-operation/src/python/podcast_submit_operation/submit_operation.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py similarity index 98% rename from apps/podcast-submit-operation/src/python/podcast_submit_operation/submit_operation.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py index 32e754ad97..4977a11ead 100644 --- a/apps/podcast-submit-operation/src/python/podcast_submit_operation/submit_operation.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py @@ -9,8 +9,8 @@ from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger -from podcast_submit_operation.config import PodcastSubmitOperationConfig -from podcast_submit_operation.exceptions import ( +from .config import PodcastSubmitOperationConfig +from .exceptions import ( 
McPodcastNoEpisodesException, McPodcastDatabaseErrorException, McPodcastInvalidInputException, diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py new file mode 100644 index 0000000000..8162717caa --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py @@ -0,0 +1,56 @@ +import asyncio + +# noinspection PyPackageRequirements +from temporal.workflow import Workflow + +from .shared import AbstractPodcastTranscribeWorkflow, AbstractPodcastTranscribeActivities, RETRY_PARAMETERS + + +# FIXME in the example the activities implementation *was not* inheriting from the interface +class PodcastTranscribeActivities(AbstractPodcastTranscribeActivities): + """Activities implementation.""" + + # noinspection PyMethodMayBeStatic + async def compose_greeting(self, greeting: str, name: str, number: int): + await asyncio.sleep(1) + return f"{greeting} {name} number {number}!" + + +class PodcastTranscribeWorkflow(AbstractPodcastTranscribeWorkflow): + """Workflow implementation.""" + + def __init__(self): + self.activities: AbstractPodcastTranscribeActivities = Workflow.new_activity_stub( + activities_cls=AbstractPodcastTranscribeActivities, + retry_parameters=RETRY_PARAMETERS, + ) + + async def transcribe_episode(self, stories_id: int) -> None: + bcp47_language_code = await self.activities.identify_story_bcp47_language_code(stories_id=stories_id) + + enclosure = await self.activities.determine_best_enclosure(stories_id=stories_id) + if not enclosure: + # FIXME what do we do if there's no viable enclosure? Nothing? + return + + await self.activities.fetch_store_enclosure(stories_id=stories_id, enclosure=enclosure) + + episode_metadata = await self.activities.fetch_transcode_store_episode(stories_id=stories_id) + + # FIXME we probably want to test the metadata here, e.g. 
whether it's set at all or if the duration is right + + speech_operation_id = await self.activities.submit_transcribe_operation( + stories_id=stories_id, + episode_metadata=episode_metadata, + bcp47_language_code=bcp47_language_code, + ) + + await Workflow.sleep(int(episode_metadata.duration * 1.1)) + + # FIXME get the retries right here + await self.activities.fetch_store_raw_transcript_json( + stories_id=stories_id, + speech_operation_id=speech_operation_id, + ) + + await self.activities.fetch_store_transcript(stories_id=stories_id) diff --git a/apps/podcast-transcribe-episode/src/requirements.txt b/apps/podcast-transcribe-episode/src/requirements.txt new file mode 100644 index 0000000000..80fc71c165 --- /dev/null +++ b/apps/podcast-transcribe-episode/src/requirements.txt @@ -0,0 +1,3 @@ +ffmpeg-python==0.2.0 +google-cloud-speech==2.2.1 +google-cloud-storage==1.37.1 diff --git a/apps/podcast-fetch-episode/tests/data/media-samples b/apps/podcast-transcribe-episode/tests/data/media-samples similarity index 100% rename from apps/podcast-fetch-episode/tests/data/media-samples rename to apps/podcast-transcribe-episode/tests/data/media-samples diff --git a/apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/__init__.py b/apps/podcast-transcribe-episode/tests/python/__init__.py similarity index 100% rename from apps/podcast-poll-due-operations/src/python/podcast_poll_due_operations/__init__.py rename to apps/podcast-transcribe-episode/tests/python/__init__.py diff --git a/apps/podcast-fetch-episode/tests/python/config_random_gcs_prefix.py b/apps/podcast-transcribe-episode/tests/python/config_random_gcs_prefix.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/config_random_gcs_prefix.py rename to apps/podcast-transcribe-episode/tests/python/config_random_gcs_prefix.py diff --git a/apps/podcast-fetch-episode/tests/python/test_bcp47_lang.py b/apps/podcast-transcribe-episode/tests/python/test_bcp47_lang.py similarity index 100% 
rename from apps/podcast-fetch-episode/tests/python/test_bcp47_lang.py rename to apps/podcast-transcribe-episode/tests/python/test_bcp47_lang.py diff --git a/apps/podcast-fetch-episode/tests/python/test_enclosure.py b/apps/podcast-transcribe-episode/tests/python/test_enclosure.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/test_enclosure.py rename to apps/podcast-transcribe-episode/tests/python/test_enclosure.py diff --git a/apps/podcast-fetch-episode/tests/python/test_fetch_and_store.py b/apps/podcast-transcribe-episode/tests/python/test_fetch_and_store.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/test_fetch_and_store.py rename to apps/podcast-transcribe-episode/tests/python/test_fetch_and_store.py diff --git a/apps/podcast-fetch-episode/tests/python/test_fetch_url.py b/apps/podcast-transcribe-episode/tests/python/test_fetch_url.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/test_fetch_url.py rename to apps/podcast-transcribe-episode/tests/python/test_fetch_url.py diff --git a/apps/podcast-fetch-episode/tests/python/test_gcs_store.py b/apps/podcast-transcribe-episode/tests/python/test_gcs_store.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/test_gcs_store.py rename to apps/podcast-transcribe-episode/tests/python/test_gcs_store.py diff --git a/apps/podcast-fetch-episode/tests/python/test_media_file.py b/apps/podcast-transcribe-episode/tests/python/test_media_file.py similarity index 100% rename from apps/podcast-fetch-episode/tests/python/test_media_file.py rename to apps/podcast-transcribe-episode/tests/python/test_media_file.py diff --git a/apps/postgresql-server/schema/mediawords.sql b/apps/postgresql-server/schema/mediawords.sql index 81e3df6744..6eda12b0aa 100644 --- a/apps/postgresql-server/schema/mediawords.sql +++ b/apps/postgresql-server/schema/mediawords.sql @@ -3806,17 +3806,6 @@ CREATE UNIQUE INDEX story_enclosures_stories_id_url ON 
story_enclosures (stories_id, url); --- --- Audio file codec; keep in sync with "_SUPPORTED_NATIVE_AUDIO_CODECS" constant --- (https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1) --- -CREATE TYPE podcast_episodes_audio_codec AS ENUM ( - 'LINEAR16', - 'FLAC', - 'MULAW', - 'OGG_OPUS', - 'MP3' -); -- @@ -3844,13 +3833,13 @@ CREATE TABLE podcast_episodes ( -- Audio codec as determined by transcoder codec podcast_episodes_audio_codec NOT NULL, - -- Audio sample rate (Hz) as determined by transcoder + -- sample_rate INT NOT NULL CONSTRAINT sample_rate_looks_reasonable CHECK(sample_rate > 1000), -- BCP 47 language identifier - -- (https://cloud.google.com/speech-to-text/docs/languages) + -- () bcp47_language_code CITEXT NOT NULL CONSTRAINT bcp47_language_code_looks_reasonable CHECK( @@ -3864,83 +3853,6 @@ CREATE TABLE podcast_episodes ( ); --- Only one episode per story -CREATE UNIQUE INDEX podcast_episodes_stories_id - ON podcast_episodes (stories_id); - -CREATE UNIQUE INDEX podcast_episodes_story_enclosures_id - ON podcast_episodes (story_enclosures_id); - -CREATE UNIQUE INDEX podcast_episodes_stories_id_story_enclosures_id - ON podcast_episodes (stories_id, story_enclosures_id); - - --- Result of an attempt to fetch the transcript -CREATE TYPE podcast_episode_transcript_fetch_result AS ENUM ( - - -- Operation was not yet finished yet at the time of fetching - 'in_progress', - - -- Operation was finished and transcription has succeeded - 'success', - - -- Operation was finished but the transcription has failed - 'error' - -); - - --- --- Attempts to fetch podcast episode transcript --- (we might need to try fetching the operation's results multiple times) --- -CREATE TABLE podcast_episode_transcript_fetches ( - podcast_episode_transcript_fetches_id BIGSERIAL PRIMARY KEY, - - -- Podcast that is being transcribed - podcast_episodes_id BIGINT NOT NULL - REFERENCES podcast_episodes (podcast_episodes_id) - ON DELETE CASCADE, - - -- 
Timestamp for when a fetch job should be added to the job broker's queue the soonest - add_to_queue_at TIMESTAMP WITH TIME ZONE NOT NULL, - - -- Timestamp for when a fetch job was added to the job broker's queue; - -- if NULL, a fetch job was never added to the queue - added_to_queue_at TIMESTAMP WITH TIME ZONE NULL, - - -- Timestamp when the operation's results were attempted to be fetched by the worker; - -- if NULL, the results weren't attempted to be fetched yet - fetched_at TIMESTAMP WITH TIME ZONE NULL, - - -- Result of the fetch attempt; - -- if NULL, the operation fetch didn't happen yet - result podcast_episode_transcript_fetch_result NULL, - - -- If result = 'error', error message that happened with the fetch attempt - error_message TEXT NULL - -); - - --- Function that returns true if results were attempted at being fetched -CREATE FUNCTION podcast_episode_transcript_was_added_to_queue(p_added_to_queue_at TIMESTAMP WITH TIME ZONE) -RETURNS BOOL AS $$ - - SELECT CASE WHEN p_added_to_queue_at::timestamp IS NULL THEN false ELSE true END; - -$$ LANGUAGE SQL IMMUTABLE; - - -CREATE INDEX podcast_episode_transcript_fetches_podcast_episodes_id - ON podcast_episode_transcript_fetches (podcast_episodes_id); - -CREATE UNIQUE INDEX podcast_episode_transcript_fetches_due - ON podcast_episode_transcript_fetches ( - add_to_queue_at, - podcast_episode_transcript_was_added_to_queue(added_to_queue_at) - ); - -- -- Celery job results From 6ea1594c0468bf54424167df845d8eb6f602b2be Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 8 Apr 2021 16:16:19 +0300 Subject: [PATCH 052/175] Fix service names [ci skip] --- apps/docker-compose.dist.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/docker-compose.dist.yml b/apps/docker-compose.dist.yml index 8b8ad2b6dc..7f6580cd56 100644 --- a/apps/docker-compose.dist.yml +++ b/apps/docker-compose.dist.yml @@ -1226,7 +1226,7 @@ services: # NYTLabels fetch annotation and tag # ----------------------- # - 
nytlabels-update-story-tags: + nytlabels-fetch-annotation-and-tag: image: gcr.io/mcback/nytlabels-fetch-annotation-and-tag:release init: true networks: From c65f7128bda634526bd75a8d0150eb5e07321ea0 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 12 Apr 2021 23:27:27 +0300 Subject: [PATCH 053/175] More Temporal podcast demo changes --- .../{fetch_episode => }/config.py | 2 +- .../podcast_transcribe_episode/exceptions.py | 146 ++++++++++++++++++ .../fetch_episode/exceptions.py | 79 ---------- .../fetch_episode/fetch_and_store.py | 14 +- .../fetch_episode/fetch_url.py | 2 +- .../fetch_episode/gcs_store.py | 11 +- .../fetch_episode/media_file.py | 10 +- .../fetch_transcript/exceptions.py | 52 ------- .../fetch_transcript/fetch_store.py | 5 +- .../fetch_transcript/handler.py | 6 +- .../submit_operation/exceptions.py | 54 ------- .../submit_operation/submit_operation.py | 6 +- apps/temporal-grafana/Dockerfile | 3 - apps/temporal-server/Dockerfile | 3 - apps/temporal-webapp/Dockerfile | 3 - 15 files changed, 170 insertions(+), 226 deletions(-) rename apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/{fetch_episode => }/config.py (94%) create mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/exceptions.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/exceptions.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/exceptions.py diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/config.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py similarity index 94% rename from apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/config.py rename to 
apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py index 05c7d028af..b34e2ade51 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/config.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py @@ -1,7 +1,7 @@ from mediawords.util.config import env_value, file_with_env_value -class PodcastFetchEpisodeConfig(object): +class PodcastTranscribeEpisodeConfig(object): """ Podcast episode fetcher configuration. """ diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py new file mode 100644 index 0000000000..5fb6fe1245 --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py @@ -0,0 +1,146 @@ +import abc + + +class _AbstractPodcastTranscribeEpisodeException(Exception, metaclass=abc.ABCMeta): + """Abstract exception.""" + pass + + +class PodcastTranscribeEpisodeSoftException(_AbstractPodcastTranscribeEpisodeException): + """Soft errors exception.""" + pass + + +class McStoryNotFoundException(PodcastTranscribeEpisodeSoftException): + """Exception raised when story was not found.""" + pass + + +class McPodcastNoViableStoryEnclosuresException(PodcastTranscribeEpisodeSoftException): + """Exception thrown when story has no viable enclosures to choose from.""" + pass + + +class McPodcastEnclosureTooBigException(PodcastTranscribeEpisodeSoftException): + """Exception thrown when story's best viable enclosure is too big.""" + pass + + +class McPodcastFileFetchFailureException(PodcastTranscribeEpisodeSoftException): + """Exception thrown when we're unable to fetch the downloaded file for whatever reason.""" + pass + + +class McPodcastFileIsInvalidException(PodcastTranscribeEpisodeSoftException): + """Exception thrown when the fetched file is not something that we can process for whatever reason.""" + pass 
+ + +class McOperationNotFoundException(PodcastTranscribeEpisodeSoftException): + """Exception thrown when a transcription operation was not found for a particular operation ID.""" + # Not a "hard" failure as sometimes these operations expire + pass + + +class McPodcastNoEpisodesException(PodcastTranscribeEpisodeSoftException): + """Exception thrown when there are no episodes for a story.""" + pass + + +class McPodcastEpisodeTooLongException(PodcastTranscribeEpisodeSoftException): + """Exception raised when podcast's episode is too long.""" + pass + + +# --- + +class PodcastTranscribeEpisodeHardException(_AbstractPodcastTranscribeEpisodeException): + """Hard errors exception.""" + pass + + +class McPodcastFileStoreFailureException(PodcastTranscribeEpisodeHardException): + """ + Exception thrown when we're unable to store the downloaded file for whatever reason. + + This is a hard exception as not being able to store a file means that we might be out of disk space or something + like that. + """ + pass + + +class McPodcastGCSStoreFailureException(PodcastTranscribeEpisodeHardException): + """ + Exception thrown when we're unable to store an object to Google Cloud Storage. + + GCS problems, if any, are probably temporary, but still, in those cases we should retry a few times and then give up + permanently because not being able to store stuff to GCS might mean that we ran out of some sort of a limit, + credentials are wrong, etc. 
+ """ + pass + + +class McPodcastMisconfiguredTranscoderException(PodcastTranscribeEpisodeHardException): + """Exception thrown when something happens with the transcoder that we didn't anticipate before.""" + pass + + +class McPodcastMisconfiguredGCSException(PodcastTranscribeEpisodeHardException): + """Exception thrown when something happens with Google Cloud Storage that we didn't anticipate before.""" + pass + + +class McPodcastPostgreSQLException(PodcastTranscribeEpisodeHardException): + """Exception thrown on PostgreSQL errors.""" + pass + + +class McDatabaseNotFoundException(PodcastTranscribeEpisodeHardException): + """Exception thrown when we can't find something in the database that we've expected to find.""" + pass + + +class McDatabaseErrorException(PodcastTranscribeEpisodeHardException): + """Exception thrown when a database raises an error.""" + pass + + +class McMisconfiguredSpeechAPIException(PodcastTranscribeEpisodeHardException): + """Exception thrown when we receive something we didn't expect from Speech API.""" + pass + + +class McTranscriptionReturnedErrorException(PodcastTranscribeEpisodeHardException): + """ + Exception thrown when Speech API explicitly returns an error state. + + When Speech API returns with an error, it's unclear whether it was us who have messed up or + something is (temporarily) wrong on their end, so on the safe side we throw a "hard" exception. 
+ """ + pass + + +class McPodcastDatabaseErrorException(PodcastTranscribeEpisodeHardException): + """Exception thrown on database errors.""" + pass + + +class McPodcastInvalidInputException(PodcastTranscribeEpisodeHardException): + """Exception thrown on invalid inputs.""" + pass + + +class McPodcastMisconfiguredSpeechAPIException(PodcastTranscribeEpisodeHardException): + """Exception thrown on misconfigured Google Speech API.""" + pass + + +class McPodcastSpeechAPIRequestFailedException(PodcastTranscribeEpisodeHardException): + """ + Exception that is thrown when we're unable to submit a new job to Speech API. + + This is a hard exception because we should be able to handle "soft" failures (e.g. temporary network errors) of + Speech API in the code, and on any other, previously unseen, problems (service downtime, ran out of money, blocked, + outdated API version, etc.) it's better just to shut down the worker + """ + pass diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/exceptions.py deleted file mode 100644 index 9c95054ffd..0000000000 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/exceptions.py +++ /dev/null @@ -1,79 +0,0 @@ -import abc - - -class _AbstractMcPodcastFetchEpisodeException(Exception, metaclass=abc.ABCMeta): - """Abstract exception.""" - pass - - -class McPodcastFetchEpisodeSoftException(_AbstractMcPodcastFetchEpisodeException): - """Soft errors exception.""" - pass - - -class McStoryNotFoundException(McPodcastFetchEpisodeSoftException): - """Exception raised when story was not found.""" - pass - - -class McPodcastNoViableStoryEnclosuresException(McPodcastFetchEpisodeSoftException): - """Exception thrown when story has no viable enclosures to choose from.""" - pass - - -class McPodcastEnclosureTooBigException(McPodcastFetchEpisodeSoftException): - """Exception 
thrown when story's best viable enclosure is too big.""" - pass - - -class McPodcastFileFetchFailureException(McPodcastFetchEpisodeSoftException): - """Exception thrown when we're unable to fetch the downloaded file for whatever reason.""" - pass - - -class McPodcastFileIsInvalidException(McPodcastFetchEpisodeSoftException): - """Exception thrown when the fetched file is not something that we can process for whatever reason.""" - pass - - -# --- - -class McPodcastFetchEpisodeHardException(_AbstractMcPodcastFetchEpisodeException): - """Hard errors exception.""" - pass - - -class McPodcastFileStoreFailureException(McPodcastFetchEpisodeHardException): - """ - Exception thrown when we're unable to store the downloaded file for whatever reason. - - This is a hard exception as not being able to store a file means that we might be out of disk space or something - like that. - """ - pass - - -class McPodcastGCSStoreFailureException(McPodcastFetchEpisodeHardException): - """ - Exception thrown when we're unable to store an object to Google Cloud Storage. - - GCS problems, if any, are probably temporary, but still, in those cases we should retry a few times and then give up - permanently because not being able to store stuff to GCS might mean that we ran out of some sort of a limit, - credentials are wrong, etc. 
- """ - pass - - -class McPodcastMisconfiguredTranscoderException(McPodcastFetchEpisodeHardException): - """Exception thrown when something happens with the transcoder that we didn't anticipate before.""" - pass - - -class McPodcastMisconfiguredGCSException(McPodcastFetchEpisodeHardException): - """Exception thrown when something happens with Google Cloud Storage that we didn't anticipate before.""" - pass - - -class McPodcastPostgreSQLException(McPodcastFetchEpisodeHardException): - """Exception thrown on PostgreSQL errors.""" - pass diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py index 1719ecfdca..ba4b2f68d0 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py @@ -8,10 +8,8 @@ from mediawords.util.log import create_logger from mediawords.util.parse_html import html_strip -from .bcp47_lang import iso_639_1_code_to_bcp_47_identifier -from .config import PodcastFetchEpisodeConfig -from .enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE -from .exceptions import ( +from ..config import PodcastTranscribeEpisodeConfig +from ..exceptions import ( McStoryNotFoundException, McPodcastNoViableStoryEnclosuresException, McPodcastEnclosureTooBigException, @@ -20,6 +18,8 @@ McPodcastGCSStoreFailureException, McPodcastPostgreSQLException, ) +from .bcp47_lang import iso_639_1_code_to_bcp_47_identifier +from .enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE from .fetch_url import fetch_big_file from .gcs_store import GCSStore from .media_file import TranscodeTempDirAndFile, transcode_media_file_if_needed, media_file_info @@ -38,7 +38,7 @@ def _cleanup_temp_dir(temp: TranscodeTempDirAndFile) -> None: 
def fetch_and_store_episode(db: DatabaseHandler, stories_id: int, - config: Optional[PodcastFetchEpisodeConfig] = None) -> None: + config: Optional[PodcastTranscribeEpisodeConfig] = None) -> None: """ Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB. @@ -54,15 +54,13 @@ def fetch_and_store_episode(db: DatabaseHandler, 4) Uploads the episode audio file to Google Cloud Storage; 5) Adds a row to "podcast_episodes". - Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller. - :param db: Database handler. :param stories_id: Story ID for the story to operate on. :param config: (optional) Podcast fetcher configuration object (useful for testing). """ if not config: - config = PodcastFetchEpisodeConfig() + config = PodcastTranscribeEpisodeConfig() story = db.find_by_id(table='stories', object_id=stories_id) if not story: diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_url.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_url.py index b605d792e0..f3cd90b20c 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_url.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_url.py @@ -5,7 +5,7 @@ from mediawords.util.log import create_logger -from .exceptions import McPodcastFileFetchFailureException, McPodcastFileStoreFailureException +from ..exceptions import McPodcastFileFetchFailureException, McPodcastFileStoreFailureException log = create_logger(__name__) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py index 90fbc88f55..9a9b4c2817 100644 --- 
a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py @@ -10,11 +10,8 @@ from mediawords.util.log import create_logger -from .config import PodcastFetchEpisodeConfig -from .exceptions import ( - McPodcastGCSStoreFailureException, - McPodcastMisconfiguredGCSException, -) +from ..config import PodcastTranscribeEpisodeConfig +from ..exceptions import McPodcastGCSStoreFailureException, McPodcastMisconfiguredGCSException log = create_logger(__name__) @@ -27,9 +24,9 @@ class GCSStore(object): '__config', ] - def __init__(self, config: Optional[PodcastFetchEpisodeConfig] = None): + def __init__(self, config: Optional[PodcastTranscribeEpisodeConfig] = None): if not config: - config = PodcastFetchEpisodeConfig() + config = PodcastTranscribeEpisodeConfig() self.__config = config self.__bucket_internal = None diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py index 998a5a2f54..1702c42f2c 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py @@ -11,6 +11,11 @@ from mediawords.util.log import create_logger +from ..exceptions import ( + McPodcastMisconfiguredTranscoderException, + McPodcastFileIsInvalidException, + McPodcastFileStoreFailureException, +) from .audio_codecs import ( AbstractAudioCodec, Linear16AudioCodec, @@ -19,11 +24,6 @@ OggOpusAudioCodec, MP3AudioCodec, ) -from .exceptions import ( - McPodcastMisconfiguredTranscoderException, - McPodcastFileIsInvalidException, - McPodcastFileStoreFailureException, -) log = create_logger(__name__) diff --git 
a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/exceptions.py deleted file mode 100644 index 0b64b540b6..0000000000 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/exceptions.py +++ /dev/null @@ -1,52 +0,0 @@ -import abc - - -class _AbstractMcPodcastFetchTranscriptException(Exception, metaclass=abc.ABCMeta): - """Abstract exception.""" - pass - - -# --- - - -class McPodcastFetchTranscriptSoftException(_AbstractMcPodcastFetchTranscriptException): - """Soft errors exception.""" - pass - - -class McOperationNotFoundException(McPodcastFetchTranscriptSoftException): - """Exception thrown when a transcription operation was not found for a particular operation ID.""" - # Not a "hard" failure as sometimes these operations expire - pass - - -# --- - -class McPodcastFetchTranscriptHardException(_AbstractMcPodcastFetchTranscriptException): - """Hard errors exception.""" - pass - - -class McDatabaseNotFoundException(McPodcastFetchTranscriptHardException): - """Exception thrown when we can't find something in the database that we've expected to find.""" - pass - - -class McDatabaseErrorException(McPodcastFetchTranscriptHardException): - """Exception thrown when a database raises an error.""" - pass - - -class McMisconfiguredSpeechAPIException(McPodcastFetchTranscriptHardException): - """Exception thrown when we receive something we didn't expect from Speech API.""" - pass - - -class McTranscriptionReturnedErrorException(McPodcastFetchTranscriptHardException): - """ - Exception thrown when Speech API explicitly returns an error state. - - When Speech API returns with an error, it's unclear whether it was us who have messed up or - something is (temporarily) wrong on their end, so on the safe side we throw a "hard" exception. 
- """ - pass diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py index 8aa6f086de..bdde9e0fe9 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py @@ -3,10 +3,7 @@ from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger -from .exceptions import ( - McDatabaseErrorException, - McDatabaseNotFoundException, -) +from ..exceptions import McDatabaseErrorException, McDatabaseNotFoundException from .handler import AbstractHandler, DefaultHandler log = create_logger(__name__) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py index 38a86a9c13..33d4388bf2 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py @@ -15,8 +15,8 @@ from mediawords.dbi.downloads.store import store_content from mediawords.util.log import create_logger -from .config import PodcastFetchTranscriptConfig -from .exceptions import ( +from ..config import PodcastTranscribeEpisodeConfig +from ..exceptions import ( McDatabaseNotFoundException, McMisconfiguredSpeechAPIException, McOperationNotFoundException, @@ -90,7 +90,7 @@ def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetche raise McMisconfiguredSpeechAPIException(f"Speech ID for podcast episode {podcast_episodes_id} is unset.") try: - config = PodcastFetchTranscriptConfig() + config = PodcastTranscribeEpisodeConfig() client = 
SpeechClient.from_service_account_json(config.gc_auth_json_file()) operations_client = OperationsClient(channel=client._transport._grpc_channel) except Exception as ex: diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/exceptions.py deleted file mode 100644 index 2ed79eb39a..0000000000 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/exceptions.py +++ /dev/null @@ -1,54 +0,0 @@ -import abc - - -class _AbstractMcPodcastSubmitOperationException(Exception, metaclass=abc.ABCMeta): - """Abstract exception.""" - pass - - -class McPodcastSubmitOperationSoftException(_AbstractMcPodcastSubmitOperationException): - """Soft errors exception.""" - pass - - -class McPodcastNoEpisodesException(McPodcastSubmitOperationSoftException): - """Exception thrown when there are no episodes for a story.""" - pass - - -class McPodcastEpisodeTooLongException(McPodcastSubmitOperationSoftException): - """Exception raised when podcast's episode is too long.""" - pass - - -# --- - -class McPodcastSubmitOperationHardException(_AbstractMcPodcastSubmitOperationException): - """Hard errors exception.""" - pass - - -class McPodcastDatabaseErrorException(McPodcastSubmitOperationHardException): - """Exception thrown on database errors.""" - pass - - -class McPodcastInvalidInputException(McPodcastSubmitOperationHardException): - """Exception thrown on invalid inputs.""" - pass - - -class McPodcastMisconfiguredSpeechAPIException(McPodcastSubmitOperationHardException): - """Exception thrown on misconfigured Google Speech API.""" - pass - - -class McPodcastSpeechAPIRequestFailedException(McPodcastSubmitOperationHardException): - """ - Exception that is thrown when we're unable to submit a new job to Speech API. - - This is a hard exception because we should be able to handle "soft" failures (e.g. 
temporary network errors) of - Speech API in the code, and on any other, previously unseen, problems (service downtime, ran out of money, blocked, - outdated API version, etc.) it's better just to shut down the worker - """ - pass diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py index 4977a11ead..9724c0417d 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py @@ -9,8 +9,8 @@ from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger -from .config import PodcastSubmitOperationConfig -from .exceptions import ( +from ..config import PodcastTranscribeEpisodeConfig +from ..exceptions import ( McPodcastNoEpisodesException, McPodcastDatabaseErrorException, McPodcastInvalidInputException, @@ -145,7 +145,7 @@ def submit_transcribe_operation(episode: PodcastEpisode) -> int: """ try: - config = PodcastSubmitOperationConfig() + config = PodcastTranscribeEpisodeConfig() client = SpeechClient.from_service_account_json(config.gc_auth_json_file()) except Exception as ex: raise McPodcastMisconfiguredSpeechAPIException(f"Unable to create Speech API client: {ex}") diff --git a/apps/temporal-grafana/Dockerfile b/apps/temporal-grafana/Dockerfile index 3500dac11a..d4b5d2b461 100644 --- a/apps/temporal-grafana/Dockerfile +++ b/apps/temporal-grafana/Dockerfile @@ -4,9 +4,6 @@ FROM gcr.io/mcback/base:latest -# FIXME -RUN apt-get -y update - # Install dependencies RUN \ apt-get -y --no-install-recommends install \ diff --git a/apps/temporal-server/Dockerfile b/apps/temporal-server/Dockerfile index 65b1b580ec..02ca2cedfb 100644 --- a/apps/temporal-server/Dockerfile +++ b/apps/temporal-server/Dockerfile @@ -4,9 
+4,6 @@ FROM gcr.io/mcback/base:latest -# FIXME -RUN apt-get -y update - # Install dependencies RUN \ apt-get -y --no-install-recommends install \ diff --git a/apps/temporal-webapp/Dockerfile b/apps/temporal-webapp/Dockerfile index cd338db5e5..4ec35c2eb4 100644 --- a/apps/temporal-webapp/Dockerfile +++ b/apps/temporal-webapp/Dockerfile @@ -4,9 +4,6 @@ FROM gcr.io/mcback/base:latest -# FIXME -RUN apt-get -y update - RUN \ # # Add NodeSource APT repository From 2a08bce2597bb9a497fbaec12fc6d9aa76016ca0 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 13 Apr 2021 01:05:44 +0300 Subject: [PATCH 054/175] More work on the podcast demo --- .../podcast_transcribe_episode/exceptions.py | 78 ++----- .../fetch_episode/enclosure.py | 64 +++++- .../fetch_episode/fetch_and_store.py | 128 +---------- .../fetch_transcript/fetch_store.py | 4 +- .../fetch_transcript/handler.py | 17 +- .../podcast_transcribe_episode/shared.py | 213 ++++++++++++------ .../submit_operation/submit_operation.py | 9 +- .../podcast_transcribe_episode/workflow.py | 79 ++++++- apps/postgresql-server/schema/mediawords.sql | 48 ---- 9 files changed, 321 insertions(+), 319 deletions(-) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py index 5fb6fe1245..1b2cd3922b 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/exceptions.py @@ -6,60 +6,30 @@ class _AbstractPodcastTranscribeEpisodeException(Exception, metaclass=abc.ABCMet pass -class PodcastTranscribeEpisodeSoftException(_AbstractPodcastTranscribeEpisodeException): +class SoftException(_AbstractPodcastTranscribeEpisodeException): """Soft errors exception.""" pass -class McStoryNotFoundException(PodcastTranscribeEpisodeSoftException): - """Exception raised when story was not found.""" - pass - - -class 
McPodcastNoViableStoryEnclosuresException(PodcastTranscribeEpisodeSoftException): - """Exception thrown when story has no viable enclosures to choose from.""" - pass - - -class McPodcastEnclosureTooBigException(PodcastTranscribeEpisodeSoftException): - """Exception thrown when story's best viable enclosure is too big.""" - pass - -class McPodcastFileFetchFailureException(PodcastTranscribeEpisodeSoftException): +class McPodcastFileFetchFailureException(SoftException): """Exception thrown when we're unable to fetch the downloaded file for whatever reason.""" pass -class McPodcastFileIsInvalidException(PodcastTranscribeEpisodeSoftException): +class McPodcastFileIsInvalidException(SoftException): """Exception thrown when the fetched file is not something that we can process for whatever reason.""" pass -class McOperationNotFoundException(PodcastTranscribeEpisodeSoftException): - """Exception thrown when a transcription operation was not found for a particular operation ID.""" - # Not a "hard" failure as sometimes these operations expire - pass - - -class McPodcastNoEpisodesException(PodcastTranscribeEpisodeSoftException): - """Exception thrown when there are no episodes for a story.""" - pass - - -class McPodcastEpisodeTooLongException(PodcastTranscribeEpisodeSoftException): - """Exception raised when podcast's episode is too long.""" - pass - - # --- -class PodcastTranscribeEpisodeHardException(_AbstractPodcastTranscribeEpisodeException): +class HardException(_AbstractPodcastTranscribeEpisodeException): """Hard errors exception.""" pass -class McPodcastFileStoreFailureException(PodcastTranscribeEpisodeHardException): +class McPodcastFileStoreFailureException(HardException): """ Exception thrown when we're unable to store the downloaded file for whatever reason. 
@@ -69,7 +39,7 @@ class McPodcastFileStoreFailureException(PodcastTranscribeEpisodeHardException): pass -class McPodcastGCSStoreFailureException(PodcastTranscribeEpisodeHardException): +class McPodcastGCSStoreFailureException(HardException): """ Exception thrown when we're unable to store an object to Google Cloud Storage. @@ -80,62 +50,42 @@ class McPodcastGCSStoreFailureException(PodcastTranscribeEpisodeHardException): pass -class McPodcastMisconfiguredTranscoderException(PodcastTranscribeEpisodeHardException): +class McPodcastMisconfiguredTranscoderException(HardException): """Exception thrown when something happens with the transcoder that we didn't anticipate before.""" pass -class McPodcastMisconfiguredGCSException(PodcastTranscribeEpisodeHardException): +class McPodcastMisconfiguredGCSException(HardException): """Exception thrown when something happens with Google Cloud Storage that we didn't anticipate before.""" pass -class McPodcastPostgreSQLException(PodcastTranscribeEpisodeHardException): - """Exception thrown on PostgreSQL errors.""" - pass - - -class McDatabaseNotFoundException(PodcastTranscribeEpisodeHardException): +class McDatabaseNotFoundException(HardException): """Exception thrown when we can't find something in the database that we've expected to find.""" pass -class McDatabaseErrorException(PodcastTranscribeEpisodeHardException): - """Exception thrown when a database raises an error.""" - pass - - -class McMisconfiguredSpeechAPIException(PodcastTranscribeEpisodeHardException): +class McMisconfiguredSpeechAPIException(HardException): """Exception thrown when we receive something we didn't expect from Speech API.""" pass -class McTranscriptionReturnedErrorException(PodcastTranscribeEpisodeHardException): - """ - Exception thrown when Speech API explicitly returns an error state. 
- - When Speech API returns with an error, it's unclear whether it was us who have messed up or - something is (temporarily) wrong on their end, so on the safe side we throw a "hard" exception. - """ - pass - - -class McPodcastDatabaseErrorException(PodcastTranscribeEpisodeHardException): +class McPodcastDatabaseErrorException(HardException): """Exception thrown on database errors.""" pass -class McPodcastInvalidInputException(PodcastTranscribeEpisodeHardException): +class McPodcastInvalidInputException(HardException): """Exception thrown on invalid inputs.""" pass -class McPodcastMisconfiguredSpeechAPIException(PodcastTranscribeEpisodeHardException): +class McPodcastMisconfiguredSpeechAPIException(HardException): """Exception thrown on misconfigured Google Speech API.""" pass -class McPodcastSpeechAPIRequestFailedException(PodcastTranscribeEpisodeHardException): +class McPodcastSpeechAPIRequestFailedException(HardException): """ Exception that is thrown when we're unable to submit a new job to Speech API. diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py index e3193ebe43..b6284798e6 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py @@ -1,8 +1,12 @@ import dataclasses -from typing import Optional, Dict, Any +from typing import Optional + +# noinspection PyPackageRequirements +from furl import furl from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger +from mediawords.util.url import is_http_url log = create_logger(__name__) @@ -10,6 +14,55 @@ """Max. 
enclosure size (in bytes) that we're willing to download.""" +@dataclasses.dataclass +class StoryEnclosure(object): + """Single story enclosure derived from feed's element.""" + + __MP3_MIME_TYPES = {'audio/mpeg', 'audio/mpeg3', 'audio/mp3', 'audio/x-mpeg-3'} + """MIME types which MP3 files might have.""" + + story_enclosures_id: int + """ID from 'story_enclosures' table.""" + + url: str + """Enclosure's URL, e.g. 'https://www.example.com/episode.mp3'.""" + + mime_type: Optional[str] + """Enclosure's reported MIME type, or None if it wasn't reported; e.g. 'audio/mpeg'.""" + + length: Optional[int] + """Enclosure's reported length in bytes, or None if it wasn't reported.""" + + def mime_type_is_mp3(self) -> bool: + """Return True if declared MIME type is one of the MP3 ones.""" + if self.mime_type: + if self.mime_type.lower() in self.__MP3_MIME_TYPES: + return True + return False + + def mime_type_is_audio(self) -> bool: + """Return True if declared MIME type is an audio type.""" + if self.mime_type: + if self.mime_type.lower().startswith('audio/'): + return True + return False + + def mime_type_is_video(self) -> bool: + """Return True if declared MIME type is a video type.""" + if self.mime_type: + if self.mime_type.lower().startswith('video/'): + return True + return False + + def url_path_has_mp3_extension(self) -> bool: + """Return True if URL's path has .mp3 extension.""" + if is_http_url(self.url): + uri = furl(self.url) + if '.mp3' in str(uri.path).lower(): + return True + return False + + def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> Optional[StoryEnclosure]: """Fetch all enclosures, find and return the one that looks like a podcast episode the most (or None).""" story_enclosures_dicts = db.query(""" @@ -32,7 +85,14 @@ def podcast_viable_enclosure_for_story(db: DatabaseHandler, stories_id: int) -> for enclosure_dict in story_enclosures_dicts: if is_http_url(enclosure_dict['url']): - 
story_enclosures.append(StoryEnclosure.from_db_row(db_row=enclosure_dict)) + story_enclosures.append( + StoryEnclosure( + story_enclosures_id=enclosure_dict['story_enclosures_id'], + url=enclosure_dict['url'], + mime_type=enclosure_dict['mime_type'], + length=enclosure_dict['length'], + ) + ) chosen_enclosure = None diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py index ba4b2f68d0..6d9c723535 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py @@ -1,25 +1,11 @@ import os import shutil import tempfile -from typing import Optional -from mediawords.db import DatabaseHandler -from mediawords.util.identify_language import language_code_for_text, identification_would_be_reliable from mediawords.util.log import create_logger -from mediawords.util.parse_html import html_strip - -from ..config import PodcastTranscribeEpisodeConfig -from ..exceptions import ( - McStoryNotFoundException, - McPodcastNoViableStoryEnclosuresException, - McPodcastEnclosureTooBigException, - McPodcastFileStoreFailureException, - McPodcastFileFetchFailureException, - McPodcastGCSStoreFailureException, - McPodcastPostgreSQLException, -) -from .bcp47_lang import iso_639_1_code_to_bcp_47_identifier -from .enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE + +from ..exceptions import McPodcastFileStoreFailureException, McPodcastFileFetchFailureException +from .enclosure import MAX_ENCLOSURE_SIZE, StoryEnclosure from .fetch_url import fetch_big_file from .gcs_store import GCSStore from .media_file import TranscodeTempDirAndFile, transcode_media_file_if_needed, media_file_info @@ -36,63 +22,22 @@ def _cleanup_temp_dir(temp: 
TranscodeTempDirAndFile) -> None: raise McPodcastFileStoreFailureException(f"Unable to remove temporary directory: {ex}") -def fetch_and_store_episode(db: DatabaseHandler, - stories_id: int, - config: Optional[PodcastTranscribeEpisodeConfig] = None) -> None: +def fetch_and_store_episode(stories_id: int, enclosure: StoryEnclosure) -> None: """ - Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB. + Choose a viable story enclosure for podcast, fetch it, transcode if needed, and store to GCS. - 1) Determines the episode's likely language by looking into its title and description, converts the language code to - BCP 47; - 1) Using enclosures from "story_enclosures", chooses the one that looks like a podcast episode the most; 2) Fetches the chosen enclosure; 3) Transcodes the file (if needed) by: a) converting it to an audio format that the Speech API can support, and / or b) discarding video stream from the media file, and / or c) discarding other audio streams from the media file; 5) Reads the various parameters, e.g. sample rate, of the episode audio file; - 4) Uploads the episode audio file to Google Cloud Storage; - 5) Adds a row to "podcast_episodes". + 4) Uploads the episode audio file to Google Cloud Storage. - :param db: Database handler. :param stories_id: Story ID for the story to operate on. - :param config: (optional) Podcast fetcher configuration object (useful for testing). + :param enclosure: Enclosure to fetch. 
""" - if not config: - config = PodcastTranscribeEpisodeConfig() - - story = db.find_by_id(table='stories', object_id=stories_id) - if not story: - raise McStoryNotFoundException(f"Story {stories_id} was not found.") - - # Try to determine language of the story - story_title = story['title'] - story_description = html_strip(story['description']) - sample_text = f"{story_title}\n{story_description}" - - iso_639_1_language_code = None - if identification_would_be_reliable(text=sample_text): - iso_639_1_language_code = language_code_for_text(text=sample_text) - - if not iso_639_1_language_code: - iso_639_1_language_code = 'en' - - # Convert to BCP 47 identifier - bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier( - iso_639_1_code=iso_639_1_language_code, - url_hint=story['url'], - ) - - # Find the enclosure that might work the best - best_enclosure = podcast_viable_enclosure_for_story(db=db, stories_id=stories_id) - if not best_enclosure: - raise McPodcastNoViableStoryEnclosuresException(f"There were no viable enclosures found for story {stories_id}") - - if best_enclosure.length: - if best_enclosure.length > MAX_ENCLOSURE_SIZE: - raise McPodcastEnclosureTooBigException(f"Chosen enclosure {best_enclosure} is too big.") - try: temp_dir = tempfile.mkdtemp('fetch_and_store') except Exception as ex: @@ -101,9 +46,9 @@ def fetch_and_store_episode(db: DatabaseHandler, # Fetch enclosure input_filename = 'input_file' input_file_path = os.path.join(temp_dir, input_filename) - log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...") - fetch_big_file(url=best_enclosure.url, dest_file=input_file_path, max_size=MAX_ENCLOSURE_SIZE) - log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}") + log.info(f"Fetching enclosure {enclosure} to {input_file_path}...") + fetch_big_file(url=enclosure.url, dest_file=input_file_path, max_size=MAX_ENCLOSURE_SIZE) + log.info(f"Done fetching enclosure {enclosure} to {input_file_path}") if 
os.stat(input_file_path).st_size == 0: # Might happen with misconfigured webservers @@ -131,7 +76,7 @@ def fetch_and_store_episode(db: DatabaseHandler, # Store input file to GCS try: - gcs = GCSStore(config=config) + gcs = GCSStore() gcs_uri = gcs.store_object( local_file_path=input_file_obj.temp_full_path, object_id=str(stories_id), @@ -150,53 +95,4 @@ def fetch_and_store_episode(db: DatabaseHandler, # Clean up the locally stored file as we don't need it anymore _cleanup_temp_dir(temp=input_file_obj) - # Insert everything to the database - try: - db.query(""" - INSERT INTO podcast_episodes ( - stories_id, - story_enclosures_id, - gcs_uri, - duration, - codec, - sample_rate, - bcp47_language_code - ) VALUES ( - %(stories_id)s, - %(story_enclosures_id)s, - %(gcs_uri)s, - %(duration)s, - %(codec)s, - %(sample_rate)s, - %(bcp47_language_code)s - ) ON CONFLICT (stories_id) DO UPDATE SET - story_enclosures_id = %(story_enclosures_id)s, - gcs_uri = %(gcs_uri)s, - duration = %(duration)s, - codec = %(codec)s, - sample_rate = %(sample_rate)s, - bcp47_language_code = %(bcp47_language_code)s - """, { - 'stories_id': stories_id, - 'story_enclosures_id': best_enclosure.story_enclosures_id, - 'gcs_uri': gcs_uri, - 'duration': best_audio_stream.duration, - 'codec': best_audio_stream.audio_codec_class.postgresql_enum_value(), - 'sample_rate': best_audio_stream.sample_rate, - 'bcp47_language_code': bcp_47_language_code, - }) - - except Exception as ex_db: - - # Try to delete object on GCS first - try: - gcs.delete_object(object_id=str(stories_id)) - except Exception as ex_gcs: - # We should be able to delete it as we've just uploaded it - raise McPodcastGCSStoreFailureException(( - f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; " - f"database insert exception: {ex_db}; " - f"GCS exception: {ex_gcs}") - ) - - raise McPodcastPostgreSQLException(f"Failed inserting episode for story {stories_id}: {ex_db}") + # FIXME diff --git 
a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py index bdde9e0fe9..da1ed5bfee 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py @@ -3,7 +3,7 @@ from mediawords.db import DatabaseHandler from mediawords.util.log import create_logger -from ..exceptions import McDatabaseErrorException, McDatabaseNotFoundException +from ..exceptions import McDatabaseNotFoundException, HardException from .handler import AbstractHandler, DefaultHandler log = create_logger(__name__) @@ -102,7 +102,7 @@ def fetch_store_transcript( 'error_message': str(ex), }) except Exception as ex2: - raise McDatabaseErrorException(( + raise HardException(( f"Error while executing transcript fetch for ID {podcast_episode_transcript_fetches_id}: {ex}; " f"further, I wasn't able to log it to database because: {ex2}" )) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py index 33d4388bf2..f84c517e89 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py @@ -19,8 +19,7 @@ from ..exceptions import ( McDatabaseNotFoundException, McMisconfiguredSpeechAPIException, - McOperationNotFoundException, - McTranscriptionReturnedErrorException, + HardException, SoftException, ) from .transcript import UtteranceAlternative, Utterance, Transcript @@ -101,7 +100,9 @@ def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetche except InvalidArgument as ex: raise 
McMisconfiguredSpeechAPIException(f"Invalid operation ID '{speech_operation_id}': {ex}") except NotFound as ex: - raise McOperationNotFoundException(f"Operation ID '{speech_operation_id}' was not found: {ex}") + # Not a "hard" failure as sometimes these operations expire + # FIXME although we should be resubmitting the media file for a new transcript when that happens + raise SoftException(f"Operation ID '{speech_operation_id}' was not found: {ex}") except Exception as ex: # On any other errors, raise a hard exception raise McMisconfiguredSpeechAPIException(f"Error while fetching operation ID '{speech_operation_id}': {ex}") @@ -158,13 +159,15 @@ def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetche ) except GoogleAPICallError as ex: - raise McTranscriptionReturnedErrorException( - f"Unable to read transcript for operation '{speech_operation_id}': {ex}" + # When Speech API returns with an error, it's unclear whether it was us who have messed up or something is + # (temporarily) wrong on their end, so on the safe side we throw a "hard" exception. 
+ raise HardException( + f"Unable to read transcript for operation '{speech_operation_id}' due to API error: {ex}" ) except Exception as ex: - raise McMisconfiguredSpeechAPIException( - f"Unable to read transcript for operation '{speech_operation_id}': {ex}" + raise HardException( + f"Unable to read transcript for operation '{speech_operation_id}' due to other error: {ex}" ) return Transcript(stories_id=stories_id, utterances=utterances) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py index 5c02ea948a..bfb1854689 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py @@ -1,20 +1,20 @@ # FIXME remove unused tables # FIXME post-init validation of dataclasses (https://docs.python.org/3/library/dataclasses.html#post-init-processing) # FIXME workflow logger +# FIXME if something's wrong (e.g. the episode doesn't look valid), should the workflow succeed or fail? import dataclasses import enum from datetime import timedelta from typing import Optional -# noinspection PyPackageRequirements -from furl import furl # noinspection PyPackageRequirements from temporal.activity_method import activity_method, RetryParameters # noinspection PyPackageRequirements from temporal.workflow import workflow_method -from mediawords.util.url import is_http_url +from .exceptions import HardException +from .fetch_episode.enclosure import StoryEnclosure TASK_QUEUE = "podcast-transcribe-episode" """Temporal task queue.""" @@ -24,11 +24,32 @@ # FIXME different retry parameters for various actions RETRY_PARAMETERS = RetryParameters( + + # InitialInterval is a delay before the first retry. initial_interval=timedelta(seconds=1), - maximum_interval=timedelta(seconds=100), + + # BackoffCoefficient. Retry policies are exponential. 
The coefficient specifies how fast the retry interval is + # growing. The coefficient of 1 means that the retry interval is always equal to the InitialInterval. backoff_coefficient=2, - maximum_attempts=500, + + # MaximumInterval specifies the maximum interval between retries. Useful for coefficients more than 1. + maximum_interval=timedelta(hours=2), + + # MaximumAttempts specifies how many times to attempt to execute an Activity in the presence of failures. If this + # limit is exceeded, the error is returned back to the Workflow that invoked the Activity. + maximum_attempts=50, + + # NonRetryableErrorReasons allows you to specify errors that shouldn't be retried. For example retrying invalid + # arguments error doesn't make sense in some scenarios. + # FIXME test if it actually works + non_retryable_error_types=[HardException.__name__], + ) +""" +Retry parameters. + +https://docs.temporal.io/docs/concept-activities/ +""" @enum.unique @@ -58,51 +79,16 @@ class EpisodeMetadata(object): sample_rate: int """Episode's sample rate (Hz) as determined by transcoder, e.g. 44100.""" + def __post_init__(self) -> None: + """Validate episode's metadata.""" -@dataclasses.dataclass -class StoryEnclosure(object): - """Single story enclosure derived from feed's element.""" - - __MP3_MIME_TYPES = {'audio/mpeg', 'audio/mpeg3', 'audio/mp3', 'audio/x-mpeg-3'} - """MIME types which MP3 files might have.""" - - url: str - """Enclosure's URL, e.g. 'https://www.example.com/episode.mp3'.""" - - mime_type: Optional[str] - """Enclosure's reported MIME type, or None if it wasn't reported; e.g. 
'audio/mpeg'.""" - - length: Optional[int] - """Enclosure's reported length in bytes, or None if it wasn't reported.""" - - def mime_type_is_mp3(self) -> bool: - """Return True if declared MIME type is one of the MP3 ones.""" - if self.mime_type: - if self.mime_type.lower() in self.__MP3_MIME_TYPES: - return True - return False - - def mime_type_is_audio(self) -> bool: - """Return True if declared MIME type is an audio type.""" - if self.mime_type: - if self.mime_type.lower().startswith('audio/'): - return True - return False - - def mime_type_is_video(self) -> bool: - """Return True if declared MIME type is a video type.""" - if self.mime_type: - if self.mime_type.lower().startswith('video/'): - return True - return False - - def url_path_has_mp3_extension(self) -> bool: - """Return True if URL's path has .mp3 extension.""" - if is_http_url(self.url): - uri = furl(self.url) - if '.mp3' in str(uri.path).lower(): - return True - return False + if self.duration <= 0: + # FIXME could it be zero? + raise ValueError('Episode duration is not positive.') + if not self.codec: + raise ValueError('Episode codec is not set.') + if self.sample_rate <= 1000: + raise ValueError('Episode sample rate is not correct.') class AbstractPodcastTranscribeActivities(object): @@ -112,49 +98,138 @@ class AbstractPodcastTranscribeActivities(object): @activity_method( task_queue=TASK_QUEUE, - start_to_close_timeout=timedelta(seconds=5), - # schedule_to_close_timeout=timedelta(seconds=5), + + # ScheduleToStart is the maximum time from a Workflow requesting Activity execution to a worker starting its + # execution. The usual reason for this timeout to fire is all workers being down or not being able to keep up + # with the request rate. We recommend setting this timeout to the maximum time a Workflow is willing to wait for + # an Activity execution in the presence of all possible worker outages. 
+ # schedule_to_start_timeout=None, + + # StartToClose is the maximum time an Activity can execute after it was picked by a worker. + start_to_close_timeout=timedelta(seconds=60), + + # ScheduleToClose is the maximum time from the Workflow requesting an Activity execution to its completion. + # schedule_to_close_timeout=None, + + # Heartbeat is the maximum time between heartbeat requests. See Long Running Activities. + # (https://docs.temporal.io/docs/concept-activities/#long-running-activities) + # heartbeat_timeout=None, + retry_parameters=RETRY_PARAMETERS, ) async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]: """ - Guess BCP 47 language code of a story, e.g. 'en-US'. + Guess BCP 47 language code of a story. https://cloud.google.com/speech-to-text/docs/languages + + :param stories_id: Story to guess the language code for. + :return: BCP 47 language code (e.g. 'en-US') or None if the language code could not be determined. """ raise NotImplementedError @activity_method( task_queue=TASK_QUEUE, - start_to_close_timeout=timedelta(seconds=5), - # schedule_to_close_timeout=timedelta(seconds=5), + # schedule_to_start_timeout=None, + start_to_close_timeout=timedelta(seconds=60), + # schedule_to_close_timeout=None, + # heartbeat_timeout=None, retry_parameters=RETRY_PARAMETERS, ) async def determine_best_enclosure(self, stories_id: int) -> Optional[StoryEnclosure]: + """ + Fetch a list of story enclosures, determine which one looks like a podcast episode the most. + + Uses or similar tag. + + :param stories_id: Story to fetch the enclosures for. + :return: Best enclosure metadata object, or None if no best enclosure could be determined. 
+ """ raise NotImplementedError @activity_method( task_queue=TASK_QUEUE, - start_to_close_timeout=timedelta(seconds=5), - # schedule_to_close_timeout=timedelta(seconds=5), - retry_parameters=RETRY_PARAMETERS, + # schedule_to_start_timeout=None, + + # With a super-slow server, it's probably reasonable to expect that it might take a few hours to fetch a single + # episode + start_to_close_timeout=timedelta(hours=2), + + # schedule_to_close_timeout=None, + + # FIXME add heartbeats for such a long running process + # heartbeat_timeout=None, + + retry_parameters=dataclasses.replace( + RETRY_PARAMETERS, + + # Wait for a minute before trying again + initial_interval=timedelta(minutes=1), + + # Hope for the server to resurrect in a week + maximum_interval=timedelta(weeks=1), + + # Don't kill ourselves trying to hit a permanently dead server + maximum_attempts=50, + ), ) - async def fetch_store_enclosure(self, stories_id: int, enclosure: StoryEnclosure) -> None: + async def fetch_enclosure_to_gcs(self, stories_id: int, enclosure: StoryEnclosure) -> None: + """ + Fetch enclosure and store it to GCS as an episode. + + Doesn't do transcoding or anything because transcoding or any subsequent steps might fail, and if they do, we + want to have the raw episode fetched and safely stored somewhere. + + :param stories_id: Story to fetch the enclosure for. + :param enclosure: Enclosure to fetch. 
+ """ raise NotImplementedError @activity_method( task_queue=TASK_QUEUE, - start_to_close_timeout=timedelta(seconds=5), - # schedule_to_close_timeout=timedelta(seconds=5), - retry_parameters=RETRY_PARAMETERS, + # schedule_to_start_timeout=None, + + # Let's expect super long episodes or super slow servers + start_to_close_timeout=timedelta(hours=2), + + # schedule_to_close_timeout=None, + + # FIXME transcoding could use a timeout as well + # heartbeat_timeout=None, + + retry_parameters=dataclasses.replace( + RETRY_PARAMETERS, + + # Wait for a minute before trying again (GCS might be down) + initial_interval=timedelta(minutes=1), + + # Hope for GCS to resurrect in a day + maximum_interval=timedelta(days=1), + + # Limit attempts because transcoding itself might be broken, and we don't want to be fetching huge objects + # from GCS periodically + maximum_attempts=20, + ), ) async def fetch_transcode_store_episode(self, stories_id: int) -> EpisodeMetadata: + """ + Fetch episode from GCS, transcode it if needed and store it to GCS again in a separate bucket. + + Now that the raw episode file is safely located in GCS, we can try transcoding it. + + :param stories_id: Story ID the episode of which should be transcoded. + :return: Metadata determined as part of the transcoding. 
+ """ raise NotImplementedError @activity_method( task_queue=TASK_QUEUE, - start_to_close_timeout=timedelta(seconds=5), - # schedule_to_close_timeout=timedelta(seconds=5), + # schedule_to_start_timeout=None, + start_to_close_timeout=timedelta(seconds=60), + # schedule_to_close_timeout=None, + # heartbeat_timeout=None, + + # FIXME don't submit too many operations retry_parameters=RETRY_PARAMETERS, ) async def submit_transcribe_operation(self, @@ -165,8 +240,10 @@ async def submit_transcribe_operation(self, @activity_method( task_queue=TASK_QUEUE, - start_to_close_timeout=timedelta(seconds=5), - # schedule_to_close_timeout=timedelta(seconds=5), + # schedule_to_start_timeout=None, + start_to_close_timeout=timedelta(seconds=60), + # schedule_to_close_timeout=None, + # heartbeat_timeout=None, retry_parameters=RETRY_PARAMETERS, ) async def fetch_store_raw_transcript_json(self, stories_id: int, speech_operation_id: str) -> None: @@ -174,8 +251,10 @@ async def fetch_store_raw_transcript_json(self, stories_id: int, speech_operatio @activity_method( task_queue=TASK_QUEUE, - start_to_close_timeout=timedelta(seconds=5), - # schedule_to_close_timeout=timedelta(seconds=5), + # schedule_to_start_timeout=None, + start_to_close_timeout=timedelta(seconds=60), + # schedule_to_close_timeout=None, + # heartbeat_timeout=None, retry_parameters=RETRY_PARAMETERS, ) async def fetch_store_transcript(self, stories_id: int) -> None: diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py index 9724c0417d..2c50de8010 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py @@ -11,12 +11,11 @@ from ..config import PodcastTranscribeEpisodeConfig from 
..exceptions import ( - McPodcastNoEpisodesException, McPodcastDatabaseErrorException, McPodcastInvalidInputException, McPodcastMisconfiguredSpeechAPIException, - McPodcastEpisodeTooLongException, McPodcastSpeechAPIRequestFailedException, + SoftException, ) log = create_logger(__name__) @@ -117,7 +116,7 @@ def get_podcast_episode(db: DatabaseHandler, stories_id: int) -> PodcastEpisode: raise McPodcastDatabaseErrorException(f"Unable to fetch story's {stories_id} podcast episodes: {ex}") if not podcast_episodes: - raise McPodcastNoEpisodesException(f"There are no podcast episodes for story {stories_id}") + raise SoftException(f"There are no podcast episodes for story {stories_id}") if len(podcast_episodes) > 1: # That's very weird, there should be only one episode per story @@ -129,9 +128,7 @@ def get_podcast_episode(db: DatabaseHandler, stories_id: int) -> PodcastEpisode: raise McPodcastInvalidInputException(f"Invalid episode for story {stories_id}: {ex}") if episode.duration > MAX_DURATION: - raise McPodcastEpisodeTooLongException( - f"Story's {stories_id} podcast episode is too long ({episode.duration} seconds)." 
- ) + raise SoftException(f"Story's {stories_id} podcast episode is too long ({episode.duration} seconds).") return episode diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py index 8162717caa..1659620b27 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py @@ -1,19 +1,79 @@ -import time +# noinspection PyPackageRequirements +from typing import Optional # noinspection PyPackageRequirements +from mediawords.util.identify_language import identification_would_be_reliable, language_code_for_text +from mediawords.util.parse_html import html_strip +from .fetch_episode.enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE, StoryEnclosure + +from .exceptions import SoftException from temporal.workflow import Workflow -from .shared import AbstractPodcastTranscribeWorkflow, AbstractPodcastTranscribeActivities, RETRY_PARAMETERS +from mediawords.db import connect_to_db + +from .fetch_episode.bcp47_lang import iso_639_1_code_to_bcp_47_identifier + +from .shared import ( + AbstractPodcastTranscribeWorkflow, + AbstractPodcastTranscribeActivities, + RETRY_PARAMETERS, +) # FIXME in the example the activities implementation *was not* inheriting from the interface class PodcastTranscribeActivities(AbstractPodcastTranscribeActivities): """Activities implementation.""" - # noinspection PyMethodMayBeStatic - async def compose_greeting(self, greeting: str, name: str, number: int): - time.sleep(1) - return f"{greeting} {name} number {number}!" 
+ async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]: + try: + db = connect_to_db() + except Exception as ex: + raise SoftException(f"Unable to connect to the database: {ex}") + + try: + story = db.find_by_id(table='stories', object_id=stories_id) + except Exception as ex: + raise SoftException(f"Database failed when fetching story {stories_id}: {ex}") + + if not story: + raise SoftException(f"Story {stories_id} was not found.") + + # Podcast episodes typically come with title and description set so try guessing from that + story_title = story['title'] + story_description = html_strip(story['description']) + sample_text = f"{story_title}\n{story_description}" + + bcp_47_language_code = None + if identification_would_be_reliable(text=sample_text): + iso_639_1_language_code = language_code_for_text(text=sample_text) + + # Convert to BCP 47 identifier + bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier( + iso_639_1_code=iso_639_1_language_code, + url_hint=story['url'], + ) + + return bcp_47_language_code + + async def determine_best_enclosure(self, stories_id: int) -> Optional[StoryEnclosure]: + + try: + db = connect_to_db() + except Exception as ex: + raise SoftException(f"Unable to connect to the database: {ex}") + + # Find the enclosure that might work the best + best_enclosure = podcast_viable_enclosure_for_story(db=db, stories_id=stories_id) + if not best_enclosure: + # FIXME possibly return None here? + raise SoftException(f"There were no viable enclosures found for story {stories_id}") + + if best_enclosure.length: + if best_enclosure.length > MAX_ENCLOSURE_SIZE: + # FIXME possibly return None here? 
+ raise SoftException(f"Chosen enclosure {best_enclosure} is too big.") + + return best_enclosure class PodcastTranscribeWorkflow(AbstractPodcastTranscribeWorkflow): @@ -26,14 +86,18 @@ def __init__(self): ) async def transcribe_episode(self, stories_id: int) -> None: + bcp47_language_code = await self.activities.identify_story_bcp47_language_code(stories_id=stories_id) + if bcp47_language_code is None: + # Default to English in case there wasn't enough sizable text in title / description to make a good guess + bcp47_language_code = 'en' enclosure = await self.activities.determine_best_enclosure(stories_id=stories_id) if not enclosure: # FIXME what do we do if there's no viable enclosure? Nothing? return - await self.activities.fetch_store_enclosure(stories_id=stories_id, enclosure=enclosure) + await self.activities.fetch_enclosure_to_gcs(stories_id=stories_id, enclosure=enclosure) episode_metadata = await self.activities.fetch_transcode_store_episode(stories_id=stories_id) @@ -48,6 +112,7 @@ async def transcribe_episode(self, stories_id: int) -> None: await Workflow.sleep(int(episode_metadata.duration * 1.1)) # FIXME get the retries right here + # FIXME if the operation with a given ID is not found, re-submit the transcription operation await self.activities.fetch_store_raw_transcript_json( stories_id=stories_id, speech_operation_id=speech_operation_id, diff --git a/apps/postgresql-server/schema/mediawords.sql b/apps/postgresql-server/schema/mediawords.sql index 6eda12b0aa..0b2d0c163b 100644 --- a/apps/postgresql-server/schema/mediawords.sql +++ b/apps/postgresql-server/schema/mediawords.sql @@ -3806,54 +3806,6 @@ CREATE UNIQUE INDEX story_enclosures_stories_id_url ON story_enclosures (stories_id, url); - - --- --- Podcast story episodes (derived from enclosures) --- -CREATE TABLE podcast_episodes ( - podcast_episodes_id BIGSERIAL PRIMARY KEY, - stories_id INT NOT NULL REFERENCES stories (stories_id) ON DELETE CASCADE, - - -- Enclosure that's considered to point 
to a podcast episode - story_enclosures_id BIGINT NOT NULL - REFERENCES story_enclosures (story_enclosures_id) - ON DELETE CASCADE, - - -- Google Cloud Storage URI where the audio file is located at - gcs_uri TEXT NOT NULL - CONSTRAINT gcs_uri_has_gs_prefix - CHECK(gcs_uri LIKE 'gs://%'), - - -- Duration (in seconds) - duration INT NOT NULL - CONSTRAINT duration_is_positive - CHECK(duration > 0), - - -- Audio codec as determined by transcoder - codec podcast_episodes_audio_codec NOT NULL, - - -- - sample_rate INT NOT NULL - CONSTRAINT sample_rate_looks_reasonable - CHECK(sample_rate > 1000), - - -- BCP 47 language identifier - -- () - bcp47_language_code CITEXT NOT NULL - CONSTRAINT bcp47_language_code_looks_reasonable - CHECK( - bcp47_language_code LIKE '%-%' - OR bcp47_language_code = 'zh' - ), - - -- Speech API operation ID to be used for retrieving transcription; if NULL, - -- transcription job hasn't been submitted yet - speech_operation_id TEXT NULL - -); - - -- -- Celery job results -- (configured as self.__app.conf.database_table_names; schema is dictated by Celery + SQLAlchemy) From 99ceb8bac3986ca9bf99ef1dd8bd4482ec1bd76a Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Wed, 14 Apr 2021 01:34:45 +0300 Subject: [PATCH 055/175] Resurrect `mail` utility for munin-cron, generate `alerts.conf` dynamically Munin is our monitoring tool of choice for all app-related matters. It doesn't (and is not supposed to) do any kind of systems monitoring as Munin gets run as a Docker service, so if the `mediacloud` Docker stack is down, that makes Munin down as well. 
Instead, it tracks various app-related metrics such as: * How many stories got imported into Solr during last hour: https://github.com/mediacloud/backend/blob/63545b1f3cd02f2e170801cd90842e5a8f7751a0/apps/munin-node/plugins/mc_solr_stories_last_hour * How many downloads didn't yet get extracted: https://github.com/mediacloud/backend/blob/63545b1f3cd02f2e170801cd90842e5a8f7751a0/apps/munin-node/plugins/mc_downloads_unextracted * How many media sources do we have: https://github.com/mediacloud/backend/blob/63545b1f3cd02f2e170801cd90842e5a8f7751a0/apps/munin-node/plugins/mc_media_count * ...and others: https://github.com/mediacloud/backend/tree/63545b1f3cd02f2e170801cd90842e5a8f7751a0/apps/munin-node/plugins We chose Munin because: * It's rather simple ("plugins" are simple scripts that output a bunch of numbers in a specific format so you can write them in any language); * It's able to send alert emails when things look off (unlike some other tools); * It's even able to send alert emails when it's unable to figure out if things are off (unlike most other tools). `munin-cron` service polls `munin-node` service periodically (every 5 minutes) for stats, writes them down, tests whether those stats are out of bounds (as defined in the plugin's source code) and if they look off, sends us an email. Munin is super old and so it doesn't know how to connect to the SMTP server (`mail-postfix-server`) directly. Instead, it calls Linux's `mail` utility (https://linux.die.net/man/1/mail; somewhat similar to `sendmail`) to submit the alert email to be sent. Weird Linux (or UNIX?) history: just like `sendmail`, `mail` is not so much of a single utility that you can install but rather a "protocol" that can be implemented by various vendors which provide utilities compatible with `mail` and `sendmail`. 
The idea is that one can choose a provider (a specific implementation of `mail` or `sendmail`) to do their `mail`ing or `sendmail`ing, and then the tools that call `mail` or `sendmail` will be able to send the email while not caring how exactly it gets sent. We were using S-nail (https://wiki.archlinux.org/index.php/S-nail) for our `mail` needs, and msmtp (https://wiki.archlinux.org/index.php/msmtp) to do the `sendmail`ing. `mail` (implemented via S-nail) was pre-configured to call the `sendmail` utility to submit outgoing email to it, and `sendmail` (implemented via msmtp) was then to connect to `mail-postfix-server` and post the outgoing message. The end result in all cases was the same - no matter how you chose to send email from any container (`mail`, `sendmail`, or direct connection to the SMTP server), the message would still go out. However, it turns out that (probably) the Debian maintainers changed how S-nail (`s-nail` APT package) gets installed, and it stopped providing the `mail` utility, the end result being that the `munin-cron` service was silently failing at sending us alert emails about problems. Not good! So, I changed it up as follows: 1) Instead of S-nail, we're now installing the `bsd-mailx` package, which also provides the `mail` utility and is likewise pre-configured to submit emails to `sendmail`; 2) For extra paranoia, we're now verifying whether `sendmail` is symlinked to `msmtp` and `mail` is symlinked to `/usr/bin/bsd-mailx`. Plus, the email addresses to send alerts to were hardcoded so I've made them configurable by auto-generating the `alerts.conf` file on container startup. We'll point it to Google Groups after deployment so that all of us can get alerts about things going south. James, could you have a look?
--- apps/base/Dockerfile | 27 ++++++++++++++++--- apps/docker-compose.dist.yml | 2 ++ apps/munin-cron/Dockerfile | 5 ++++ apps/munin-cron/bin/munin-cron.sh | 34 ++++++++++++++++++++++++ apps/munin-cron/docker-compose.tests.yml | 9 +++++-- apps/munin-cron/munin-conf.d/alerts.conf | 2 -- 6 files changed, 72 insertions(+), 7 deletions(-) create mode 100755 apps/munin-cron/bin/munin-cron.sh delete mode 100644 apps/munin-cron/munin-conf.d/alerts.conf diff --git a/apps/base/Dockerfile b/apps/base/Dockerfile index 6f12c7fdde..6d6bab2a90 100644 --- a/apps/base/Dockerfile +++ b/apps/base/Dockerfile @@ -48,6 +48,9 @@ RUN \ apt-get -y --no-install-recommends install \ # Quicker container debugging bash-completion \ + # "mail" utility which uses sendmail (provided by msmtp-mta) internally; + # some tools like munin-cron use "mail" to send emails + bsd-mailx \ curl \ htop \ # apt-key @@ -56,7 +59,8 @@ RUN \ iproute2 \ # Pinging other containers from within Compose environment iputils-ping \ - # Sending mail via sendmail utility through mail-postfix-server + # Provides "sendmail" utility which relays email through + # "mail-postfix-server" app msmtp \ msmtp-mta \ # Provides killall among other utilities @@ -67,8 +71,6 @@ RUN \ netcat \ # Some packages insist on logging to syslog rsyslog \ - # "mail" utility (which uses msmtp internally) - s-nail \ # Timezone data, used by many packages tzdata \ # Basic editor for files in container while debugging @@ -90,6 +92,25 @@ COPY bin/container_memory_limit.sh bin/container_cpu_limit.sh bin/dl_to_stdout.s # Copy MSMTP configuration COPY conf/msmtprc conf/msmtp-aliases /etc/ +# Both "sendmail" and "mail" utilities are important as they're used by various +# apps (e.g. munin-cron) to send us important email, and those apps aren't +# particularly vocal when they're unable to send email. So, for extra paranoia, +# verify that both utilities point to correct symlinks here. 
+RUN \ + if [ "$(readlink -- "/usr/sbin/sendmail")" != "../bin/msmtp" ]; then \ + echo "sendmail is not symlinked to msmtp, sending email won't work." && \ + exit 1; \ + fi; \ + if [ "$(readlink -- "/usr/bin/mail")" != "/etc/alternatives/mail" ]; then \ + echo "mail is not symlinked to /etc/alternatives/mail, sending email won't work." && \ + exit 1; \ + fi; \ + if [ "$(readlink -- "/etc/alternatives/mail")" != "/usr/bin/bsd-mailx" ]; then \ + echo "mail is not symlinked to /etc/alternatives/mail, sending email won't work." && \ + exit 1; \ + fi; \ + true + # Generate and set locale RUN \ locale-gen en_US en_US.UTF-8 && \ diff --git a/apps/docker-compose.dist.yml b/apps/docker-compose.dist.yml index 7f6580cd56..0643da297c 100644 --- a/apps/docker-compose.dist.yml +++ b/apps/docker-compose.dist.yml @@ -1060,6 +1060,8 @@ services: depends_on: # Reads data from Munin node - munin-node + environment: + MC_MUNIN_CRON_ALERT_EMAIL: "FIXME@mediacloud.org" volumes: # Shared with "munin-fastcgi-graph" container: - vol_munin_data:/var/lib/munin/ diff --git a/apps/munin-cron/Dockerfile b/apps/munin-cron/Dockerfile index 27e617e49f..6047ed821a 100644 --- a/apps/munin-cron/Dockerfile +++ b/apps/munin-cron/Dockerfile @@ -43,6 +43,8 @@ COPY munin-conf.d/ /etc/munin/munin-conf.d/ # Overwrite crontab with our own COPY crontab /etc/cron.d/munin +COPY bin/munin-cron.sh / + # Volume for RRD data (shared with munin-fastcgi-graph) VOLUME /var/lib/munin/ @@ -50,3 +52,6 @@ VOLUME /var/lib/munin/ VOLUME /var/cache/munin/www/ # No USER because Cron will run the script as unprivileged user itself + +# Use our own wrapper for +CMD ["/munin-cron.sh"] diff --git a/apps/munin-cron/bin/munin-cron.sh b/apps/munin-cron/bin/munin-cron.sh new file mode 100755 index 0000000000..a44d39d8a4 --- /dev/null +++ b/apps/munin-cron/bin/munin-cron.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e + +if [ -z "$MC_MUNIN_CRON_ALERT_EMAIL" ]; then + echo "MC_MUNIN_CRON_ALERT_EMAIL (email address to send email alerts to) 
is not set." + exit 1 +fi + +set -u + +# Set up alerting +ALERTS_CONF_FILE="/etc/munin/munin-conf.d/alerts.conf" +echo -n > "${ALERTS_CONF_FILE}" +chmod 644 "${ALERTS_CONF_FILE}" + +# Pretty weird way to print a bunch of dollar signs to a file but Munin doesn't make it easy +echo -n 'contact.mediacloud.command ' >> "${ALERTS_CONF_FILE}" +echo -n 'mail -s "[Munin] ' >> "${ALERTS_CONF_FILE}" +echo -n '${if:cfields CRITICAL}${if:wfields WARNING}' >> "${ALERTS_CONF_FILE}" +echo -n '${if:fofields OK}${if:ufields UNKNOWN}' >> "${ALERTS_CONF_FILE}" +echo -n ' -> ${var:graph_title} ' >> "${ALERTS_CONF_FILE}" +echo -n '${if:wfields -> ${loop<,>:wfields ${var:label}=${var:value}}}' >> "${ALERTS_CONF_FILE}" +echo -n '${if:cfields -> ${loop<,>:cfields ${var:label}=${var:value}}}' >> "${ALERTS_CONF_FILE}" +echo -n '${if:fofields -> ${loop<,>:fofields ${var:label}=${var:value}}}' >> "${ALERTS_CONF_FILE}" +echo -n '" ' >> "${ALERTS_CONF_FILE}" + +# Escape "@" +echo -n "${MC_MUNIN_CRON_ALERT_EMAIL}" | sed 's/@/\\@/g' >> "${ALERTS_CONF_FILE}" + +echo >> "${ALERTS_CONF_FILE}" + +# Start Cron daemon wrapper from cron-base +exec /cron.sh diff --git a/apps/munin-cron/docker-compose.tests.yml b/apps/munin-cron/docker-compose.tests.yml index e24e465693..17bf4fb6b5 100644 --- a/apps/munin-cron/docker-compose.tests.yml +++ b/apps/munin-cron/docker-compose.tests.yml @@ -6,10 +6,15 @@ services: image: gcr.io/mcback/munin-cron:latest init: true stop_signal: SIGKILL + environment: + MC_MUNIN_CRON_ALERT_EMAIL: "alerts@testmediacloud.ml" volumes: - type: bind - source: ./munin-conf.d/ - target: /etc/munin/munin-conf.d/ + source: ./munin-conf.d/host.conf + target: /etc/munin/munin-conf.d/host.conf + - type: bind + source: ./bin/munin-cron.sh + target: /munin-cron.sh - type: bind source: ./../cron-base/bin/cron.sh target: /cron.sh diff --git a/apps/munin-cron/munin-conf.d/alerts.conf b/apps/munin-cron/munin-conf.d/alerts.conf deleted file mode 100644 index 583489fc94..0000000000 --- 
a/apps/munin-cron/munin-conf.d/alerts.conf +++ /dev/null @@ -1,2 +0,0 @@ -contact.hroberts.command mail -s "[Munin] ${if:cfields CRITICAL}${if:wfields WARNING}${if:fofields OK}${if:ufields UNKNOWN} -> ${var:graph_title} ${if:wfields -> ${loop<,>:wfields ${var:label}=${var:value}}}${if:cfields -> ${loop<,>:cfields ${var:label}=${var:value}}}${if:fofields -> ${loop<,>:fofields ${var:label}=${var:value}}}" hroberts\@mediacloud.org -contact.lvaliukas.command mail -s "[Munin] ${if:cfields CRITICAL}${if:wfields WARNING}${if:fofields OK}${if:ufields UNKNOWN} -> ${var:graph_title} ${if:wfields -> ${loop<,>:wfields ${var:label}=${var:value}}}${if:cfields -> ${loop<,>:cfields ${var:label}=${var:value}}}${if:fofields -> ${loop<,>:fofields ${var:label}=${var:value}}}" linas\@mediacloud.org From 2e9fb43b04570473f78794cc4c684bb204899d00 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Mon, 19 Apr 2021 20:39:28 +0300 Subject: [PATCH 056/175] Increase max. PgBouncer client connection count to 4096, limit pool size to 512 We run PgBouncer in front of PostgreSQL because PostgreSQL itself is not that good at managing a lot of connections at once: https://www.percona.com/blog/2018/06/27/scaling-postgresql-with-pgbouncer-you-may-need-a-connection-pooler-sooner-than-you-expect/ Simply put, PgBouncer opens a bunch of connections to PostgreSQL and shares those connections between a large number of clients (containers that we run) more efficiently than PostgreSQL itself would. How many connections will PgBouncer make to the PostgreSQL and how many clients will it let connect to itself is all defined in pgbouncer.ini. 
Also, PgBouncer's configuration can't go over PostgreSQL's own connection limit which is defined with a `max_connections` property and is currently set at 610 concurrent connections: https://github.com/mediacloud/backend/blob/59d5f03ca060e78e79a6bd52a0d598626ccaff9a/apps/postgresql-server/conf/postgresql.conf#L11 Some time Friday afternoon (in my timezone), PostgreSQL started complaining about *too many connections*, in which case PostgreSQL stops accepting new attempts to connect (which is bad). Given that most (but not all) of the connecting that we do to PostgreSQL is through PgBouncer, I've looked into its configuration and found out that it was not configured properly. If I read the configuration reference correctly: https://www.pgbouncer.org/config.html ...PgBouncer was configured both to let only up to 600 clients connect and to make up to 600 connections to PostgreSQL itself, pretty much making PgBouncer itself rather useless. So, I've configured it as follows: * Now up to 5000 clients will be allowed to connect to PgBouncer (`max_client_conn`) * Those (up to) 5000 clients will share a pool of up to 450 connections to PostgreSQL per user/database pair (`default_pool_size`) * That's probably a no-op in our case, but PgBouncer won't create more than 500 connections to PostgreSQL per database (`max_db_connections`) That way we let more clients connect at once, and reduce the risk of going over the 610 connection limit in PostgreSQL itself. This is most likely still not optimal; the proper way to set those parameters would be to do some benchmarking, but that will have to wait for the day when we'll have no other things to do :) @jtotoole, could you read through that PgBouncer blog post? Any questions are more than welcome too.
--- apps/postgresql-pgbouncer/conf/pgbouncer.ini | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/apps/postgresql-pgbouncer/conf/pgbouncer.ini b/apps/postgresql-pgbouncer/conf/pgbouncer.ini index cd6760882b..1efee57e54 100644 --- a/apps/postgresql-pgbouncer/conf/pgbouncer.ini +++ b/apps/postgresql-pgbouncer/conf/pgbouncer.ini @@ -16,8 +16,17 @@ auth_file = /etc/pgbouncer/userlist.txt pool_mode = session server_reset_query = DISCARD ALL -max_client_conn = 600 -default_pool_size = 600 + +# Maximum number of client connections allowed +max_client_conn = 5000 + +# How many server connections to allow per user/database pair +default_pool_size = 450 + +# Do not allow more than this many server connections per database (regardless +# of user) +max_db_connections = 500 + log_connections = 0 log_disconnections = 0 stats_period = 600 From 0e0ac21cf2a1d4616592a125c9bdfb55ffa02364 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Tue, 20 Apr 2021 14:47:52 +0300 Subject: [PATCH 057/175] Even more work on the podcast demo --- .../bin/podcast_transcribe_episode_worker.py | 4 +- .../podcast_transcribe_episode/config.py | 58 ++++- .../fetch_episode/audio_codecs.py | 58 ++--- .../fetch_episode/enclosure.py | 3 - .../fetch_episode/fetch_and_store.py | 98 -------- .../fetch_episode/gcs_store.py | 68 ++++-- .../{media_file.py => media_info.py} | 133 +---------- .../fetch_episode/speech_api.py | 202 +++++++++++++++++ .../fetch_episode/transcode.py | 92 ++++++++ .../fetch_episode/transcript.py | 68 ++++++ .../fetch_transcript/handler.py | 164 +------------- .../fetch_transcript/transcript.py | 40 ---- .../podcast_transcribe_episode/shared.py | 70 ++---- .../submit_operation/__init__.py | 0 .../submit_operation/submit_operation.py | 213 ------------------ .../podcast_transcribe_episode/workflow.py | 100 +++++++- .../src/requirements.txt | 2 +- .../tests/python/test_gcs_store.py | 32 +-- 18 files changed, 607 insertions(+), 798 deletions(-) delete mode 
100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py rename apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/{media_file.py => media_info.py} (51%) create mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/speech_api.py create mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcode.py create mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcript.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/transcript.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/__init__.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py diff --git a/apps/podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py b/apps/podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py index c2c0e80a9e..bd8f159142 100755 --- a/apps/podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py +++ b/apps/podcast-transcribe-episode/bin/podcast_transcribe_episode_worker.py @@ -26,8 +26,6 @@ def run_podcast_fetch_episode(stories_id: int) -> None: try: fetch_and_store_episode(db=db, stories_id=stories_id) - JobBroker(queue_name='MediaWords::Job::Podcast::SubmitOperation').add_to_queue(stories_id=stories_id) - except McPodcastFetchEpisodeSoftException as ex: # Soft exceptions log.error(f"Unable to fetch podcast episode for story {stories_id}: {ex}") @@ -40,5 +38,5 @@ def run_podcast_fetch_episode(stories_id: int) -> None: if __name__ == '__main__': - app = JobBroker(queue_name='MediaWords::Job::Podcast::FetchEpisode') + app = JobBroker(queue_name='MediaWords::Job::Podcast::TranscribeEpisode') app.start_worker(handler=run_podcast_fetch_episode) diff 
--git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py index b34e2ade51..02c1b940ce 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/config.py @@ -1,22 +1,56 @@ +import abc + from mediawords.util.config import env_value, file_with_env_value +# FIXME move constants into proper configuration + +MAX_ENCLOSURE_SIZE = 1024 * 1024 * 500 +"""Max. enclosure size (in bytes) that we're willing to download.""" + +MAX_DURATION = 60 * 60 * 2 +"""Max. podcast episode duration (in seconds) to submit for transcription.""" + -class PodcastTranscribeEpisodeConfig(object): +class AbstractPodcastGCBucketConfig(object, metaclass=abc.ABCMeta): """ - Podcast episode fetcher configuration. + Configuration of a single GCS bucket. """ - @staticmethod - def gc_auth_json_file() -> str: + @abc.abstractmethod + def bucket_name(self) -> str: + """Bucket name to upload objects to / download from.""" + raise NotImplementedError + + @abc.abstractmethod + def path_prefix(self) -> str: + """Path prefix under which the objects are to be found.""" + raise NotImplementedError + + +class PodcastGCAuthConfig(object): + """Google Cloud (both Storage and Speech API) authentication configuration.""" + + @classmethod + def gc_auth_json_file(cls) -> str: """Return path to Google Cloud authentication JSON file.""" return file_with_env_value(name='MC_PODCAST_GC_AUTH_JSON_BASE64', encoded_with_base64=True) - @staticmethod - def gc_storage_bucket_name() -> str: - """Return Google Cloud Storage bucket name.""" - return env_value(name='MC_PODCAST_FETCH_EPISODE_BUCKET_NAME') - @staticmethod - def gc_storage_path_prefix() -> str: - """Return Google Cloud Storage path prefix under which objects will be stored.""" - return env_value(name='MC_PODCAST_FETCH_EPISODE_PATH_PREFIX') +class 
PodcastGCRawEnclosuresBucketConfig(AbstractPodcastGCBucketConfig): + """Configuration for GCS bucket where raw enclosures will be stored.""" + + def bucket_name(self) -> str: + return env_value(name='MC_PODCAST_RAW_ENCLOSURES_BUCKET_NAME') + + def path_prefix(self) -> str: + return env_value(name='MC_PODCAST_RAW_ENCLOSURES_PATH_PREFIX') + + +class PodcastGCTranscodedEpisodesBucketConfig(AbstractPodcastGCBucketConfig): + """Configuration for GCS bucket where transcoded, Speech API-ready episodes will be stored.""" + + def bucket_name(self) -> str: + return env_value(name='MC_PODCAST_TRANSCODED_EPISODES_BUCKET_NAME') + + def path_prefix(self) -> str: + return env_value(name='MC_PODCAST_TRANSCODED_EPISODES_PATH_PREFIX') diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/audio_codecs.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/audio_codecs.py index 4dafee4146..1529b3b34c 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/audio_codecs.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/audio_codecs.py @@ -10,36 +10,32 @@ class AbstractAudioCodec(object, metaclass=abc.ABCMeta): - @classmethod - @abc.abstractmethod - def postgresql_enum_value(cls) -> str: - """Return value from 'podcast_episodes_audio_codec' PostgreSQL enum.""" - raise NotImplemented("Abstract method") - @classmethod @abc.abstractmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: """Return True if ffmpeg.probe()'s one of the streams ('streams' key) is of this codec.""" - raise NotImplemented("Abstract method") + raise NotImplementedError @classmethod @abc.abstractmethod def ffmpeg_container_format(cls) -> str: """Return FFmpeg container format (-f argument).""" - raise NotImplemented("Abstract method") + raise NotImplementedError @classmethod @abc.abstractmethod def mime_type(cls) -> str: """Return 
MIME type to store as GCS object metadata.""" - raise NotImplemented("Abstract method") + raise NotImplementedError + @classmethod + @abc.abstractmethod + def speech_api_codec(cls) -> str: + """Return codec enum value to pass to Speech API when submitting the transcription operation.""" + raise NotImplementedError -class Linear16AudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'LINEAR16' +class Linear16AudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -53,12 +49,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/wav' + @classmethod + def speech_api_codec(cls) -> str: + return 'LINEAR16' -class FLACAudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'FLAC' +class FLACAudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -73,12 +69,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/flac' + @classmethod + def speech_api_codec(cls) -> str: + return 'FLAC' -class MULAWAudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'MULAW' +class MULAWAudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -92,12 +88,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/basic' + @classmethod + def speech_api_codec(cls) -> str: + return 'MULAW' -class OggOpusAudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'OGG_OPUS' +class OggOpusAudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -111,12 +107,12 @@ def ffmpeg_container_format(cls) -> str: def mime_type(cls) -> str: return 'audio/ogg' + @classmethod + def 
speech_api_codec(cls) -> str: + return 'OGG_OPUS' -class MP3AudioCodec(AbstractAudioCodec): - @classmethod - def postgresql_enum_value(cls) -> str: - return 'MP3' +class MP3AudioCodec(AbstractAudioCodec): @classmethod def ffmpeg_stream_is_this_codec(cls, ffmpeg_stream: Dict[str, Any]) -> bool: @@ -129,3 +125,7 @@ def ffmpeg_container_format(cls) -> str: @classmethod def mime_type(cls) -> str: return 'audio/mpeg' + + @classmethod + def speech_api_codec(cls) -> str: + return 'MP3' diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py index b6284798e6..0728fd21e2 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/enclosure.py @@ -10,9 +10,6 @@ log = create_logger(__name__) -MAX_ENCLOSURE_SIZE = 1024 * 1024 * 500 -"""Max. 
enclosure size (in bytes) that we're willing to download.""" - @dataclasses.dataclass class StoryEnclosure(object): diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py deleted file mode 100644 index 6d9c723535..0000000000 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/fetch_and_store.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import shutil -import tempfile - -from mediawords.util.log import create_logger - -from ..exceptions import McPodcastFileStoreFailureException, McPodcastFileFetchFailureException -from .enclosure import MAX_ENCLOSURE_SIZE, StoryEnclosure -from .fetch_url import fetch_big_file -from .gcs_store import GCSStore -from .media_file import TranscodeTempDirAndFile, transcode_media_file_if_needed, media_file_info - -log = create_logger(__name__) - - -def _cleanup_temp_dir(temp: TranscodeTempDirAndFile) -> None: - """Clean up temporary directory or raise a hard exception.""" - try: - shutil.rmtree(temp.temp_dir) - except Exception as ex: - # Not being able to clean up after ourselves is a "hard" error as we might run out of disk space that way - raise McPodcastFileStoreFailureException(f"Unable to remove temporary directory: {ex}") - - -def fetch_and_store_episode(stories_id: int, enclosure: StoryEnclosure) -> None: - """ - Choose a viable story enclosure for podcast, fetch it, transcode if needed, and store to GCS. - - 2) Fetches the chosen enclosure; - 3) Transcodes the file (if needed) by: - a) converting it to an audio format that the Speech API can support, and / or - b) discarding video stream from the media file, and / or - c) discarding other audio streams from the media file; - 5) Reads the various parameters, e.g. sample rate, of the episode audio file; - 4) Uploads the episode audio file to Google Cloud Storage. 
- - :param stories_id: Story ID for the story to operate on. - :param enclosure: Enclosure to fetch. - """ - - try: - temp_dir = tempfile.mkdtemp('fetch_and_store') - except Exception as ex: - raise McPodcastFileStoreFailureException(f"Unable to create temporary directory: {ex}") - - # Fetch enclosure - input_filename = 'input_file' - input_file_path = os.path.join(temp_dir, input_filename) - log.info(f"Fetching enclosure {enclosure} to {input_file_path}...") - fetch_big_file(url=enclosure.url, dest_file=input_file_path, max_size=MAX_ENCLOSURE_SIZE) - log.info(f"Done fetching enclosure {enclosure} to {input_file_path}") - - if os.stat(input_file_path).st_size == 0: - # Might happen with misconfigured webservers - raise McPodcastFileFetchFailureException(f"Fetched file {input_file_path} is empty.") - - # Transcode if needed - input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir, filename=input_filename) - transcoded_file_obj = transcode_media_file_if_needed(input_media_file=input_file_obj) - - # Unset the variable so that we don't accidentally use it later - del input_filename, temp_dir - - if input_file_obj != transcoded_file_obj: - # Function did some transcoding and stored everything in yet another file - - # Remove the input file - _cleanup_temp_dir(temp=input_file_obj) - - # Consider the transcoded file the new input file - input_file_obj = transcoded_file_obj - - # (Re)read the properties of either the original or the transcoded file - media_info = media_file_info(media_file_path=input_file_obj.temp_full_path) - best_audio_stream = media_info.best_supported_audio_stream() - - # Store input file to GCS - try: - gcs = GCSStore() - gcs_uri = gcs.store_object( - local_file_path=input_file_obj.temp_full_path, - object_id=str(stories_id), - mime_type=best_audio_stream.audio_codec_class.mime_type(), - ) - - except Exception as ex: - - log.error(f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}") - - # Clean up, then 
raise further - _cleanup_temp_dir(temp=input_file_obj) - - raise ex - - # Clean up the locally stored file as we don't need it anymore - _cleanup_temp_dir(temp=input_file_obj) - - # FIXME diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py index 9a9b4c2817..dfc51cdb08 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/gcs_store.py @@ -10,8 +10,8 @@ from mediawords.util.log import create_logger -from ..config import PodcastTranscribeEpisodeConfig -from ..exceptions import McPodcastGCSStoreFailureException, McPodcastMisconfiguredGCSException +from ..config import AbstractPodcastGCBucketConfig, PodcastGCAuthConfig +from ..exceptions import McPodcastGCSStoreFailureException, McPodcastMisconfiguredGCSException, SoftException log = create_logger(__name__) @@ -21,14 +21,19 @@ class GCSStore(object): __slots__ = [ '__bucket_internal', - '__config', + '__auth_config', + '__bucket_config', ] - def __init__(self, config: Optional[PodcastTranscribeEpisodeConfig] = None): - if not config: - config = PodcastTranscribeEpisodeConfig() + def __init__(self, bucket_config: AbstractPodcastGCBucketConfig, auth_config: Optional[PodcastGCAuthConfig] = None): + if not bucket_config: + raise McPodcastMisconfiguredGCSException("Bucket configuration is unset.") - self.__config = config + if not auth_config: + auth_config = PodcastGCAuthConfig() + + self.__auth_config = auth_config + self.__bucket_config = bucket_config self.__bucket_internal = None @property @@ -37,11 +42,11 @@ def _bucket(self) -> Bucket: if not self.__bucket_internal: try: - storage_client = storage.Client.from_service_account_json(self.__config.gc_auth_json_file()) - self.__bucket_internal = 
storage_client.get_bucket(self.__config.gc_storage_bucket_name()) + storage_client = storage.Client.from_service_account_json(self.__auth_config.gc_auth_json_file()) + self.__bucket_internal = storage_client.get_bucket(self.__bucket_config.bucket_name()) except Exception as ex: raise McPodcastGCSStoreFailureException( - f"Unable to get GCS bucket '{self.__config.gc_storage_bucket_name()}': {ex}" + f"Unable to get GCS bucket '{self.__bucket_config.bucket_name()}': {ex}" ) return self.__bucket_internal @@ -66,7 +71,7 @@ def _blob_from_object_id(self, object_id: str) -> Blob: if not object_id: raise McPodcastMisconfiguredGCSException("Object ID is unset.") - remote_path = self._remote_path(path_prefix=self.__config.gc_storage_path_prefix(), object_id=object_id) + remote_path = self._remote_path(path_prefix=self.__bucket_config.path_prefix(), object_id=object_id) blob = self._bucket.blob(remote_path) return blob @@ -103,16 +108,14 @@ def object_exists(self, object_id: str) -> bool: return exists - def store_object(self, local_file_path: str, object_id: str, mime_type: Optional[str] = None) -> str: + def upload_object(self, local_file_path: str, object_id: str) -> None: """ - Store a local file to a remote location. + Upload a local file to a GCS object. Will overwrite existing objects with a warning. :param local_file_path: Local file that should be stored. :param object_id: Object ID under which the object should be stored. - :param mime_type: MIME type which, if set, will be stored as "Content-Type". - :return: Full Google Cloud Storage URI of the object, e.g. "gs:////". 
""" if not os.path.isfile(local_file_path): @@ -121,16 +124,39 @@ def store_object(self, local_file_path: str, object_id: str, mime_type: Optional if not object_id: raise McPodcastMisconfiguredGCSException("Object ID is unset.") - log.debug(f"Storing file '{local_file_path}' as object ID {object_id}...") + log.debug(f"Uploading file '{local_file_path}' as object ID {object_id}...") if self.object_exists(object_id=object_id): log.warning(f"Object {object_id} already exists, will overwrite.") blob = self._blob_from_object_id(object_id=object_id) - blob.upload_from_filename(filename=local_file_path, content_type=mime_type) + blob.upload_from_filename(filename=local_file_path, content_type='application/octet-stream') + + # FIXME write some tests + def download_object(self, object_id: str, local_file_path: str) -> None: + """ + Download a GCS object to a local file. + + :param object_id: Object ID of an object that should be downloaded. + :param local_file_path: Local file that the object should be stored to. + """ - return self.object_uri(object_id=object_id) + if os.path.isfile(local_file_path): + raise McPodcastMisconfiguredGCSException(f"Local file '{local_file_path}' already exists.") + + if not object_id: + raise McPodcastMisconfiguredGCSException("Object ID is unset.") + + log.debug(f"Downloading object ID {object_id} to file '{local_file_path}'...") + + if not self.object_exists(object_id=object_id): + # FIXME or is it a hard exception? + raise SoftException(f"Object ID {object_id} was not found.") + + blob = self._blob_from_object_id(object_id=object_id) + + blob.download_to_filename(filename=local_file_path) def delete_object(self, object_id: str) -> None: """ @@ -138,6 +164,8 @@ def delete_object(self, object_id: str) -> None: Doesn't raise if object doesn't exist. + Used mostly for running tests, e.g. to find out what happens if the object to be fetched doesn't exist anymore. + :param object_id: Object ID that should be deleted. 
""" @@ -169,8 +197,8 @@ def object_uri(self, object_id: str) -> str: raise McPodcastMisconfiguredGCSException("Object ID is unset.") uri = "gs://{host}/{remote_path}".format( - host=self.__config.gc_storage_bucket_name(), - remote_path=self._remote_path(path_prefix=self.__config.gc_storage_path_prefix(), object_id=object_id), + host=self.__bucket_config.bucket_name(), + remote_path=self._remote_path(path_prefix=self.__bucket_config.path_prefix(), object_id=object_id), ) return uri diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_info.py similarity index 51% rename from apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py rename to apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_info.py index 1702c42f2c..c642d33a33 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_file.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/media_info.py @@ -1,9 +1,6 @@ import dataclasses -import subprocess import math import os -import shutil -import tempfile from typing import Type, Optional, List # noinspection PyPackageRequirements @@ -11,11 +8,7 @@ from mediawords.util.log import create_logger -from ..exceptions import ( - McPodcastMisconfiguredTranscoderException, - McPodcastFileIsInvalidException, - McPodcastFileStoreFailureException, -) +from ..exceptions import McPodcastMisconfiguredTranscoderException, McPodcastFileIsInvalidException from .audio_codecs import ( AbstractAudioCodec, Linear16AudioCodec, @@ -173,127 +166,3 @@ def media_file_info(media_file_path: str) -> MediaFileInfo: audio_streams=audio_streams, has_video_streams=has_video_streams, ) - - -@dataclasses.dataclass -class TranscodeTempDirAndFile(object): - """ - Temporary directory and 
filename for transcoding. - - It is assumed that caller is free to recursively remove 'temp_directory' after making use of the transcoded file. - """ - temp_dir: str - filename: str - - @property - def temp_full_path(self) -> str: - """Return full path to file.""" - return os.path.join(self.temp_dir, self.filename) - - -def transcode_media_file_if_needed(input_media_file: TranscodeTempDirAndFile) -> TranscodeTempDirAndFile: - """ - Transcode file (if needed) to something that Speech API will support. - - * If input has a video stream, it will be discarded; - * If input has more than one audio stream, others will be discarded leaving only one (preferably the one that Speech - API can support); - * If input doesn't have an audio stream in Speech API-supported codec, it will be transcoded to lossless - FLAC 16 bit in order to preserve quality; - * If the chosen audio stream has multiple channels (e.g. stereo or 5.1), it will be mixed into a single (mono) - channel as Speech API supports multi-channel recognition only when different voices speak into each of the - channels. - - :param input_media_file: Temporary directory and input media file to consider transcoding. - :return: Either the same 'input_media_file' if file wasn't transcoded, or new TranscodeTempDirAndFile() if it was. 
- """ - - if not os.path.isdir(input_media_file.temp_dir): - # Directory should exist; if it doesn't, it's a critical problem either in the filesystem or the code - raise McPodcastMisconfiguredTranscoderException(f"Directory '{input_media_file.temp_dir}' does not exist.") - - if not os.path.isfile(input_media_file.temp_full_path): - raise McPodcastMisconfiguredTranscoderException(f"File '{input_media_file}' does not exist.") - - # Independently from what has told us, identify the file type again ourselves - media_info = media_file_info(media_file_path=input_media_file.temp_full_path) - - if not media_info.audio_streams: - raise McPodcastFileIsInvalidException("Downloaded file doesn't appear to have any audio streams.") - - ffmpeg_args = [] - - supported_audio_stream = media_info.best_supported_audio_stream() - if supported_audio_stream: - log.info(f"Found a supported audio stream") - - # Test if there is more than one audio stream - if len(media_info.audio_streams) > 1: - log.info(f"Found other audio streams besides the supported one, will discard those") - - ffmpeg_args.extend(['-f', supported_audio_stream.audio_codec_class.ffmpeg_container_format()]) - - # Select all audio streams - ffmpeg_args.extend(['-map', '0:a']) - - for stream in media_info.audio_streams: - # Deselect the unsupported streams - if stream != supported_audio_stream: - ffmpeg_args.extend(['-map', f'-0:a:{stream.ffmpeg_stream_index}']) - - # If a stream of a supported codec was not found, transcode it to FLAC 16 bit in order to not lose any quality - else: - log.info(f"None of the audio streams are supported by the Speech API, will transcode to FLAC") - - # Map first audio stream to input 0 - ffmpeg_args.extend(['-map', '0:a:0']) - - # Transcode to FLAC (16 bit) in order to not lose any quality - ffmpeg_args.extend(['-acodec', 'flac']) - ffmpeg_args.extend(['-f', 'flac']) - ffmpeg_args.extend(['-sample_fmt', 's16']) - - # Ensure that we end up with mono audio - ffmpeg_args.extend(['-ac', '1']) - 
- # If there's video in the file (e.g. video), remove it - if media_info.has_video_streams: - # Discard all video streams - ffmpeg_args.extend(['-map', '-0:v']) - - if ffmpeg_args: - - temp_filename = 'transcoded_file' - - try: - temp_dir = tempfile.mkdtemp('media_file') - except Exception as ex: - raise McPodcastFileStoreFailureException(f"Unable to create temporary directory: {ex}") - - temp_file_path = os.path.join(temp_dir, temp_filename) - - try: - log.info(f"Transcoding {input_media_file.temp_full_path} to {temp_file_path}...") - - # I wasn't sure how to map outputs in "ffmpeg-python" library so here we call ffmpeg directly - ffmpeg_command = ['ffmpeg', '-nostdin', '-hide_banner', - '-i', input_media_file.temp_full_path] + ffmpeg_args + [temp_file_path] - log.debug(f"FFmpeg command: {ffmpeg_command}") - subprocess.check_call(ffmpeg_command) - - log.info(f"Done transcoding {input_media_file.temp_full_path} to {temp_file_path}") - - except Exception as ex: - - shutil.rmtree(temp_dir) - - raise McPodcastFileIsInvalidException(f"Unable to transcode {input_media_file.temp_full_path}: {ex}") - - result_media_file = TranscodeTempDirAndFile(temp_dir=temp_dir, filename=temp_filename) - - else: - - # Return the same file as it wasn't touched - result_media_file = input_media_file - - return result_media_file diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/speech_api.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/speech_api.py new file mode 100644 index 0000000000..8962c63e42 --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/speech_api.py @@ -0,0 +1,202 @@ +# FIXME upload transcriptions directly to GCS once that's no longer a demo feature: +# https://cloud.google.com/speech-to-text/docs/async-recognize#speech_transcribe_async_gcs-python + +from typing import Optional + +# noinspection PyPackageRequirements +from 
google.api_core.exceptions import InvalidArgument, NotFound +# noinspection PyPackageRequirements +from google.api_core.operation import from_gapic, Operation +# noinspection PyPackageRequirements +from google.api_core.retry import Retry +# noinspection PyPackageRequirements +from google.cloud.speech_v1p1beta1 import ( + SpeechClient, RecognitionConfig, RecognitionAudio, LongRunningRecognizeResponse, LongRunningRecognizeMetadata, +) + +from mediawords.util.log import create_logger + +from .transcript import Transcript, UtteranceAlternative, Utterance +from ..config import PodcastGCAuthConfig +from ..exceptions import ( + McPodcastMisconfiguredSpeechAPIException, + McPodcastSpeechAPIRequestFailedException, + McMisconfiguredSpeechAPIException, + HardException, +) +from .media_info import MediaFileInfoAudioStream + +log = create_logger(__name__) + +# Speech API sometimes throws: +# +# google.api_core.exceptions.ServiceUnavailable: 503 failed to connect to all addresses +# +# so let it retry for 10 minutes or so. +_GOOGLE_API_RETRIES = Retry(initial=5, maximum=60, multiplier=2, deadline=60 * 10) +"""Google Cloud API's own retry policy.""" + + +def submit_transcribe_operation(gs_uri: str, + episode_metadata: MediaFileInfoAudioStream, + bcp47_language_code: str) -> str: + """ + Submit a Speech API long running operation to transcribe a podcast episode. + + :param gs_uri: Google Cloud Storage URI to a transcoded episode. + :param episode_metadata: Metadata derived from the episode while transcoding it. + :param bcp47_language_code: Episode's BCP 47 language code guessed from story's title + description. + :return Google Speech API operation ID by which the transcription operation can be referred to. 
+ """ + + auth_config = PodcastGCAuthConfig() + + try: + client = SpeechClient.from_service_account_json(auth_config.gc_auth_json_file()) + except Exception as ex: + raise McPodcastMisconfiguredSpeechAPIException(f"Unable to create Speech API client: {ex}") + + try: + # noinspection PyTypeChecker + config = RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding(episode_metadata.audio_codec_class.speech_api_codec()), + sample_rate_hertz=episode_metadata.sample_rate, + # We always set the channel count to 1 and disable separate recognition per channel as our inputs are all + # mono audio files and do not have separate speakers per audio channel. + audio_channel_count=1, + enable_separate_recognition_per_channel=False, + language_code=bcp47_language_code, + alternative_language_codes=[ + # FIXME add all Chinese variants + # FIXME add Mexican Spanish variants + ], + + speech_contexts=[ + # Speech API works pretty well without custom contexts + ], + # Don't care that much about word confidence + enable_word_confidence=False, + # Punctuation doesn't work that well but we still enable it here + enable_automatic_punctuation=True, + # Not setting 'model' as 'use_enhanced' will then choose the best model for us + # Using enhanced (more expensive) model, where available + use_enhanced=True, + ) + except Exception as ex: + raise McPodcastMisconfiguredSpeechAPIException(f"Unable to initialize Speech API configuration: {ex}") + + log.info(f"Submitting a Speech API operation for URI {gs_uri}...") + + try: + + # noinspection PyTypeChecker + audio = RecognitionAudio(uri=gs_uri) + + speech_operation = client.long_running_recognize(config=config, audio=audio, retry=_GOOGLE_API_RETRIES) + + except Exception as ex: + raise McPodcastSpeechAPIRequestFailedException(f"Unable to submit a Speech API operation: {ex}") + + try: + # We get the operation name in a try-except block because accessing it is not that well documented, so Google + # might change the property names whenever 
they please and we wouldn't necessarily notice otherwise + operation_id = speech_operation.operation.name + if not operation_id: + raise McPodcastMisconfiguredSpeechAPIException(f"Operation name is empty.") + except Exception as ex: + raise McPodcastMisconfiguredSpeechAPIException(f"Unable to get operation name: {ex}") + + log.info(f"Submitted Speech API operation for URI {gs_uri}") + + return operation_id + + +def fetch_transcript(speech_operation_id: str) -> Optional[Transcript]: + """ + Try to fetch a transcript for a given speech operation ID. + + :param speech_operation_id: Speech operation ID. + :return: Transcript, or None if the transcript hasn't been prepared yet. + """ + if not speech_operation_id: + raise McMisconfiguredSpeechAPIException(f"Speech operation ID is unset.") + + auth_config = PodcastGCAuthConfig() + + try: + client = SpeechClient.from_service_account_json(auth_config.gc_auth_json_file()) + except Exception as ex: + raise McMisconfiguredSpeechAPIException(f"Unable to initialize Speech API operations client: {ex}") + + try: + operation = client.transport.operations_client.get_operation( + name=speech_operation_id, + retry=_GOOGLE_API_RETRIES, + ) + except InvalidArgument as ex: + raise McMisconfiguredSpeechAPIException(f"Invalid operation ID '{speech_operation_id}': {ex}") + except NotFound as ex: + # FIXME we should be resubmitting the media file for a new transcript when that happens + raise HardException(f"Operation ID '{speech_operation_id}' was not found: {ex}") + except Exception as ex: + # On any other errors, raise a hard exception + raise McMisconfiguredSpeechAPIException(f"Error while fetching operation ID '{speech_operation_id}': {ex}") + + if not operation: + raise McMisconfiguredSpeechAPIException(f"Operation is unset.") + + try: + gapic_operation: Operation = from_gapic( + operation=operation, + operations_client=client.transport.operations_client, + result_type=LongRunningRecognizeResponse, + 
metadata_type=LongRunningRecognizeMetadata, + retry=_GOOGLE_API_RETRIES, + ) + except Exception as ex: + raise McMisconfiguredSpeechAPIException(f"Unable to create GAPIC operation: {ex}") + + log.debug(f"GAPIC operation: {gapic_operation}") + log.debug(f"Operation metadata: {gapic_operation.metadata}") + log.debug(f"Operation is done: {gapic_operation.done()}") + log.debug(f"Operation error: {gapic_operation.done()}") + + try: + operation_is_done = gapic_operation.done(retry=_GOOGLE_API_RETRIES) + except Exception as ex: + # 'done' attribute might be gone in a newer version of the Speech API client + raise McMisconfiguredSpeechAPIException( + f"Unable to test whether operation '{speech_operation_id}' is done: {ex}" + ) + + if not operation_is_done: + log.info(f"Operation '{speech_operation_id}' is still not done.") + return None + + utterances = [] + + try: + for result in gapic_operation.result(retry=_GOOGLE_API_RETRIES).results: + + alternatives = [] + for alternative in result.alternatives: + alternatives.append( + UtteranceAlternative( + text=alternative.transcript.strip(), + confidence=alternative.confidence, + ) + ) + + utterances.append( + Utterance( + alternatives=alternatives, + bcp47_language_code=result.language_code, + ) + ) + + except Exception as ex: + raise HardException( + f"Unable to read transcript for operation '{speech_operation_id}' due to other error: {ex}" + ) + + return Transcript(utterances=utterances) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcode.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcode.py new file mode 100644 index 0000000000..580a18b123 --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcode.py @@ -0,0 +1,92 @@ +import subprocess +import os + +from mediawords.util.log import create_logger + +from ..exceptions import McPodcastMisconfiguredTranscoderException, 
McPodcastFileIsInvalidException +from .media_info import media_file_info + +log = create_logger(__name__) + + +def maybe_transcode_file(input_file: str, maybe_output_file: str) -> bool: + """ + Transcode file (if needed) to something that Speech API will support. + + * If input has a video stream, it will be discarded; + * If input has more than one audio stream, others will be discarded leaving only one (preferably the one that Speech + API can support); + * If input doesn't have an audio stream in Speech API-supported codec, it will be transcoded to lossless + FLAC 16 bit in order to preserve quality; + * If the chosen audio stream has multiple channels (e.g. stereo or 5.1), it will be mixed into a single (mono) + channel as Speech API supports multi-channel recognition only when different voices speak into each of the + channels. + + :param input_file: Input media file to consider for transcoding. + :param maybe_output_file: If we decide to transcode, output media file to transcode to. + :return: True if file had to be transcoded into "maybe_output_file", or False if input file can be used as it is. 
+ """ + + if not os.path.isfile(input_file): + raise McPodcastMisconfiguredTranscoderException(f"File '{input_file}' does not exist.") + + # Independently from what has told us, identify the file type again ourselves + media_info = media_file_info(media_file_path=input_file) + + if not media_info.audio_streams: + raise McPodcastFileIsInvalidException("Downloaded file doesn't appear to have any audio streams.") + + ffmpeg_args = [] + + supported_audio_stream = media_info.best_supported_audio_stream() + if supported_audio_stream: + log.info(f"Found a supported audio stream") + + # Test if there is more than one audio stream + if len(media_info.audio_streams) > 1: + log.info(f"Found other audio streams besides the supported one, will discard those") + + ffmpeg_args.extend(['-f', supported_audio_stream.audio_codec_class.ffmpeg_container_format()]) + + # Select all audio streams + ffmpeg_args.extend(['-map', '0:a']) + + for stream in media_info.audio_streams: + # Deselect the unsupported streams + if stream != supported_audio_stream: + ffmpeg_args.extend(['-map', f'-0:a:{stream.ffmpeg_stream_index}']) + + # If a stream of a supported codec was not found, transcode it to FLAC 16 bit in order to not lose any quality + else: + log.info(f"None of the audio streams are supported by the Speech API, will transcode to FLAC") + + # Map first audio stream to input 0 + ffmpeg_args.extend(['-map', '0:a:0']) + + # Transcode to FLAC (16 bit) in order to not lose any quality + ffmpeg_args.extend(['-acodec', 'flac']) + ffmpeg_args.extend(['-f', 'flac']) + ffmpeg_args.extend(['-sample_fmt', 's16']) + + # Ensure that we end up with mono audio + ffmpeg_args.extend(['-ac', '1']) + + # If there's video in the file (e.g. 
video), remove it + if media_info.has_video_streams: + # Discard all video streams + ffmpeg_args.extend(['-map', '-0:v']) + + if not ffmpeg_args: + # No need to transcode -- caller should use the input file as-is + return False + + log.info(f"Transcoding '{input_file}' to '{maybe_output_file}'...") + + # I wasn't sure how to map outputs in "ffmpeg-python" library so here we call ffmpeg directly + ffmpeg_command = ['ffmpeg', '-nostdin', '-hide_banner', '-i', input_file] + ffmpeg_args + [maybe_output_file] + log.debug(f"FFmpeg command: {ffmpeg_command}") + subprocess.check_call(ffmpeg_command) + + log.info(f"Done transcoding '{input_file}' to '{maybe_output_file}'") + + return True diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcript.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcript.py new file mode 100644 index 0000000000..fae528533b --- /dev/null +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_episode/transcript.py @@ -0,0 +1,68 @@ +import abc +import dataclasses +from typing import List, Dict, Any + + +class _AbstractFromDict(object, metaclass=abc.ABCMeta): + + @classmethod + @abc.abstractmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> '_AbstractFromDict': + raise NotImplementedError + + +@dataclasses.dataclass +class UtteranceAlternative(object): + """One of the alternatives of what might have been said in an utterance.""" + + text: str + """Utterance text.""" + + confidence: float + """How confident Speech API is that it got it right.""" + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> 'UtteranceAlternative': + return cls( + text=input_dict['text'], + confidence=input_dict['confidence'], + ) + + +@dataclasses.dataclass +class Utterance(object): + """A single transcribed utterance (often but not always a single sentence).""" + + alternatives: List[UtteranceAlternative] + """Alternatives of what 
might have been said in an utterance, ordered from the best to the worst guess.""" + + bcp47_language_code: str + """BCP 47 language code; might be different from what we've passed as the input.""" + + @property + def best_alternative(self) -> UtteranceAlternative: + """Return best alternative for what might have been said in an utterance.""" + return self.alternatives[0] + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> 'Utterance': + raise cls( + alternatives=[UtteranceAlternative.from_dict(x) for x in input_dict['alternatives']], + bcp47_language_code=input_dict['bcp47_language_code'], + ) + + +@dataclasses.dataclass +class Transcript(object): + """A single transcript.""" + + utterances: List[Utterance] + """List of ordered utterances in a transcript.""" + + # Only Transcript is to be serialized to JSON so to_dict() is implemented only here + def to_dict(self) -> Dict[str, Any]: + return dataclasses.asdict(self) + + @classmethod + def from_dict(cls, input_dict: Dict[str, Any]) -> 'Transcript': + return cls(utterances=[Utterance.from_dict(x) for x in input_dict['utterances']]) diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py index f84c517e89..46c17174e9 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py @@ -1,177 +1,15 @@ -import abc -from typing import Optional - -# noinspection PyPackageRequirements -from google.api_core.exceptions import InvalidArgument, NotFound, GoogleAPICallError -# noinspection PyPackageRequirements -from google.api_core.operation import from_gapic, Operation -# noinspection PyPackageRequirements -from google.api_core.operations_v1 import OperationsClient -# noinspection PyPackageRequirements -from 
google.cloud.speech_v1p1beta1 import SpeechClient, LongRunningRecognizeResponse, LongRunningRecognizeMetadata - from mediawords.db import DatabaseHandler from mediawords.dbi.downloads import create_download_for_new_story from mediawords.dbi.downloads.store import store_content from mediawords.util.log import create_logger -from ..config import PodcastTranscribeEpisodeConfig -from ..exceptions import ( - McDatabaseNotFoundException, - McMisconfiguredSpeechAPIException, - HardException, SoftException, -) -from .transcript import UtteranceAlternative, Utterance, Transcript +from src.python.podcast_transcribe_episode.fetch_episode.transcript import Transcript log = create_logger(__name__) -class AbstractHandler(object, metaclass=abc.ABCMeta): - """ - Abstract class that fetches and stores a transcript. - - Useful for testing as we can create a mock class which pretends to do it. - """ - - @classmethod - @abc.abstractmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - """ - Attempt fetching a Speech API transcript for a given operation ID. - - :param db: Database handler. - :param podcast_episode_transcript_fetches_id: Transcript fetch attempt ID. - :return: None if transcript is not finished yet, a Transcript object otherwise. - """ - raise NotImplemented("Abstract method") - - @classmethod - @abc.abstractmethod - def store_transcript(cls, db: DatabaseHandler, transcript: Transcript) -> int: - """ - Store transcript to raw download store. - - We could write this directly to "download_texts", but if we decide to reextract everything (after, say, updating - an extractor), that "download_texts" row might disappear, so it's safer to just store a raw download on the - key-value store as if it was a HTML file or something. - - :param db: Database handler. - :param transcript: Transcript object. - :return: Download ID for a download that was created. 
- """ - raise NotImplemented("Abstract method") - - class DefaultHandler(AbstractHandler): - @classmethod - def fetch_transcript(cls, db: DatabaseHandler, podcast_episode_transcript_fetches_id: int) -> Optional[Transcript]: - transcript_fetch = db.find_by_id( - table='podcast_episode_transcript_fetches', - object_id=podcast_episode_transcript_fetches_id, - ) - if not transcript_fetch: - raise McDatabaseNotFoundException( - f"Unable to find transcript fetch with ID {podcast_episode_transcript_fetches_id}" - ) - podcast_episodes_id = transcript_fetch['podcast_episodes_id'] - - episode = db.find_by_id(table='podcast_episodes', object_id=podcast_episodes_id) - if not episode: - raise McDatabaseNotFoundException( - f"Unable to find podcast episode with ID {podcast_episodes_id}" - ) - - stories_id = episode['stories_id'] - speech_operation_id = episode['speech_operation_id'] - - if not speech_operation_id: - raise McMisconfiguredSpeechAPIException(f"Speech ID for podcast episode {podcast_episodes_id} is unset.") - - try: - config = PodcastTranscribeEpisodeConfig() - client = SpeechClient.from_service_account_json(config.gc_auth_json_file()) - operations_client = OperationsClient(channel=client._transport._grpc_channel) - except Exception as ex: - raise McMisconfiguredSpeechAPIException(f"Unable to initialize Speech API operations client: {ex}") - - try: - operation = operations_client.get_operation(name=speech_operation_id) - except InvalidArgument as ex: - raise McMisconfiguredSpeechAPIException(f"Invalid operation ID '{speech_operation_id}': {ex}") - except NotFound as ex: - # Not a "hard" failure as sometimes these operations expire - # FIXME although we should be resubmitting the media file for a new transcript when that happens - raise SoftException(f"Operation ID '{speech_operation_id}' was not found: {ex}") - except Exception as ex: - # On any other errors, raise a hard exception - raise McMisconfiguredSpeechAPIException(f"Error while fetching operation ID 
'{speech_operation_id}': {ex}") - - if not operation: - raise McMisconfiguredSpeechAPIException(f"Operation is unset.") - - try: - gapic_operation: Operation = from_gapic( - operation, - operations_client, - LongRunningRecognizeResponse, - metadata_type=LongRunningRecognizeMetadata, - ) - except Exception as ex: - raise McMisconfiguredSpeechAPIException(f"Unable to create GAPIC operation: {ex}") - - log.debug(f"GAPIC operation: {gapic_operation}") - log.debug(f"Operation metadata: {gapic_operation.metadata}") - log.debug(f"Operation is done: {gapic_operation.done()}") - log.debug(f"Operation error: {gapic_operation.done()}") - - try: - operation_is_done = gapic_operation.done() - except Exception as ex: - # 'done' attribute might be gone in a newer version of the Speech API client - raise McMisconfiguredSpeechAPIException( - f"Unable to test whether operation '{speech_operation_id}' is done: {ex}" - ) - - if not operation_is_done: - log.info(f"Operation '{speech_operation_id}' is still not done.") - return None - - utterances = [] - - try: - for result in gapic_operation.result().results: - - alternatives = [] - for alternative in result.alternatives: - alternatives.append( - UtteranceAlternative( - text=alternative.transcript.strip(), - confidence=alternative.confidence, - ) - ) - - utterances.append( - Utterance( - alternatives=alternatives, - bcp47_language_code=result.language_code, - ) - ) - - except GoogleAPICallError as ex: - # When Speech API returns with an error, it's unclear whether it was us who have messed up or something is - # (temporarily) wrong on their end, so on the safe side we throw a "hard" exception. 
- raise HardException( - f"Unable to read transcript for operation '{speech_operation_id}' due to API error: {ex}" - ) - - except Exception as ex: - raise HardException( - f"Unable to read transcript for operation '{speech_operation_id}' due to other error: {ex}" - ) - - return Transcript(stories_id=stories_id, utterances=utterances) - @classmethod def _download_text_from_transcript(cls, transcript: Transcript) -> str: best_utterance_alternatives = [] diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/transcript.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/transcript.py deleted file mode 100644 index edfbd257a4..0000000000 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/transcript.py +++ /dev/null @@ -1,40 +0,0 @@ -import dataclasses -from typing import List - - -@dataclasses.dataclass -class UtteranceAlternative(object): - """One of the alternatives of what might have been said in an utterance.""" - - text: str - """Utterance text.""" - - confidence: float - """How confident Speech API is that it got it right.""" - - -@dataclasses.dataclass -class Utterance(object): - """A single transcribed utterance (often but not always a single sentence).""" - - alternatives: List[UtteranceAlternative] - """Alternatives of what might have been said in an utterance, ordered from the best to the worst guess.""" - - bcp47_language_code: str - """BCP 47 language code; might be different from what we've passed as the input.""" - - @property - def best_alternative(self) -> UtteranceAlternative: - """Return best alternative for what might have been said in an utterance.""" - return self.alternatives[0] - - -@dataclasses.dataclass -class Transcript(object): - """A single transcript.""" - - stories_id: int - """Story ID.""" - - utterances: List[Utterance] - """List of ordered utterances in a transcript.""" diff --git 
a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py index bfb1854689..3290807df1 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/shared.py @@ -2,9 +2,11 @@ # FIXME post-init validation of dataclasses (https://docs.python.org/3/library/dataclasses.html#post-init-processing) # FIXME workflow logger # FIXME if something's wrong (e.g. the episode doesn't look valid), should the workflow succeed or fail? +# FIXME remove "Podcast(Transcribe)..." prefix from everywhere +# FIXME transient vs non-transient errors +# FIXME don't exit(1) if connect_to_db() fails import dataclasses -import enum from datetime import timedelta from typing import Optional @@ -15,6 +17,7 @@ from .exceptions import HardException from .fetch_episode.enclosure import StoryEnclosure +from .fetch_episode.media_info import MediaFileInfoAudioStream TASK_QUEUE = "podcast-transcribe-episode" """Temporal task queue.""" @@ -52,45 +55,6 @@ """ -@enum.unique -class AudioCodec(enum.Enum): - """ - Audio file codec that's supported by Google Speech API. - - https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1 - """ - LINEAR16 = 'LINEAR16', - FLAC = 'FLAC' - MULAW = 'MULAW' - OGG_OPUS = 'OGG_OPUS' - MP3 = 'MP3' - - -@dataclasses.dataclass(frozen=True) -class EpisodeMetadata(object): - """Metadata about an episode to be transcribed.""" - - duration: int - """Episode's duration in seconds.""" - - codec: AudioCodec - """Episode's codec.""" - - sample_rate: int - """Episode's sample rate (Hz) as determined by transcoder, e.g. 44100.""" - - def __post_init__(self) -> None: - """Validate episode's metadata.""" - - if self.duration <= 0: - # FIXME could it be zero? 
- raise ValueError('Episode duration is not positive.') - if not self.codec: - raise ValueError('Episode codec is not set.') - if self.sample_rate <= 1000: - raise ValueError('Episode sample rate is not correct.') - - class AbstractPodcastTranscribeActivities(object): """Activities interface.""" @@ -211,30 +175,44 @@ async def fetch_enclosure_to_gcs(self, stories_id: int, enclosure: StoryEnclosur maximum_attempts=20, ), ) - async def fetch_transcode_store_episode(self, stories_id: int) -> EpisodeMetadata: + async def fetch_transcode_store_episode(self, stories_id: int) -> MediaFileInfoAudioStream: """ Fetch episode from GCS, transcode it if needed and store it to GCS again in a separate bucket. Now that the raw episode file is safely located in GCS, we can try transcoding it. :param stories_id: Story ID the episode of which should be transcoded. - :return: Metadata determined as part of the transcoding. + :return: Metadata of the best audio stream determined as part of the transcoding. """ raise NotImplementedError @activity_method( task_queue=TASK_QUEUE, # schedule_to_start_timeout=None, - start_to_close_timeout=timedelta(seconds=60), + + # Give a bit more time as the implementation is likely to do some non-Temporal retries on weird Speech API + # errors + start_to_close_timeout=timedelta(minutes=5), + # schedule_to_close_timeout=None, # heartbeat_timeout=None, - # FIXME don't submit too many operations - retry_parameters=RETRY_PARAMETERS, + retry_parameters=dataclasses.replace( + RETRY_PARAMETERS, + + # Given that the thing is costly, wait a whole hour before retrying anything + initial_interval=timedelta(hours=1), + + # Hope for the Speech API to resurrect in a week + maximum_interval=timedelta(weeks=1), + + # Don't retry too much as each try is potentially very costly + maximum_attempts=10, + ), ) async def submit_transcribe_operation(self, stories_id: int, - episode_metadata: EpisodeMetadata, + episode_metadata: MediaFileInfoAudioStream, bcp47_language_code: 
str) -> str: raise NotImplementedError diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/__init__.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py deleted file mode 100644 index 2c50de8010..0000000000 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/submit_operation/submit_operation.py +++ /dev/null @@ -1,213 +0,0 @@ -import time -from typing import Dict, Any - -# noinspection PyPackageRequirements -from google.api_core.exceptions import ServiceUnavailable -# noinspection PyPackageRequirements -from google.cloud.speech_v1p1beta1 import SpeechClient, RecognitionConfig - -from mediawords.db import DatabaseHandler -from mediawords.util.log import create_logger - -from ..config import PodcastTranscribeEpisodeConfig -from ..exceptions import ( - McPodcastDatabaseErrorException, - McPodcastInvalidInputException, - McPodcastMisconfiguredSpeechAPIException, - McPodcastSpeechAPIRequestFailedException, - SoftException, -) - -log = create_logger(__name__) - -MAX_DURATION = 60 * 60 * 2 -"""Max. podcast episode duration (in seconds) to submit for transcription.""" - -MAX_RETRIES = 10 -"""Max. number of retries for submitting a Speech API long running operation.""" - -DELAY_BETWEEN_RETRIES = 5 -"""How long to wait (in seconds) between retries.""" - - -class PodcastEpisode(object): - """ - Podcast episode object. - - Postprocesses database row from "podcast_episodes" and does some extra checks. 
- """ - __slots__ = [ - '__stories_id', - '__podcast_episodes_id', - '__gcs_uri', - '__duration', - '__codec', - '__sample_rate', - '__bcp47_language_code', - ] - - def __init__(self, stories_id: int, db_row: Dict[str, Any]): - self.__stories_id = stories_id - self.__podcast_episodes_id = db_row['podcast_episodes_id'] - self.__gcs_uri = db_row['gcs_uri'] - self.__duration = db_row['duration'] - self.__codec = db_row['codec'] - self.__sample_rate = db_row['sample_rate'] - self.__bcp47_language_code = db_row['bcp47_language_code'] - - @property - def stories_id(self) -> int: - return self.__stories_id - - @property - def podcast_episodes_id(self) -> int: - return self.__podcast_episodes_id - - @property - def gcs_uri(self) -> str: - if not self.__gcs_uri.startswith('gs://'): - raise McPodcastInvalidInputException("Google Cloud Storage URI doesn't have gs:// prefix.") - return self.__gcs_uri - - @property - def duration(self) -> int: - if not self.__duration: - raise McPodcastInvalidInputException("Duration is unset or zero.") - return self.__duration - - @property - def codec(self) -> RecognitionConfig.AudioEncoding: - try: - encoding_obj = getattr(RecognitionConfig.AudioEncoding, self.__codec) - except Exception as ex: - raise McPodcastInvalidInputException(f"Invalid codec '{self.__codec}': {ex}") - - return encoding_obj - - @property - def sample_rate(self) -> int: - if not self.__sample_rate: - raise McPodcastInvalidInputException("Sample rate is unset or zero.") - return self.__sample_rate - - @property - def bcp47_language_code(self) -> str: - if '-' not in self.__bcp47_language_code and self.__bcp47_language_code != 'zh': - raise McPodcastInvalidInputException(f"Invalid BCP 47 language code '{self.__bcp47_language_code}'.") - return self.__bcp47_language_code - - -def get_podcast_episode(db: DatabaseHandler, stories_id: int) -> PodcastEpisode: - """ - Get podcast episode object for story ID. - - :param db: Database handler. - :param stories_id: Story ID. 
- :return: Podcast episode object. - """ - try: - podcast_episodes = db.select( - table='podcast_episodes', - what_to_select='*', - condition_hash={'stories_id': stories_id}, - ).hashes() - - except Exception as ex: - raise McPodcastDatabaseErrorException(f"Unable to fetch story's {stories_id} podcast episodes: {ex}") - - if not podcast_episodes: - raise SoftException(f"There are no podcast episodes for story {stories_id}") - - if len(podcast_episodes) > 1: - # That's very weird, there should be only one episode per story - raise McPodcastDatabaseErrorException(f"There's more than one podcast episode for story {stories_id}") - - try: - episode = PodcastEpisode(stories_id=stories_id, db_row=podcast_episodes[0]) - except Exception as ex: - raise McPodcastInvalidInputException(f"Invalid episode for story {stories_id}: {ex}") - - if episode.duration > MAX_DURATION: - raise SoftException(f"Story's {stories_id} podcast episode is too long ({episode.duration} seconds).") - - return episode - - -def submit_transcribe_operation(episode: PodcastEpisode) -> int: - """ - Submit a Speech API long running operation to transcribe a podcast episode. - - :param episode: Podcast episode object. - :return Operation's ID to use for fetching operation results. - """ - - try: - config = PodcastTranscribeEpisodeConfig() - client = SpeechClient.from_service_account_json(config.gc_auth_json_file()) - except Exception as ex: - raise McPodcastMisconfiguredSpeechAPIException(f"Unable to create Speech API client: {ex}") - - try: - config = RecognitionConfig( - encoding=episode.codec, - sample_rate_hertz=episode.sample_rate, - # We always set the channel count to 1 and disable separate recognition per channel as our inputs are all - # mono audio files and do not have separate speakers per audio channel. 
- audio_channel_count=1, - enable_separate_recognition_per_channel=False, - language_code=episode.bcp47_language_code, - alternative_language_codes=[ - # FIXME add all Chinese variants - # FIXME add Mexican Spanish variants - ], - - speech_contexts=[ - # Speech API works pretty well without custom contexts - ], - # Don't care that much about word confidence - enable_word_confidence=False, - # Punctuation doesn't work that well but we still enable it here - enable_automatic_punctuation=True, - # Not setting 'model' as 'use_enhanced' will then choose the best model for us - # Using enhanced (more expensive) model, where available - use_enhanced=True, - ) - except Exception as ex: - raise McPodcastMisconfiguredSpeechAPIException(f"Unable to initialize Speech API configuration: {ex}") - - log.info(f"Submitting a Speech API operation for story {episode.stories_id}...") - speech_operation = None - for attempt in range(1, MAX_RETRIES + 1): - - if attempt > 1: - log.warning(f"Waiting for {DELAY_BETWEEN_RETRIES} seconds and retrying #{attempt}...") - time.sleep(DELAY_BETWEEN_RETRIES) - - try: - speech_operation = client.long_running_recognize(config=config, audio={"uri": episode.gcs_uri}) - except ServiceUnavailable as ex: - # Speech API sometimes throws: - # - # google.api_core.exceptions.ServiceUnavailable: 503 failed to connect to all addresses - # - log.error(f"Unable to submit an operation because service is unavailable: {ex}") - except Exception as ex: - raise McPodcastSpeechAPIRequestFailedException(f"Unable to submit a Speech API operation: {ex}") - else: - break - - if not speech_operation: - raise McPodcastSpeechAPIRequestFailedException(f"Ran out of retries while submitting Speech API operation.") - - try: - # We get the operation name in a try-except block because accessing it is not that well documented, so Google - # might change the property names whenever they please and we wouldn't necessarily notice otherwise - operation_id = 
speech_operation.operation.name - if not operation_id: - raise McPodcastMisconfiguredSpeechAPIException(f"Operation name is empty.") - except Exception as ex: - raise McPodcastMisconfiguredSpeechAPIException(f"Unable to get operation name: {ex}") - - log.info(f"Submitted Speech API operation '{operation_id}' for story {episode.stories_id}") - - return operation_id diff --git a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py index 1659620b27..8ae208555f 100644 --- a/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py +++ b/apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/workflow.py @@ -1,18 +1,28 @@ -# noinspection PyPackageRequirements +import os +import tempfile from typing import Optional # noinspection PyPackageRequirements -from mediawords.util.identify_language import identification_would_be_reliable, language_code_for_text -from mediawords.util.parse_html import html_strip -from .fetch_episode.enclosure import podcast_viable_enclosure_for_story, MAX_ENCLOSURE_SIZE, StoryEnclosure - -from .exceptions import SoftException from temporal.workflow import Workflow from mediawords.db import connect_to_db +from mediawords.util.identify_language import identification_would_be_reliable, language_code_for_text +from mediawords.util.parse_html import html_strip +from .config import ( + PodcastGCRawEnclosuresBucketConfig, + PodcastGCTranscodedEpisodesBucketConfig, + MAX_ENCLOSURE_SIZE, + MAX_DURATION, +) +from .exceptions import SoftException, HardException +from .fetch_episode.enclosure import podcast_viable_enclosure_for_story, StoryEnclosure +from .fetch_episode.fetch_url import fetch_big_file +from .fetch_episode.gcs_store import GCSStore from .fetch_episode.bcp47_lang import iso_639_1_code_to_bcp_47_identifier - +from .fetch_episode.media_info import MediaFileInfoAudioStream, media_file_info +from 
.fetch_episode.speech_api import submit_transcribe_operation +from .fetch_episode.transcode import maybe_transcode_file from .shared import ( AbstractPodcastTranscribeWorkflow, AbstractPodcastTranscribeActivities, @@ -75,6 +85,77 @@ async def determine_best_enclosure(self, stories_id: int) -> Optional[StoryEnclo return best_enclosure + async def fetch_enclosure_to_gcs(self, stories_id: int, enclosure: StoryEnclosure) -> None: + + with tempfile.TemporaryDirectory(prefix='fetch_enclosure_to_gcs') as temp_dir: + raw_enclosure_path = os.path.join(temp_dir, 'raw_enclosure') + fetch_big_file(url=enclosure.url, dest_file=raw_enclosure_path, max_size=MAX_ENCLOSURE_SIZE) + + if os.stat(raw_enclosure_path).st_size == 0: + # Might happen with misconfigured webservers + raise SoftException(f"Fetched file {raw_enclosure_path} is empty.") + + gcs = GCSStore(bucket_config=PodcastGCRawEnclosuresBucketConfig()) + gcs.upload_object(local_file_path=raw_enclosure_path, object_id=str(stories_id)) + + async def fetch_transcode_store_episode(self, stories_id: int) -> MediaFileInfoAudioStream: + + with tempfile.TemporaryDirectory(prefix='fetch_transcode_store_episode') as temp_dir: + raw_enclosure_path = os.path.join(temp_dir, 'raw_enclosure') + + gcs_raw_enclosures = GCSStore(bucket_config=PodcastGCRawEnclosuresBucketConfig()) + gcs_raw_enclosures.download_object( + object_id=str(stories_id), + local_file_path=raw_enclosure_path, + ) + del gcs_raw_enclosures + + if os.stat(raw_enclosure_path).st_size == 0: + # If somehow the file from GCS ended up being of zero length, then this is very much unexpected + raise HardException(f"Fetched file {raw_enclosure_path} is empty.") + + transcoded_episode_path = os.path.join(temp_dir, 'transcoded_episode') + + raw_enclosure_transcoded = maybe_transcode_file( + input_file=raw_enclosure_path, + maybe_output_file=transcoded_episode_path, + ) + if not raw_enclosure_transcoded: + transcoded_episode_path = raw_enclosure_path + + del raw_enclosure_path + + 
gcs_transcoded_episodes = GCSStore(bucket_config=PodcastGCTranscodedEpisodesBucketConfig()) + gcs_transcoded_episodes.upload_object(local_file_path=transcoded_episode_path, object_id=str(stories_id)) + + # (Re)read the properties of either the original or the transcoded file + media_info = media_file_info(media_file_path=transcoded_episode_path) + best_audio_stream = media_info.best_supported_audio_stream() + + if not best_audio_stream.audio_codec_class: + raise HardException("Best audio stream doesn't have audio class set") + + return best_audio_stream + + async def submit_transcribe_operation(self, + stories_id: int, + episode_metadata: MediaFileInfoAudioStream, + bcp47_language_code: str) -> str: + + if not episode_metadata.audio_codec_class: + raise HardException("Best audio stream doesn't have audio class set") + + gcs_transcoded_episodes = GCSStore(bucket_config=PodcastGCTranscodedEpisodesBucketConfig()) + gs_uri = gcs_transcoded_episodes.object_uri(object_id=str(stories_id)) + + speech_operation_id = submit_transcribe_operation( + gs_uri=gs_uri, + episode_metadata=episode_metadata, + bcp47_language_code=bcp47_language_code, + ) + + return speech_operation_id + class PodcastTranscribeWorkflow(AbstractPodcastTranscribeWorkflow): """Workflow implementation.""" @@ -101,7 +182,10 @@ async def transcribe_episode(self, stories_id: int) -> None: episode_metadata = await self.activities.fetch_transcode_store_episode(stories_id=stories_id) - # FIXME we probably want to test the metadata here, e.g. whether it's set at all or if the duration is right + if episode_metadata.duration > MAX_DURATION: + # FIXME log that the episode duration exceeded the maximum allowed duration + # f"Story's {stories_id} podcast episode is too long ({episode_metadata.duration} seconds)." 
+ return speech_operation_id = await self.activities.submit_transcribe_operation( stories_id=stories_id, diff --git a/apps/podcast-transcribe-episode/src/requirements.txt b/apps/podcast-transcribe-episode/src/requirements.txt index 80fc71c165..81263db2d0 100644 --- a/apps/podcast-transcribe-episode/src/requirements.txt +++ b/apps/podcast-transcribe-episode/src/requirements.txt @@ -1,3 +1,3 @@ ffmpeg-python==0.2.0 -google-cloud-speech==2.2.1 +google-cloud-speech==2.3.0 google-cloud-storage==1.37.1 diff --git a/apps/podcast-transcribe-episode/tests/python/test_gcs_store.py b/apps/podcast-transcribe-episode/tests/python/test_gcs_store.py index 71aa095811..be2b5e75df 100644 --- a/apps/podcast-transcribe-episode/tests/python/test_gcs_store.py +++ b/apps/podcast-transcribe-episode/tests/python/test_gcs_store.py @@ -15,7 +15,6 @@ class TestGCSStore(TestCase): def test_remote_path(self): - # Empty object ID with pytest.raises(McPodcastMisconfiguredGCSException): GCSStore._remote_path(path_prefix='', object_id='') @@ -35,33 +34,6 @@ def test_remote_path(self): assert GCSStore._remote_path(path_prefix='//', object_id='//a///b//../b/c') == 'a/b/c' - def test_object_uri(self): - gcs = GCSStore() - - # Empty object ID - with pytest.raises(McPodcastMisconfiguredGCSException): - gcs.object_uri(object_id='') - - class NoPathPrefixConfig(PodcastFetchEpisodeConfig): - - @staticmethod - def gc_storage_path_prefix() -> str: - return '' - - config = NoPathPrefixConfig() - gcs = GCSStore(config=config) - assert gcs.object_uri(object_id='a') == f'gs://{config.gc_storage_bucket_name()}/a' - - class MultiPathPrefixConfig(PodcastFetchEpisodeConfig): - - @staticmethod - def gc_storage_path_prefix() -> str: - return '//foo/bar//' - - config = MultiPathPrefixConfig() - gcs = GCSStore(config=config) - assert gcs.object_uri(object_id='a') == f'gs://{config.gc_storage_bucket_name()}/foo/bar/a' - def test_store_exists_delete(self): config = RandomPathPrefixConfig() gcs = GCSStore(config=config) @@ 
-74,11 +46,11 @@ def test_store_exists_delete(self): with open(temp_file, mode='wb') as f: f.write(mock_data) - gcs.store_object(local_file_path=temp_file, object_id=object_id) + gcs.upload_object(local_file_path=temp_file, object_id=object_id) assert gcs.object_exists(object_id=object_id) is True # Try storing twice - gcs.store_object(local_file_path=temp_file, object_id=object_id) + gcs.upload_object(local_file_path=temp_file, object_id=object_id) assert gcs.object_exists(object_id=object_id) is True gcs.delete_object(object_id=object_id) From 0a317e4ce9f356a0902c52191aff57199774d8d1 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 22 Apr 2021 21:03:21 +0300 Subject: [PATCH 058/175] Make connect_to_db() retries configurable --- .../src/python/mediawords/db/__init__.py | 40 +++++-- .../python/mediawords/util/config/common.py | 109 +++++++++++++----- 2 files changed, 111 insertions(+), 38 deletions(-) diff --git a/apps/common/src/python/mediawords/db/__init__.py b/apps/common/src/python/mediawords/db/__init__.py index f03065ed5a..1a29fbfe4e 100644 --- a/apps/common/src/python/mediawords/db/__init__.py +++ b/apps/common/src/python/mediawords/db/__init__.py @@ -1,18 +1,31 @@ import time +from typing import Optional from mediawords.db.handler import DatabaseHandler -from mediawords.util.config.common import CommonConfig +from mediawords.util.config.common import CommonConfig, DatabaseConfig from mediawords.util.log import create_logger -from mediawords.util.perl import decode_object_from_bytes_if_needed from mediawords.util.process import fatal_error log = create_logger(__name__) -def connect_to_db() -> DatabaseHandler: - """Connect to PostgreSQL.""" +class McConnectToDBError(Exception): + """Exception that gets raised if connect_to_db() runs out of retries and + db_config.retries.fatal_error_on_failure is set to False.""" + pass + + +def connect_to_db(db_config: Optional[DatabaseConfig] = None) -> DatabaseHandler: + """ + Connect to PostgreSQL (via 
PgBouncer). + + :param db_config: Optional DatabaseConfig parameter to specify connection retry parameters. + :return: DatabaseHandler object. + """ + + if not db_config: + db_config = CommonConfig.database() - db_config = CommonConfig.database() retries_config = db_config.retries() assert retries_config.max_attempts() > 0, "max_tries can't be negative." @@ -57,12 +70,15 @@ def connect_to_db() -> DatabaseHandler: else: log.info("Out of retries, giving up and exiting...") - # Don't throw any exceptions because they might be caught by - # the try-catch block, and so the caller will just assume that - # there was something wrong with the input data and proceed - # with processing next item in the job queue (e.g. the next - # story). Instead, just quit and wait for someone to restart - # the whole app that requires database access. - fatal_error(error_message) + if retries_config.fatal_error_on_failure(): + # Don't throw any exceptions because they might be caught by + # the try-catch block, and so the caller will just assume that + # there was something wrong with the input data and proceed + # with processing next item in the job queue (e.g. the next + # story). Instead, just quit and wait for someone to restart + # the whole app that requires database access. 
+ fatal_error(error_message) + else: + raise McConnectToDBError(error_message) return db diff --git a/apps/common/src/python/mediawords/util/config/common.py b/apps/common/src/python/mediawords/util/config/common.py index 114514a52c..9293f6a4cd 100644 --- a/apps/common/src/python/mediawords/util/config/common.py +++ b/apps/common/src/python/mediawords/util/config/common.py @@ -4,6 +4,7 @@ from mediawords.util.config import env_value, McConfigException from mediawords.util.parse_json import decode_json, McDecodeJSONException +from mediawords.util.perl import decode_object_from_bytes_if_needed from mediawords.util.log import create_logger log = create_logger(__name__) @@ -12,54 +13,110 @@ class ConnectRetriesConfig(object): """Connect retries configuration.""" - @staticmethod - def sleep_between_attempts() -> float: + __slots__ = [ + '__sleep_between_attempts', + '__max_attempts', + '__fatal_error_on_failure', + ] + + def __init__(self, + sleep_between_attempts: float = 1.0, + max_attempts: int = 60, + fatal_error_on_failure: bool = True): + + if isinstance(sleep_between_attempts, bytes): + sleep_between_attempts = decode_object_from_bytes_if_needed(sleep_between_attempts) + if isinstance(max_attempts, bytes): + max_attempts = decode_object_from_bytes_if_needed(max_attempts) + if isinstance(fatal_error_on_failure, bytes): + fatal_error_on_failure = decode_object_from_bytes_if_needed(fatal_error_on_failure) + + self.__sleep_between_attempts = float(sleep_between_attempts) + self.__max_attempts = int(max_attempts) + self.__fatal_error_on_failure = bool(fatal_error_on_failure) + + def sleep_between_attempts(self) -> float: """Seconds (or parts of second) to sleep between retries.""" - return 1.0 + return self.__sleep_between_attempts - @staticmethod - def max_attempts() -> int: + def max_attempts(self) -> int: """Max. number of attempts to connect. Must be positive (we want to try connecting at least one time). 
""" - return 60 + return self.__max_attempts + + def fatal_error_on_failure(self) -> bool: + """ + Return True if connect_to_db() should call fatal_error() and thus stop the whole process when giving up. + + True is a useful value in production when you might want the process that's unable to connect to the database to + just die. However, you might choose to return False here too if the caller is prepared to handle connection + failures more gracefully (e.g. Temporal's retries). + """ + return self.__fatal_error_on_failure class DatabaseConfig(object): """PostgreSQL database configuration.""" - @staticmethod - def hostname() -> str: + __slots__ = [ + '__hostname', + '__port', + '__database_name', + '__username', + '__password', + '__retries', + ] + + def __init__(self, + hostname: str = 'postgresql-pgbouncer', + port: int = 6432, + database_name: str = 'mediacloud', + username: str = 'mediacloud', + password: str = 'mediacloud', + retries: Optional[ConnectRetriesConfig] = None): + if not retries: + retries = ConnectRetriesConfig() + + if isinstance(port, bytes): + port = decode_object_from_bytes_if_needed(port) + + hostname = decode_object_from_bytes_if_needed(hostname) + database_name = decode_object_from_bytes_if_needed(database_name) + username = decode_object_from_bytes_if_needed(username) + password = decode_object_from_bytes_if_needed(password) + + self.__hostname = hostname + self.__port = int(port) + self.__database_name = database_name + self.__username = username + self.__password = password + self.__retries = retries + + def hostname(self) -> str: """Hostname.""" - # Container's name from docker-compose.yml - return "postgresql-pgbouncer" + return self.__hostname - @staticmethod - def port() -> int: + def port(self) -> int: """Port.""" - # Container's exposed port from docker-compose.yml - return 6432 + return self.__port - @staticmethod - def database_name() -> str: + def database_name(self) -> str: """Database name.""" - return "mediacloud" + return 
self.__database_name - @staticmethod - def username() -> str: + def username(self) -> str: """Username.""" - return "mediacloud" + return self.__username - @staticmethod - def password() -> str: + def password(self) -> str: """Password.""" - return "mediacloud" + return self.__password - @staticmethod - def retries() -> ConnectRetriesConfig: + def retries(self) -> ConnectRetriesConfig: """connect_to_db() retries configuration.""" - return ConnectRetriesConfig() + return self.__retries class AmazonS3DownloadsConfig(object): From 118d32d5d5d2f9e392bf28f99a206f48f5b2442a Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 22 Apr 2021 21:45:15 +0300 Subject: [PATCH 059/175] Make RabbitMQ (Celery) retries configurable --- .../src/python/mediawords/job/__init__.py | 21 ++- .../python/mediawords/util/config/common.py | 120 ++++++++++++++---- 2 files changed, 116 insertions(+), 25 deletions(-) diff --git a/apps/common/src/python/mediawords/job/__init__.py b/apps/common/src/python/mediawords/job/__init__.py index 7a110d32e7..428599bc03 100644 --- a/apps/common/src/python/mediawords/job/__init__.py +++ b/apps/common/src/python/mediawords/job/__init__.py @@ -10,7 +10,7 @@ from mediawords.db import connect_to_db, DatabaseHandler from mediawords.db.locks import get_session_lock, release_session_lock from mediawords.job.states import STATE_QUEUED, STATE_RUNNING, STATE_COMPLETED, STATE_ERROR -from mediawords.util.config.common import CommonConfig +from mediawords.util.config.common import CommonConfig, RabbitMQConfig from mediawords.util.log import create_logger from mediawords.util.parse_json import encode_json, decode_json from mediawords.util.perl import decode_object_from_bytes_if_needed @@ -382,7 +382,7 @@ class JobBroker(object): '__queue_name', ] - def __init__(self, queue_name: str): + def __init__(self, queue_name: str, rabbitmq_config: Optional[RabbitMQConfig] = None): """ Create job broker object. 
@@ -397,7 +397,9 @@ def __init__(self, queue_name: str): config = CommonConfig() - rabbitmq_config = config.rabbitmq() + if not rabbitmq_config: + rabbitmq_config = config.rabbitmq() + broker_uri = 'amqp://{username}:{password}@{hostname}:{port}/{vhost}'.format( username=rabbitmq_config.username(), password=rabbitmq_config.password(), @@ -440,6 +442,19 @@ def __init__(self, queue_name: str): self.__app.conf.worker_max_tasks_per_child = 1000 + retries_config = rabbitmq_config.retries() + if retries_config: + self.__app.task_publish_retry = True + self.__app.task_publish_retry_policy = { + 'max_retries': retries_config.max_retries(), + 'interval_start': retries_config.interval_start(), + 'interval_step': retries_config.interval_step(), + 'interval_max': retries_config.interval_max(), + } + + else: + self.__app.task_publish_retry = False + queue = Queue( name=queue_name, exchange=Exchange(queue_name), diff --git a/apps/common/src/python/mediawords/util/config/common.py b/apps/common/src/python/mediawords/util/config/common.py index 9293f6a4cd..c76181cbb5 100644 --- a/apps/common/src/python/mediawords/util/config/common.py +++ b/apps/common/src/python/mediawords/util/config/common.py @@ -1,6 +1,6 @@ import collections import re -from typing import List, Pattern, Optional +from typing import List, Pattern, Optional, Union from mediawords.util.config import env_value, McConfigException from mediawords.util.parse_json import decode_json, McDecodeJSONException @@ -143,41 +143,117 @@ def directory_name() -> str: return env_value('MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME', allow_empty_string=True) +class RabbitMQRetriesConfig(object): + """ + RabbitMQ retries configuration. 
+ + https://docs.celeryproject.org/en/v4.4.7/userguide/calling.html#calling-retry + """ + + __slots__ = [ + '__max_retries', + '__interval_start', + '__interval_step', + '__interval_max', + ] + + def __init__(self, + max_retries: Optional[int] = 3, + interval_start: Union[int, float] = 0, + interval_step: Union[int, float] = 0.2, + interval_max: Union[int, float] = 0.2): + if isinstance(max_retries, bytes): + max_retries = decode_object_from_bytes_if_needed(max_retries) + if isinstance(interval_start, bytes): + interval_start = decode_object_from_bytes_if_needed(interval_start) + if isinstance(interval_step, bytes): + interval_step = decode_object_from_bytes_if_needed(interval_step) + if isinstance(interval_max, bytes): + interval_max = decode_object_from_bytes_if_needed(interval_max) + + self.__max_retries = None if max_retries is None else int(max_retries) # We want to preserve None here + self.__interval_start = float(interval_start) + self.__interval_step = float(interval_step) + self.__interval_max = float(interval_max) + + def max_retries(self) -> Optional[int]: + return self.__max_retries + + def interval_start(self) -> float: + return self.__interval_start + + def interval_step(self) -> float: + return self.__interval_step + + def interval_max(self) -> float: + return self.__interval_max + + class RabbitMQConfig(object): """RabbitMQ (Celery broker) client configuration.""" - @staticmethod - def hostname() -> str: + __slots__ = [ + '__hostname', + '__port', + '__username', + '__password', + '__vhost', + '__timeout', + '__retries', + ] + + def __init__(self, + hostname: str = 'rabbitmq-server', + port: int = 5672, + username: str = 'mediacloud', + password: str = 'mediacloud', + vhost: str = '/mediacloud', + timeout: int = 60, + retries: Optional[RabbitMQRetriesConfig] = None): + hostname = decode_object_from_bytes_if_needed(hostname) + if isinstance(port, bytes): + port = decode_object_from_bytes_if_needed(port) + username = 
decode_object_from_bytes_if_needed(username) + password = decode_object_from_bytes_if_needed(password) + vhost = decode_object_from_bytes_if_needed(vhost) + if isinstance(timeout, bytes): + timeout = decode_object_from_bytes_if_needed(timeout) + + self.__hostname = hostname + self.__port = int(port) + self.__username = username + self.__password = password + self.__vhost = vhost + self.__timeout = int(timeout) + self.__retries = retries + + def hostname(self) -> str: """Hostname.""" - # Container's name from docker-compose.yml - return "rabbitmq-server" + return self.__hostname - @staticmethod - def port() -> int: + def port(self) -> int: """Port.""" - # Container's exposed port from docker-compose.yml - return 5672 + return self.__port - @staticmethod - def username() -> str: + def username(self) -> str: """Username.""" - return "mediacloud" + return self.__username - @staticmethod - def password() -> str: + def password(self) -> str: """Password.""" - return "mediacloud" + return self.__password - @staticmethod - def vhost() -> str: + def vhost(self) -> str: """Virtual host.""" - return "/mediacloud" + return self.__vhost - @staticmethod - def timeout() -> int: + def timeout(self) -> int: """Timeout.""" - # FIXME possibly hardcode it somewhere - return 60 + return self.__timeout + + def retries(self) -> Optional[RabbitMQRetriesConfig]: + """Retry policy; if None, retries are disabled.""" + return self.__retries class SMTPConfig(object): From 0959427bd0ec3c49929f87af5195d52246b60ac8 Mon Sep 17 00:00:00 2001 From: Linas Valiukas Date: Thu, 22 Apr 2021 22:11:43 +0300 Subject: [PATCH 060/175] Finish implementing workflow's actions --- .../inspectionProfiles/Project_Default.xml | 1 + .../podcast_transcribe_episode/config.py | 10 ++ .../fetch_episode/transcript.py | 7 ++ .../fetch_transcript/__init__.py | 0 .../fetch_transcript/fetch_store.py | 115 ------------------ .../fetch_transcript/handler.py | 44 ------- .../podcast_transcribe_episode/shared.py | 61 ++++++++-- 
.../podcast_transcribe_episode/workflow.py | 114 +++++++++++++++-- 8 files changed, 169 insertions(+), 183 deletions(-) delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/__init__.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/fetch_store.py delete mode 100644 apps/podcast-transcribe-episode/src/python/podcast_transcribe_episode/fetch_transcript/handler.py diff --git a/apps/common/.idea/inspectionProfiles/Project_Default.xml b/apps/common/.idea/inspectionProfiles/Project_Default.xml index 76ebfe820e..d3d52a9b48 100644 --- a/apps/common/.idea/inspectionProfiles/Project_Default.xml +++ b/apps/common/.idea/inspectionProfiles/Project_Default.xml @@ -1,6 +1,7 @@