Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Python mine #753

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
ceb6776
first commit toward python topic spider
hroberts Oct 15, 2020
6f9f38c
progress on python spider migration
hroberts Oct 19, 2020
27311e2
more progress on mine.py, including some working unit tests
hroberts Dec 2, 2020
f40d4cc
more mine.py unit tests
hroberts Dec 4, 2020
67a6246
add add_new_links test and fix fetch_links test
hroberts Dec 4, 2020
7c6a943
more mine.py unit tests
hroberts Dec 4, 2020
58e6dde
more mine.py unit tests
hroberts Dec 8, 2020
2deaec8
add unit test for import_solr_seed_query_montnh
hroberts Dec 14, 2020
502f538
add unit test for test_import_solr_seed_query_month
hroberts Dec 19, 2020
9840044
add unit test for import_solr_seed_query
hroberts Dec 21, 2020
febe00c
add unit test for fetch_social_media_data
hroberts Dec 21, 2020
8d4a69a
add unit test for check_job_error_rate
hroberts Dec 21, 2020
e641d04
update mine.py to remove obsolete topic mode code
hroberts Dec 22, 2020
cb1fd32
add test for import_seed_urls_from_seed_queries
hroberts Dec 26, 2020
0a31553
add test for respidering
hroberts Dec 26, 2020
321e315
start migrating test_tm_mine.t to python
hroberts Dec 27, 2020
99771a1
add working test_mine.py integration test
hroberts Dec 28, 2020
99a7502
merge in Mine.pm updates and clean out perl files
hroberts Dec 29, 2020
6862d3b
migrate topics-mine-public to python and add typehints to mine.py
hroberts Dec 29, 2020
d26314c
add missing bin directory
hroberts Dec 29, 2020
d8be60c
Merge branch 'master' into python_mine
hroberts Dec 29, 2020
8e53b99
Merge branch 'master' into python_mine
pypt Feb 9, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions apps/common/src/python/mediawords/util/mail.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# Environment variable that, when set, will prevent the package from actually sending the email
__ENV_MAIL_DO_NO_SEND = 'MEDIACLOUD_MAIL_DO_NOT_SEND'

# Messages passed to send_email() while test mode is enabled, kept so that tests can validate what would have been sent
_sent_test_messages = []

class McSendEmailException(Exception):
"""send_email() exception."""
Expand All @@ -27,6 +29,10 @@ def disable_test_mode():
del os.environ[__ENV_MAIL_DO_NO_SEND]


def sent_test_messages():
    """Return the module-level list of messages captured by send_email() in test mode.

    The list object itself is returned, not a copy, so callers that mutate it (e.g. with pop()) change
    what later callers see.
    """
    captured = _sent_test_messages
    return captured


def test_mode_is_enabled() -> bool:
    """Tell whether the "do not send" environment variable is currently set (to any value)."""
    return os.environ.get(__ENV_MAIL_DO_NO_SEND) is not None

Expand Down Expand Up @@ -123,6 +129,7 @@ def send_email(message: Message) -> bool:
mime_message.attach(message_part)

if test_mode_is_enabled():
_sent_test_messages.append(message)
log.info("Test mode is enabled, not actually sending any email.")
log.debug("Omitted email:\n\n%s" % mime_message.as_string())

Expand Down
2 changes: 1 addition & 1 deletion apps/common/src/python/mediawords/util/url/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def normalize_url(url: str) -> str:
url = fix_common_url_mistakes(url)

try:
url = canonical_url(url)
url = canonical_url(url)
except Exception as ex:
raise McNormalizeURLException("Unable to get canonical URL: %s" % str(ex))

Expand Down
5 changes: 5 additions & 0 deletions apps/common/tests/python/mediawords/util/test_mail.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
Message,
send_email,
send_text_email,
sent_test_messages,
enable_test_mode as enable_mail_test_mode,
disable_test_mode as disable_mail_test_mode,
)
Expand All @@ -29,6 +30,10 @@ def test_send_mail(self):
)
assert send_email(message)

sent_message = sent_test_messages().pop()

assert sent_message == message

def test_send_text_email(self):
assert send_text_email(
to='[email protected]',
Expand Down
6 changes: 6 additions & 0 deletions apps/tools/bin/dev/jumpstart_perl_to_python.pl
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@ sub main
# eq -> ==
$code =~ s/ eq / == /g;

# undef to None
$code =~ s/undef/None/g;

# add parens to common db methods
$code =~ s/(hash(es)?|flat)$/$1()/;

print $code;
}

Expand Down
33 changes: 33 additions & 0 deletions apps/topics-base/src/python/topics_base/alert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from mediawords.util.log import create_logger
log = create_logger(__name__)

import mediawords.util.mail
import topics_base.config
import topics_base.messages

def send_topic_alert(db, topic, message):
    """Send an alert about significant activity on the topic to all users with at least write access.

    Recipients are the users holding 'admin' or 'write' permission on the topic plus the addresses
    returned by TopicsBaseConfig.topic_alert_emails().  Each distinct address gets exactly one email.

    Arguments:
    db - database handle used to look up the permitted users
    topic - topic dict; 'topics_id' and 'name' are read
    message - alert text, passed as the topic_spider_status of every email

    Returns:
    None
    """
    emails = db.query(
        """
        select distinct au.email
            from auth_users au
                join topic_permissions tp using (auth_users_id)
            where
                tp.permission in ('admin', 'write') and
                tp.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).flat()

    emails.extend(topics_base.config.TopicsBaseConfig.topic_alert_emails())

    # interpolate the id; a plain string literal would put the text "topic['topics_id']" into the link
    topic_url = f"https://topics.mediacloud.org/#/topics/{topic['topics_id']}/summary"

    # de-duplicate so that a permitted user who is also a configured alert address gets a single email
    for email in set(emails):
        # use a separate name here: rebinding 'message' would make every recipient after the first
        # receive the previous Message object as the status instead of the alert text
        alert_message = topics_base.messages.TopicSpiderUpdateMessage(
            to=email,
            topic_name=topic['name'],
            topic_url=topic_url,
            topic_spider_status=message,
        )
        mediawords.util.mail.send_email(alert_message)
53 changes: 53 additions & 0 deletions apps/topics-base/tests/python/test_alert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import hashlib

from mediawords.db import connect_to_db
import mediawords.test.db.create
import mediawords.util.mail
import topics_base.alert
from topics_base.config import TopicsBaseConfig

from mediawords.util.log import create_logger

log = create_logger(__name__)

def _create_permission(db, topic, permission):
    """Create a test user named after the permission and grant it that permission on the topic.

    Arguments:
    db - database handle
    topic - topic dict; 'topics_id' is read
    permission - permission name, used both for the topic_permissions row and for the user's email

    Returns:
    whatever db.create() returned for the auth_users row
    """
    user = db.create(
        'auth_users',
        {
            'email': f'{permission}@bar.com',
            'password_hash': 'x' * 137,
            'full_name': 'foo bar',
        },
    )

    db.create(
        'topic_permissions',
        {
            'topics_id': topic['topics_id'],
            'auth_users_id': user['auth_users_id'],
            'permission': permission,
        },
    )

    return user


def test_topic_alert():
    """Check that send_topic_alert() mails the admin and write users plus the configured alert addresses."""
    db = mediawords.db.connect_to_db()
    topic = mediawords.test.db.create.create_test_topic(db, 'test')

    # one user per permission level, created in the order admin, read, write
    users = {level: _create_permission(db, topic, level) for level in ('admin', 'read', 'write')}

    # capture outgoing mail instead of sending it
    mediawords.util.mail.enable_test_mode()

    topics_base.alert.send_topic_alert(db, topic, 'foobarbat')

    captured = mediawords.util.mail.sent_test_messages()
    recipients = [mail.to[0] for mail in captured]

    # the read-only user must not be alerted
    permitted = [users['admin']['email'], users['write']['email']]
    expected = permitted + TopicsBaseConfig.topic_alert_emails()

    assert len(captured) == len(expected)

    assert set(recipients) == set(expected)


Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ def _get_youtube_embed_links(db: DatabaseHandler, story: dict) -> List[str]:
"select * from downloads where stories_id = %(a)s order by stories_id limit 1",
{'a': story['stories_id']}).hash()

if not download:
return []

html = fetch_content(db, download)

soup = BeautifulSoup(html, 'lxml')
Expand Down
2 changes: 1 addition & 1 deletion apps/topics-mine-public/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ COPY bin /opt/mediacloud/bin

USER mediacloud

CMD ["topics_mine_public_worker.pl"]
CMD ["topics_mine_public_worker.py"]
20 changes: 0 additions & 20 deletions apps/topics-mine-public/bin/topics_mine_public_worker.pl

This file was deleted.

13 changes: 13 additions & 0 deletions apps/topics-mine-public/bin/topics_mine_public_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

from mediawords.job import JobBroker
from mediawords.util.log import create_logger
from topics_mine.mine import run_worker_job

# Logger for this worker script
log = create_logger(__name__)

# Name of the job queue that the broker is started on
QUEUE_NAME = 'MediaWords::Job::TM::MineTopicPublic'

if __name__ == '__main__':
    # start a worker on the queue with topics_mine's run_worker_job as the handler
    JobBroker(queue_name=QUEUE_NAME).start_worker(handler=run_worker_job)
15 changes: 7 additions & 8 deletions apps/topics-mine/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,22 @@ RUN \
#
true

# Install Perl dependencies
COPY src/cpanfile /var/tmp/
# Install Python dependencies
COPY src/requirements.txt /var/tmp/
RUN \
cd /var/tmp/ && \
cpm install --global --resolver 02packages --no-prebuilt --mirror "$MC_PERL_CPAN_MIRROR" && \
rm cpanfile && \
rm -rf /root/.perl-cpm/ && \
pip3 install -r requirements.txt && \
rm requirements.txt && \
rm -rf /root/.cache/ && \
true

# Copy sources
COPY src/ /opt/mediacloud/src/topics-mine/
ENV PERL5LIB="/opt/mediacloud/src/topics-mine/perl:${PERL5LIB}" \
PYTHONPATH="/opt/mediacloud/src/topics-mine/python:${PYTHONPATH}"
ENV PYTHONPATH="/opt/mediacloud/src/topics-mine/python:${PYTHONPATH}"

# Copy worker script
COPY bin /opt/mediacloud/bin

USER mediacloud

CMD ["topics_mine_worker.pl"]
CMD ["topics_mine_worker.py"]
69 changes: 0 additions & 69 deletions apps/topics-mine/bin/mine_topic.pl

This file was deleted.

36 changes: 36 additions & 0 deletions apps/topics-mine/bin/mine_topic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python3

import argparse

from mediawords.db import connect_to_db
from topics_mine.mine import mine_topic

def _parse_bool(value):
    """Convert a command line string such as 'true', '0' or 'no' into a bool.

    Raises argparse.ArgumentTypeError for strings that are not a recognized boolean spelling.
    """
    normalized = value.strip().lower()
    if normalized in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if normalized in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got '%s'" % value)


def _parse_args(argv=None):
    """Parse the mine_topic command line (sys.argv when argv is None) and return the argparse namespace."""
    parser = argparse.ArgumentParser(description="Run topics_mine job.")
    parser.add_argument("-t", "--topics_id", type=int, required=True)
    parser.add_argument("-s", "--snapshots_id", type=int, required=False)

    # type=bool would turn every non-empty string, including 'False', into True.  Accept each option
    # either as a bare flag (-r) or with an explicit value (-r true / -r false).
    boolean_options = (
        ("-r", "--resume_snapshot"),
        ("-i", "--import_only"),
        ("-p", "--skip_post_processing"),
    )
    for short_name, long_name in boolean_options:
        parser.add_argument(
            short_name, long_name, type=_parse_bool, nargs='?', const=True, default=False, required=False)

    return parser.parse_args(argv)


def main(argv=None):
    """Run mine_topic with cli args.

    Arguments:
    argv - list of command line arguments; None (the default) makes argparse read sys.argv

    Options:
    -t / --topics_id - id of the topic to mine (required)
    -s / --snapshots_id - snapshot id to pass to mine_topic
    -r / --resume_snapshot - use the topic's highest snapshots_id instead of --snapshots_id
    -i / --import_only - passed through to mine_topic
    -p / --skip_post_processing - passed through to mine_topic

    Raises:
    SystemExit if --resume_snapshot is given but the topic has no snapshots
    """
    args = _parse_args(argv)

    # connect before anything else: both the topic lookup and the snapshot lookup need the handle
    db = connect_to_db()

    topic = db.require_by_id('topics', args.topics_id)

    snapshots_id = args.snapshots_id
    if args.resume_snapshot:
        latest_snapshots_ids = db.query(
            "select snapshots_id from snapshots where topics_id = %(a)s order by snapshots_id desc limit 1",
            {'a': args.topics_id}).flat()

        # fail with a readable message rather than an IndexError when there is nothing to resume
        if not latest_snapshots_ids:
            raise SystemExit("topic %d has no snapshot to resume" % args.topics_id)

        snapshots_id = latest_snapshots_ids[0]

    mine_topic(
        db=db,
        topic=topic,
        snapshots_id=snapshots_id,
        import_only=args.import_only,
        skip_post_processing=args.skip_post_processing)


# only run when executed as a script, so that importing the module has no side effects
if __name__ == '__main__':
    main()
17 changes: 0 additions & 17 deletions apps/topics-mine/bin/topics_mine_worker.pl

This file was deleted.

13 changes: 13 additions & 0 deletions apps/topics-mine/bin/topics_mine_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

from mediawords.job import JobBroker
from mediawords.util.log import create_logger
from topics_mine.mine import run_worker_job

# Logger for this worker script
log = create_logger(__name__)

# Name of the job queue that the broker is started on
QUEUE_NAME = 'MediaWords::Job::TM::MineTopic'

if __name__ == '__main__':
    # start a worker on the queue with topics_mine's run_worker_job as the handler
    JobBroker(queue_name=QUEUE_NAME).start_worker(handler=run_worker_job)
Loading