diff --git a/install/install_python_dependencies.sh b/install/install_python_dependencies.sh index 991773ad63..ab73742f26 100755 --- a/install/install_python_dependencies.sh +++ b/install/install_python_dependencies.sh @@ -41,8 +41,29 @@ echo "Installing (upgrading) Supervisor..." ( cd /tmp; $COMMAND_PREFIX pip2.7 install --upgrade supervisor ) echo "Installing (upgrading) Virtualenv..." -$COMMAND_PREFIX pip2.7 install --force-reinstall --upgrade virtualenv -$COMMAND_PREFIX pip$PYTHON3_MAJOR_VERSION install --force-reinstall --upgrade virtualenv +$COMMAND_PREFIX pip2.7 install --upgrade virtualenv +$COMMAND_PREFIX pip$PYTHON3_MAJOR_VERSION install --upgrade virtualenv + +# Install system-wide NLTK because otherwise sudo is unable to find +# NLTK installed in virtualenv on Travis + +echo "Installing (upgrading) NLTK to install NLTK's data afterwards..." +$COMMAND_PREFIX pip$PYTHON3_MAJOR_VERSION install --upgrade nltk + +# Installing WordNet with NLTK +# (installing from own mirror on S3 to avoid hitting GitHub: https://github.com/nltk/nltk/issues/1787) +echo "Installing NLTK WordNet data..." +if [ `uname` == 'Darwin' ]; then + NLTK_DATA_PATH=/usr/local/share/nltk_data +else + NLTK_DATA_PATH=/usr/share/nltk_data +fi + +$COMMAND_PREFIX python$PYTHON3_MAJOR_VERSION \ + -m nltk.downloader \ + -u https://s3.amazonaws.com/mediacloud-nltk-data/nltk_data/index.xml \ + -d "$NLTK_DATA_PATH" \ + wordnet punkt echo "Creating mc-venv virtualenv..." echo "$(which python$PYTHON3_MAJOR_VERSION)" @@ -69,3 +90,6 @@ pip$PYTHON3_MAJOR_VERSION install --upgrade -r mediacloud/requirements.txt || { echo "'pip$PYTHON3_MAJOR_VERSION install' failed the first time, retrying..." pip$PYTHON3_MAJOR_VERSION install --upgrade -r mediacloud/requirements.txt } + + + diff --git a/lib/MediaWords/Job/AnnotateWithCoreNLP.pm b/lib/MediaWords/Job/AnnotateWithCoreNLP.pm index a72bf4defe..eea3f0cec7 100644 --- a/lib/MediaWords/Job/AnnotateWithCoreNLP.pm +++ b/lib/MediaWords/Job/AnnotateWithCoreNLP.pm @@ -31,12 +31,20 @@ use MediaWords::Util::CoreNLP; use MediaWords::DBI::Stories; use Readonly; +# Having a global database object should be safe because +# job workers don't fork() +my $db = undef; + # Run CoreNLP job sub run($;$) { my ( $self, $args ) = @_; - my $db = MediaWords::DB::connect_to_db(); + unless ( $db ) + { + # Postpone connecting to the database so that compile test doesn't do that + $db = MediaWords::DB::connect_to_db(); + } my $stories_id = $args->{ stories_id } + 0; unless ( $stories_id ) diff --git a/lib/MediaWords/Job/Bitly/FetchStoryStats.pm b/lib/MediaWords/Job/Bitly/FetchStoryStats.pm index 796cf06f11..013b199140 100644 --- a/lib/MediaWords/Job/Bitly/FetchStoryStats.pm +++ b/lib/MediaWords/Job/Bitly/FetchStoryStats.pm @@ -39,12 +39,17 @@ Readonly my $BITLY_RATE_LIMIT_SECONDS_TO_WAIT => 60 * 10; # every 10 minutes # How many times to try on rate limiting errors Readonly my $BITLY_RATE_LIMIT_TRIES => 7; # try fetching 7 times in total (70 minutes) +# Having a global database object should be safe because +# job workers don't fork() +my $db = undef; + # Run job sub run($;$) { my ( $self, $args ) = @_; - my $db = MediaWords::DB::connect_to_db(); + # Postpone connecting to the database so that compile test doesn't do that + $db ||= MediaWords::DB::connect_to_db(); my $stories_id = $args->{ stories_id } or die "'stories_id' is not set."; my $start_timestamp = $args->{ start_timestamp }; diff --git a/lib/MediaWords/Job/Facebook/FetchStoryStats.pm b/lib/MediaWords/Job/Facebook/FetchStoryStats.pm index ad55115210..44d8535d82 100644 --- a/lib/MediaWords/Job/Facebook/FetchStoryStats.pm +++ b/lib/MediaWords/Job/Facebook/FetchStoryStats.pm @@ -32,6 +32,10 @@ use MediaWords::Util::Process; use Readonly; use Data::Dumper; +# Having a global database object should be safe because +# job workers don't fork() +my $db = undef; + # Run job sub run($;$) { @@ -43,7 +47,8 @@ sub run($;$) fatal_error( 'Facebook API processing is not enabled.' ); } - my $db = MediaWords::DB::connect_to_db(); + # Postpone connecting to the database so that compile test doesn't do that + $db ||= MediaWords::DB::connect_to_db(); my $stories_id = $args->{ stories_id } or die "'stories_id' is not set."; diff --git a/lib/MediaWords/Languages/resources/ja/mecab-ipadic-neologd-dist b/lib/MediaWords/Languages/resources/ja/mecab-ipadic-neologd-dist index 6916402188..e1e57b8d1d 160000 --- a/lib/MediaWords/Languages/resources/ja/mecab-ipadic-neologd-dist +++ b/lib/MediaWords/Languages/resources/ja/mecab-ipadic-neologd-dist @@ -1 +1 @@ -Subproject commit 691640218849beee1363620e864481c0c766b013 +Subproject commit e1e57b8d1d402981f6ed452adda30c473905d8c0 diff --git a/mediacloud/mediawords/db/handler.py b/mediacloud/mediawords/db/handler.py index 49da513959..735bf34a3c 100644 --- a/mediacloud/mediawords/db/handler.py +++ b/mediacloud/mediawords/db/handler.py @@ -239,8 +239,9 @@ def schema_is_up_to_date(self) -> bool: raise McSchemaIsUpToDateException("Current schema version is 0") # Target schema version - sql = open(mc_sql_schema_path(), 'r').read() - target_schema_version = schema_version_from_lines(sql) + sql = open(mc_sql_schema_path(), 'r') + target_schema_version = schema_version_from_lines(sql.read()) + sql.close() if not target_schema_version: raise McSchemaIsUpToDateException("Invalid target schema version.") diff --git a/mediacloud/mediawords/util/config.py b/mediacloud/mediawords/util/config.py index 61091c63ac..dcd534e75c 100644 --- a/mediacloud/mediawords/util/config.py +++ b/mediacloud/mediawords/util/config.py @@ -43,8 +43,9 @@ def __parse_yaml(config_file: str) -> dict: if not os.path.isfile(config_file): raise McConfigException("Configuration file '%s' was not found." % config_file) - yaml_file = open(config_file, 'r').read() - yaml_data = yaml.load(yaml_file, Loader=Loader) + yaml_file = open(config_file, 'r') + yaml_data = yaml.load(yaml_file.read(), Loader=Loader) + yaml_file.close() return yaml_data diff --git a/mediacloud/mediawords/util/topic_modeling/__init__.py b/mediacloud/mediawords/util/topic_modeling/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mediacloud/mediawords/util/topic_modeling/model_gensim.py b/mediacloud/mediawords/util/topic_modeling/model_gensim.py new file mode 100644 index 0000000000..e05cd0847a --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/model_gensim.py @@ -0,0 +1,101 @@ +import gensim + +# from mediawords.db import connect_to_db +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.util.topic_modeling.topic_model import BaseTopicModel +from mediawords.util.topic_modeling.token_pool import TokenPool +from typing import Dict, List + + +class ModelGensim(BaseTopicModel): + """Generate topics of each story based on the LDA model + ModelGensim operates on a single story at a time + by comparing the occurrence of each token in all sentences of that story. + It does not consider the rest of stories. The benefits of this approach include: + 1. Each story contains the word in the topics of that story + 2. There is a fixed number of topics for each story""" + + def __init__(self) -> None: + self._story_number = 0 + self._stories_ids = [] + self._stories_tokens = [] + self._dictionary = None + self._corpus = [] + self._WORD_SPLITTER = ' + ' + self._COEFFICIENT_SPLITTER = '*' + + def add_stories(self, stories: Dict[int, List[List[str]]]) -> None: + """ + Adding new stories into the model + :param stories: a dictionary of new stories + """ + for story in stories.items(): + story_id = story[0] + story_tokens = story[1] + self._stories_ids.append(story_id) + self._stories_tokens.append(story_tokens) + + self._story_number = len(self._stories_ids) + + def summarize_topic(self, topic_number: int = 1, + word_number: int = 4, passes: int = 100) -> Dict[int, list]: + """ + summarize the topic of each story based on the frequency of occurrence of each word + :return: a dictionary of story id + and corresponding list of TOPIC_NUMBER topics (each topic contains WORD_NUMBER words) + """ + + story_topic = {} + + for i in range(len(self._stories_ids)): + # turn our token documents into a id <-> term dictionary + self._dictionary = gensim.corpora.Dictionary(self._stories_tokens[i]) + + # convert token documents into a document-term matrix + self._corpus = [self._dictionary.doc2bow(text) for text in self._stories_tokens[i]] + + # generate LDA model + self._model = gensim.models.ldamodel.LdaModel( + corpus=self._corpus, num_topics=topic_number, + id2word=self._dictionary, passes=passes) + + raw_topics = self._model.print_topics(num_topics=topic_number, num_words=word_number) + + story_topic[self._stories_ids[i]] = self._format_topics(raw_topics=raw_topics) + + return story_topic + + def _format_topics(self, raw_topics: List[tuple]) -> List[List[str]]: + """ + Return topics in the desired format + :param raw_topics: un-formatted topics + :return: formatted topics + """ + formatted_topics = [] + for topic in raw_topics: + words_str = topic[1] + # change the format + # from 'COEFFICIENT1*"WORD1" + COEFFICIENT2*"WORD2" + COEFFICIENT3*"WORD3"' + # to [WORD1, WORD2, WORD3] + words = [word_str.split(self._COEFFICIENT_SPLITTER)[1][1:-1] + for word_str in words_str.split(self._WORD_SPLITTER)] + formatted_topics.append(words) + + return formatted_topics + + def evaluate(self): + pass + + +# A sample output +if __name__ == '__main__': + model = ModelGensim() + + # pool = TokenPool(connect_db()) + # model.add_stories(pool.output_tokens(1, 0)) + # model.add_stories(pool.output_tokens(5, 1)) + + pool = TokenPool(SampleHandler()) + model.add_stories(pool.output_tokens()) + + print(model.summarize_topic()) diff --git a/mediacloud/mediawords/util/topic_modeling/model_lda.py b/mediacloud/mediawords/util/topic_modeling/model_lda.py new file mode 100644 index 0000000000..4efdcbaa37 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/model_lda.py @@ -0,0 +1,285 @@ +import lda +import numpy as np +import logging + +# from mediawords.db import connect_to_db +from mediawords.util.topic_modeling.optimal_finder import OptimalFinder +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.util.topic_modeling.token_pool import TokenPool +from mediawords.util.topic_modeling.topic_model import BaseTopicModel +from gensim import corpora +from typing import Dict, List + + +class ModelLDA(BaseTopicModel): + """Generate topics of each story based on the LDA model + ModelLDA operates on all stories. + It groups the words that often occur together among all stories into a topic + and assign that each story with the topic that has the closest match. This means: + 1. We can only select the total number of topics among all stories + 2. The number of topics for each story is not fixed. Theoretically speaking, + some stories' topic words might not be the best match of the content of that story. + (i.e. some times we might find two stories have exactly the same topic) + 3. Since the topics are compared among all stories, + the difference between the topics are more significant than ModelGensim""" + + def __init__(self) -> None: + """Initialisations""" + super().__init__() + self._stories_ids = [] + self._stories_tokens = [] + self._vocab = [] + self._token_matrix = np.empty + self._stories_number = 0 + self._random_state = 1 + self._max_iteration = 10000 + logging.getLogger("lda").setLevel(logging.WARN) + + def add_stories(self, stories: Dict[int, List[List[str]]]) -> None: + """ + Adding new stories into the model + :param stories: a dictionary of new stories + """ + new_stories_tokens = [] + + for story in stories.items(): + story_id = story[0] + story_tokens = story[1] + self._stories_ids.append(story_id) + new_stories_tokens.append( + [tokens for sentence_tokens in story_tokens for tokens in sentence_tokens]) + + self._stories_tokens += new_stories_tokens + self._stories_number = len(self._stories_ids) + self._recompute_matrix(new_stories_tokens=new_stories_tokens) + + def _recompute_matrix(self, new_stories_tokens: list) -> None: + """ + Recompute the token matrix based on new tokens in new stories + :param new_stories_tokens: a list of new tokens + """ + dictionary = corpora.Dictionary(new_stories_tokens) + + self._vocab = list(dictionary.token2id.keys()) + + token_count = [] + for story_tokens in self._stories_tokens: + token_count.append([story_tokens.count(token) for token in self._vocab]) + + self._token_matrix = np.array(token_count) + + def summarize_topic(self, total_topic_num: int = 0, + topic_word_num: int = 4, + iteration_num: int = None) -> Dict[int, List[str]]: + """ + summarize the topic of each story based on the frequency of occurrence of each word + :return: a dictionary of story id + :rtype: list + and corresponding list of TOPIC_NUMBER topics (each topic contains WORD_NUMBER words) + """ + + iteration_num = iteration_num if iteration_num else self._max_iteration + + # logging.debug(msg="total_topic_num={}".format(total_topic_num)) + total_topic_num = total_topic_num if total_topic_num else self._stories_number + logging.debug(msg="total_topic_num={}".format(total_topic_num)) + + # turn our token documents into a id <-> term dictionary + self._model = lda.LDA(n_topics=total_topic_num, + n_iter=iteration_num, + random_state=self._random_state) + + self._model.fit_transform(self._token_matrix) + topic_word = self._model.topic_word_ + n_top_words = topic_word_num + + topic_words_list = [] + for i, topic_dist in enumerate(topic_word): + topic_words_list.append( + np.array(self._vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]) + + doc_topic = self._model.doc_topic_ + + story_topic = {} + + for i in range(self._stories_number): + story_topic[self._stories_ids[i]] = list(topic_words_list[doc_topic[i].argmax()]) + + return story_topic + + def evaluate(self, topic_num: int=None) -> List: + """ + Show the log likelihood for the current model + :param topic_num: total number of topics + :return: the log likelihood value + """ + if not topic_num: + topic_num = self._stories_number + + if not self._model: + logging.debug(msg="Model does not exist, " + "train a new one with topic_num = {}".format(topic_num)) + self._train(topic_num=topic_num) + + if self._model.n_topics != topic_num: + logging.debug(msg="model.n_topics({}) != desired topic_num ({})" + .format(self._model.n_topics, topic_num)) + self._train(topic_num=topic_num) + + return [self._model.n_topics, self._model.loglikelihood()] + + def _train(self, topic_num: int, word_num: int = 4, num_iteration: int = None) -> float: + """ + Avoid unnecessary trainings + :param topic_num: total number of topics + :param word_num: number of words for each topic + :param num_iteration: number of iteration for each time + :return: the final log likelihood value + """ + num_iteration = num_iteration if num_iteration \ + else self._max_iteration + + if (not self._model) or (self._model.n_topics != topic_num): + self.summarize_topic( + total_topic_num=topic_num, + topic_word_num=word_num, + iteration_num=num_iteration) + + return self._model.loglikelihood() + + def tune_with_polynomial(self, topic_word_num: int = 4, + score_dict: Dict[float, int] = None) -> int: + """Tune the model on total number of topics + until the optimal parameters are found""" + + logging.debug("pre preparation score_dict:{}".format(score_dict)) + + score_dict = self._prepare_sample_points( + topic_word_num=topic_word_num, score_dict=score_dict) + + logging.debug("post preparation score_dict:{}".format(score_dict)) + + maximum_topic_num = self._locate_max_point(score_dict=score_dict) + optimal_topic_num = score_dict.get(max(score_dict.keys())) + + return self._resolve_conflict(optimal=optimal_topic_num, + maximum=maximum_topic_num, + topic_word_num=topic_word_num, + score_dict=score_dict) + + def _prepare_sample_points(self, topic_word_num: int = 4, + score_dict: Dict[float, int]=None) -> Dict[float, int]: + """ + Prepare and store topic_num and corresponding likelihood value in a dictionary + so that they can be used to build polynomial model + :param topic_word_num: number of words for each topic + :param score_dict: A dictionary of likelihood scores : topic_num + :return: updated score_dict + """ + topic_num_samples = score_dict.values() if score_dict \ + else [1, int(self._stories_number * 0.5), self._stories_number] + + score_dict = score_dict if score_dict else {} + + logging.debug(topic_num_samples) + + for topic_num in iter(topic_num_samples): + if topic_num not in score_dict.values(): + likelihood = self._train(topic_num=topic_num, word_num=topic_word_num) + logging.debug(msg="Num = {}, lh={}".format(topic_num, likelihood)) + score_dict[likelihood] = topic_num + + return score_dict + + @staticmethod + def _locate_max_point(score_dict: Dict[float, int]=None): + """ + Use optimalFinder to identify the max point(s) + and convert it to integer (as it is used as topic_num) + :param score_dict: A dictionary of likelihood scores : topic_num + :return: topic_num that is predicted to have the max likelihood + """ + max_point = OptimalFinder().find_extreme( + x=list(score_dict.values()), + y=list(score_dict.keys()))[0] + logging.debug(msg="topic_num before rounding={}".format(max_point)) + + int_max_point = 1 if int(round(max_point)) == 0 else int(round(max_point)) + return int_max_point + + def _resolve_conflict(self, optimal: int, maximum: int, + topic_word_num: int, score_dict: Dict[float, int]): + """ + If maximum value != optimal value, try to resolve this conflict via iteration + :param optimal: the optimal value in the current score_dict + :param maximum: the maximum value predicted by polynomial model + :param topic_word_num: number of words in each topic + :param score_dict: A dictionary of likelihood scores : topic_num + :return: + """ + + if maximum == optimal: + # No conflict + return optimal + + # Has conflict, expand sample set to refine polynomial model + candidates = self._find_candidates( + optimal=optimal, + maximum=maximum, + checked=list(score_dict.values())) + + if not candidates: + # Cannot expand anymore, return current best value + return optimal + + for candidate in candidates: + # compute more topic_num-likelihood pair to refine model + likelihood = self._train(topic_num=candidate, word_num=topic_word_num) + score_dict[likelihood] = candidate + + # Iteratively tune with more data pairs + return self.tune_with_polynomial( + topic_word_num=topic_word_num, score_dict=score_dict) + + def _find_candidates(self, optimal: int, maximum: int, checked: List[int]) -> List[int]: + """ + Based on the optimal topic_num, maximum point on polynomial diagram, + generate a new list of candidates as sample points to refine the diagram + :param optimal: optimal topic_num in the current score_dict + :param maximum: maximum point in the polynomial diagram + :param checked: topic_num that has been checked, hence do not need to re-compute + :return: qualified new candidates to check + """ + + candidates = [optimal, maximum, int((optimal+maximum) * 0.5)] + qualified = [] + + for candidate in candidates: + # candidate for topic_num should be at least 1 + if candidate < 1: + continue + # avoid the long tail for accuracy + if candidate > self._stories_number: + continue + # no need to check candidate again + if candidate in checked: + continue + qualified.append(candidate) + + return qualified + + +# A sample output +if __name__ == '__main__': + model = ModelLDA() + + # pool = TokenPool(connect_to_db()) + pool = TokenPool(SampleHandler()) + + model.add_stories(pool.output_tokens()) + + topic_number = model.tune_with_polynomial() + print(topic_number) + + evaluation = model.evaluate(topic_num=topic_number) + print(evaluation) diff --git a/mediacloud/mediawords/util/topic_modeling/model_nmf.py b/mediacloud/mediawords/util/topic_modeling/model_nmf.py new file mode 100644 index 0000000000..7c9b116fd9 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/model_nmf.py @@ -0,0 +1,115 @@ +import numpy as np +import logging +from sklearn import decomposition + +# from mediawords.db import connect_to_db +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.util.topic_modeling.token_pool import TokenPool +from mediawords.util.topic_modeling.topic_model import BaseTopicModel +from gensim import corpora +from typing import Dict, List + + +class ModelNMF(BaseTopicModel): + """Generate topics of each story based on the NMF model + ModelNMG applies non-negative matrix factorization. + Whereas LDA is a probabilistic model capable of expressing uncertainty about the + placement of topics across texts and the assignment of words to topics, + NMF is a deterministic algorithm which arrives at a single representation of the corpus. + Because of this, the topic it came up with might be slightly different from LDA.""" + + def __init__(self) -> None: + """Initialisations""" + super().__init__() + self._stories_ids = [] + self._stories_tokens = [] + self._vocab = [] + self._token_matrix = np.empty + self._stories_number = 0 + self._random_state = 1 + logging.getLogger("lda").setLevel(logging.WARNING) + + def add_stories(self, stories: Dict[int, List[List[str]]]) -> None: + """ + Adding new stories into the model + :param stories: a dictionary of new stories + """ + new_stories_tokens = [] + + for story in stories.items(): + story_id = story[0] + story_tokens = story[1] + self._stories_ids.append(story_id) + new_stories_tokens.append( + [tokens for sentence_tokens in story_tokens for tokens in sentence_tokens]) + + self._stories_tokens += new_stories_tokens + self._stories_number = len(self._stories_ids) + self._recompute_matrix(new_stories_tokens=new_stories_tokens) + + def _recompute_matrix(self, new_stories_tokens: list) -> None: + """ + Recompute the token matrix based on new tokens in new stories + :param new_stories_tokens: a list of new tokens + """ + + # turn our token documents into a id <-> term dictionary + dictionary = corpora.Dictionary(new_stories_tokens) + + self._vocab = list(dictionary.token2id.keys()) + + token_count = [] + for story_tokens in self._stories_tokens: + token_count.append([story_tokens.count(token) for token in self._vocab]) + + self._token_matrix = np.array(token_count) + + def summarize_topic(self, total_topic_num: int = 0, each_topic_num: int = 1, + topic_word_num: int = 4, iteration_num: int = 1000) -> Dict[int, list]: + """ + summarize the topic of each story based on the frequency of occurrence of each word + :return: a dictionary of story id + and corresponding list of TOPIC_NUMBER topics (each topic contains WORD_NUMBER words) + """ + total_topic_num = total_topic_num if total_topic_num else self._stories_number + + self._model = decomposition.NMF( + n_components=total_topic_num, + max_iter=iteration_num, + random_state=self._random_state) + + document_topic = self._model.fit_transform(self._token_matrix) + + components = self._model.components_ + + topic_words_list = [] + for topic in components: + word_idx = np.argsort(topic)[::-1][0:topic_word_num] + topic_words_list.append([self._vocab[i] for i in word_idx]) + + document_topic /= np.sum(document_topic, axis=1, keepdims=True) + + story_topic = {} + + for i in range(self._stories_number): + top_topic_ids = np.argsort(document_topic[i, :])[::-1][0:each_topic_num] + story_topic[self._stories_ids[i]] = [topic_words_list[i] for i in top_topic_ids] + + return story_topic + + def evaluate(self): + pass + + +# A sample output +if __name__ == '__main__': + model = ModelNMF() + + # pool = TokenPool(connect_to_db()) + # model.add_stories(pool.output_tokens(1, 0)) + # model.add_stories(pool.output_tokens(5, 2)) + + pool = TokenPool(SampleHandler()) + model.add_stories(pool.output_tokens()) + + print(model.summarize_topic()) diff --git a/mediacloud/mediawords/util/topic_modeling/optimal_finder.py b/mediacloud/mediawords/util/topic_modeling/optimal_finder.py new file mode 100644 index 0000000000..f7e137b08e --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/optimal_finder.py @@ -0,0 +1,57 @@ +import numpy as np +import logging + +from typing import List +from numpy.polynomial import polynomial + + +class OptimalFinder: + """Given a list of data points, + identify the best fit polynomial equation, + and find the root point(s) which is the max/min value""" + + @staticmethod + def _identify_equation(x: List[int], + y: List[float], + degree: int=2) -> List[int]: + """ + Identify the polynomial equation of x and y + :param x: a list of x values + :param y: a list of y values + :param degree:c + :return: coefficient of polynomials, params[i] * x^(degree-i) + """ + + params = list(np.polyfit(x=x, y=y, deg=degree)[::-1]) + logging.warning(msg="Equation params = {}".format(params)) + return params + + @staticmethod + def _find_roots(params: List[int]=None) -> List[int]: + """ + Find the root of a polynomial equation + :param params: parameters of polynomial equation, params[i] * x^(degree-i) + :return: the list of roots + """ + roots = list(np.roots(params)) + logging.warning(msg="Equation roots = {}".format(roots)) + return roots + + def find_extreme(self, + x: List[int], + y: List[float], + degree: int=2) -> List[int]: + """ + Find out the extreme value of the polynomial via derivative + :param x: a list of x values + :param y: a list of y values + :param degree: max power of x + :return: the list of extreme values + """ + if len(x) < 3: + return [x[y.index(max(y))]] + params = self._identify_equation(x=x, y=y, degree=degree) + first_der_params = [param for param in polynomial.polyder(params)][::-1] + logging.warning(msg="First Derivative Parameters = {}".format(first_der_params)) + roots = self._find_roots(params=first_der_params) + return roots diff --git a/mediacloud/mediawords/util/topic_modeling/sample_handler.py b/mediacloud/mediawords/util/topic_modeling/sample_handler.py new file mode 100644 index 0000000000..285a473ae2 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/sample_handler.py @@ -0,0 +1,23 @@ +import os +import ast + +from mediawords.util.paths import mc_root_path + + +class SampleHandler: + """ + Mimic the behaviour of database handler, handles access to the sample file instead. + """ + _SAMPLE_STORIES \ + = os.path.join(mc_root_path(), + "mediacloud/mediawords/util/topic_modeling/sample_stories_1.txt") + + def query(self): + """ + mimics the behaviour of database query, except no query command is needed + :return: the sample data, which mimics the content of database + """ + with open(self._SAMPLE_STORIES) as sample_file: + lines = sample_file.readlines()[0] + + return ast.literal_eval(lines) diff --git a/mediacloud/mediawords/util/topic_modeling/sample_stories_1.txt b/mediacloud/mediawords/util/topic_modeling/sample_stories_1.txt new file mode 100644 index 0000000000..c2e3163dc2 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/sample_stories_1.txt @@ -0,0 +1 @@ +[{'stories_id': 11, 'sentence': 'U.S. | U.S. Court Seems Split on Utah Gay Marriage Ban'}, {'stories_id': 11, 'sentence': 'U.S. Court Seems Split on Utah Gay Marriage Ban'}, {'stories_id': 11, 'sentence': 'DENVER — The push for same-sex marriage, which has celebrated victory after victory in courtrooms across the country, entered an uncertain stage on Thursday as a federal appeals court appeared divided about whether the socially conservative state of Utah could limit marriage to a man and a woman.'}, {'stories_id': 11, 'sentence': 'In an hour of arguments inside a packed courtroom, three judges from the Federal Court of Appeals for the 10th Circuit sparred with lawyers about how such bans affected the children of same-sex parents and whether preventing gay couples from marrying actually did anything to promote or strengthen heterosexual unions and families.'}, {'stories_id': 11, 'sentence': 'Related Coverage'}, {'stories_id': 11, 'sentence': 'U.S. to Recognize 1,300 Marriages Disputed by Utah JAN.'}, {'stories_id': 11, 'sentence': '10, 2014'}, {'stories_id': 11, 'sentence': 'Justices’ Halt to Gay Marriage Leaves Utah Couples in Limbo JAN.'}, {'stories_id': 11, 'sentence': '6, 2014'}, {'stories_id': 11, 'sentence': 'Federal Judge Rules That Same-Sex Marriage Is Legal in Utah DEC.'}, {'stories_id': 11, 'sentence': '20, 2013'}, {'stories_id': 11, 'sentence': 'The three judges — two appointed by Republican presidents and one by a Democrat — focused on fine-grained issues, like what level of judicial scrutiny to apply to the case, as well as more profound questions of how to define a marriage and whether state bans on same-sex nuptials were akin to those against polygamy, or instead fundamentally violated the constitutional rights of same-sex couples.'}, {'stories_id': 11, 'sentence': 'While the decision will reverberate across Utah, it will hardly be the last word on whether same-sex couples have the same rights to marry as heterosexuals.'}, {'stories_id': 11, 'sentence': 'Next week, the same appeals court is scheduled to hear arguments over Oklahoma’s ban on same-sex marriage, which a federal judge declared unconstitutional in January.'}, {'stories_id': 11, 'sentence': 'Marriage cases in several other states, including Virginia and Texas, are percolating through the courts, and the Supreme Court is widely expected to tackle the issue.'}, {'stories_id': 11, 'sentence': 'In December, Utah briefly became the 18th state to legalize same-sex marriage when a federal judge in the state tossed out a voter-approved ban on such nuptials, one of about 11 similar prohibitions that passed in 2004.'}, {'stories_id': 11, 'sentence': 'The judge, who was appointed by President Obama with support from conservative Utah politicians, said the ban violated the “fundamental right” of same-sex couples to marry.'}, {'stories_id': 11, 'sentence': 'The prohibition, Judge Robert J. Shelby wrote, violated guarantees of due process and equal protection in the Constitution.'}, {'stories_id': 11, 'sentence': 'His ruling touched off 17 days of legal chaos as hundreds of same-sex couples poured into county clerks’ offices across the state to wed while Utah officials scrambled to stop them and put a halt to the marriages.'}, {'stories_id': 11, 'sentence': 'By the time the Supreme Court intervened and issued a stay in the case — effectively suspending the Utah judge’s ruling and temporarily reinstating the ban — more than 1,000 same-sex couples had married, and many had changed their names, signed up for spousal health insurance and taken steps to become legal parents of children they were raising.'}, {'stories_id': 11, 'sentence': 'On Thursday, Peggy A. Tomsic, a lawyer for the three same-sex couples who had gone to court against Utah, argued in the courtroom here that the state’s ban stigmatized same-sex couples, denying them a fundamental right for no valid reason.'}, {'stories_id': 11, 'sentence': 'Gene C. Schaerr, a lawyer for Utah, argued that the state’s residents had the right to limit marriages to exclude same-sex couples, and said that redefining it as “genderless” posed risks to a traditional view of the institution.'}, {'stories_id': 11, 'sentence': 'The judges fired a barrage of skeptical questions, questioning the state’s legal team on whether banning same-sex marriage was akin to outlawing interracial unions, and skeptically asking the plaintiffs whether the state was not entitled to set its own definitions of marriage.'}, {'stories_id': 11, 'sentence': 'Advertisement'}, {'stories_id': 11, 'sentence': 'Judge Paul J. Kelly, who was nominated by President George Bush, appeared more deferential to Utah’s voters and its Legislature while Judge Carlos F. Lucero, a Clinton appointee, asked pointed questions about whether Utah was stigmatizing children of gay couples.'}, {'stories_id': 11, 'sentence': 'Legal observers said the deciding vote appeared to belong to Judge Jerome A. Holmes, who was nominated by President George W. Bush, and lofted tough questions at both sides.'}, {'stories_id': 11, 'sentence': '“Why does it matter who’s claiming the right?”'}, {'stories_id': 11, 'sentence': 'Judge Holmes asked a lawyer representing Utah.'}, {'stories_id': 11, 'sentence': '“It’s a fundamental right, and why does it matter the participants in that enterprise?'}, {'stories_id': 11, 'sentence': 'Why does it matter?”'}, {'stories_id': 11, 'sentence': 'Thursday’s arguments signaled the first time an appeals court had considered the issue since the Supreme Court handed two major victories to gay-rights supporters last summer, striking down a law that denied federal benefits to same-sex couples and clearing the way for same-sex marriages across California.'}, {'stories_id': 11, 'sentence': 'It was a day freighted with emotion for gay-rights supporters and same-sex couples in Utah.'}, {'stories_id': 11, 'sentence': 'Dozens flew to Denver from Utah to attend the arguments, lining up early Thursday morning for a seat in the courtroom.'}, {'stories_id': 11, 'sentence': 'A conservative state lawmaker was one of a handful of supporters of the ban to attend the hearing.'}, {'stories_id': 11, 'sentence': '“Our lives are on the line here,” said Derek Kitchen, the plaintiff who lent his last name to the case — Kitchen v. Herbert.'}, {'stories_id': 11, 'sentence': 'Gary R. Herbert is Utah’s Republican governor.'}, {'stories_id': 11, 'sentence': 'As Mr. Kitchen and the other plaintiffs chatted and exchanged reassuring pats on the shoulder in the courtroom, they were approached by Utah’s attorney general, Sean Reyes, whose office has taken the lead role in defending the same-sex marriage ban.'}, {'stories_id': 11, 'sentence': 'Shaking hands and greeting the plaintiffs, Mr. Reyes crouched down and told them: “I’m sorry that we’re causing you pain.'}, {'stories_id': 11, 'sentence': 'Sometime after the case is over, I hope we can sit down.”'}, {'stories_id': 11, 'sentence': 'After the hearing, Mr. Reyes said he had told the plaintiffs that the legal confrontation was not personal, and that he knew that the plaintiffs’ families were as important to them as his own was to him.'}, {'stories_id': 11, 'sentence': 'But he said it was unclear what would happen to the unions and benefits of Utah’s newly married same-sex couples if the state prevailed in its appeals.'}, {'stories_id': 11, 'sentence': 'Utah has previously raised the possibility that those marriages could be dissolved.'}, {'stories_id': 11, 'sentence': 'Separately, in Indiana on Thursday, a federal judge ruled that the state must, for now, recognize the same-sex marriage of a woman who is terminally ill.'}, {'stories_id': 11, 'sentence': 'Nikole Quasney and Amy Sandler have two children and joined one of five lawsuits challenging the state’s ban on same-sex marriage last month, citing the need to have their relationship legally recognized in order to access benefits for surviving family members.'}, {'stories_id': 11, 'sentence': 'Ms. Quasney received a diagnosis of ovarian cancer in 2009; the couple married in Massachusetts last year.'}] \ No newline at end of file diff --git a/mediacloud/mediawords/util/topic_modeling/sample_stories_10.txt b/mediacloud/mediawords/util/topic_modeling/sample_stories_10.txt new file mode 100644 index 0000000000..42d02cae7e --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/sample_stories_10.txt @@ -0,0 +1 @@ +[{'sentence': 'WASHINGTON — Each of the three previous presidents — two Republicans, one Democrat — signed an increase in the federal minimum wage.', 'stories_id': 14}, {'sentence': 'Smugglers sent migrants across the Rio Grande at a point where the water is too shallow for Border Patrol boats that might have turned them back safely at the midriver boundary between the United States and Mexico.', 'stories_id': 16}, {'sentence': 'Twitter and Facebook Wield Little Influence on TV Watching', 'stories_id': 20}, {'sentence': 'U.S. | U.S. Court Seems Split on Utah Gay Marriage Ban', 'stories_id': 11}, {'sentence': 'How is the English language so impoverished that we do not have a word for the glory of rice brought nearly to a scorch at the bottom of a pot?', 'stories_id': 17}, {'sentence': 'Nine former professional hockey players have filed a lawsuit against the N.H.L. that says the league “intentionally created, fostered and promoted a culture of extreme violence.”', 'stories_id': 15}, {'sentence': 'WASHINGTON — A House committee voted on Thursday to hold a former Internal Revenue Service official in contempt for refusing to answer its questions about her role in holding up applications for tax exemption from conservative political groups before the last election.', 'stories_id': 18}, {'sentence': 'Mike Greste, the brother of a detained Al Jazeera journalist, Peter Greste, commented after an Egyptian judge dismissed videos presented by the prosecution.', 'stories_id': 19}, {'sentence': 'CBS made its choice, quickly and definitively: Stephen Colbert will succeed David Letterman as the host of its late-night franchise, which Mr. Letterman created when he came to the network in 1993.', 'stories_id': 21}, {'sentence': 'Sebelius Resigns After Troubles Over Health Site.', 'stories_id': 13}, {'sentence': 'The official, Lois Lerner, faced the same panel, the Oversight and Government Reform Committee, last year and made a statement denying any wrongdoing.', 'stories_id': 18}, {'sentence': 'The Spanish call it socarrat; the Chinese, guo ba.', 'stories_id': 17}, {'sentence': 'Ending a stormy five-year tenure marred by the disastrous rollout of President Obama’s signature legislative achievement, Kathleen Sebelius is resigning as secretary of health and human services.', 'stories_id': 13}, {'sentence': 'Listen to executives at Twitter and Facebook talk about how we watch television and you might walk away thinking that Americans are chattering nonstop on the social networks while watching their favorite shows.', 'stories_id': 20}, {'sentence': 'The network made the announcement on Thursday, exactly one week after Mr. Letterman said that he would be leaving the “Late Show With David Letterman” after one more year on the air.', 'stories_id': 21}, {'sentence': 'Why not President Obama?', 'stories_id': 14}, {'sentence': 'U.S. Court Seems Split on Utah Gay Marriage Ban', 'stories_id': 11}, {'sentence': 'HIDALGO, Tex. — Border Patrol agents in olive uniforms stood in broad daylight on the banks of the Rio Grande, while on the Mexican side smugglers pulled up in vans and unloaded illegal migrants.', 'stories_id': 16}, {'sentence': 'The suit, which was filed Wednesday in federal court in Manhattan, is the latest in a growing string of challenges to the N.H.L. Similar to suits brought by retired N.F.L. players , the complaint said that the N.H.L. failed to take adequate steps to warn the players of the dangers of the sport and deliberately promoted violence for profit.', 'stories_id': 15}, {'sentence': 'The image above is from March 31.', 'stories_id': 19}, {'sentence': 'Given Mr. Obama’s emphasis on income inequality, and the popularity of an increase in opinion polls, you would think he would.', 'stories_id': 14}, {'sentence': 'CAIRO — Prosecutors on Thursday were unable to produce video footage that they say is the basis of their case against three journalists accused of conspiring to broadcast false reports about civil strife in Egypt.', 'stories_id': 19}, {'sentence': 'The complaint is more graphic than other suits brought by former hockey players, highlighting the role of enforcers in the N.H.L. over many years and mentioning movies that celebrated fighting in hockey.', 'stories_id': 15}, {'sentence': 'The agents were clearly visible on that recent afternoon, but the migrants were undeterred.', 'stories_id': 16}, {'sentence': 'Mr. Colbert , the star of Comedy Central’s “Colbert Report,” will be — in one way — an all-new talent for CBS because he will drop the broadly satirical blowhard conservative character he has played for nine years, and instead perform as himself.', 'stories_id': 21}, {'sentence': 'It is graten in Haiti, nurungji in Korea, pegao in Puerto Rico, khao tang in Thailand, xoon in Senegal.', 'stories_id': 17}, {'sentence': 'Then she refused to answer questions , invoking her Fifth Amendment right to not incriminate herself.', 'stories_id': 18}, {'sentence': 'The reality is that most of us don’t tweet or post at all while we’re plopped in front of the tube.', 'stories_id': 20}, {'sentence': 'DENVER — The push for same-sex marriage, which has celebrated victory after victory in courtrooms across the country, entered an uncertain stage on Thursday as a federal appeals court appeared divided about whether the socially conservative state of Utah could limit marriage to a man and a woman.', 'stories_id': 11}, {'sentence': 'In Persian cuisine, it is tahdig and merits almost its own subgenre, with variations from potatoes to lettuce layered beneath rice in a heavy pan.', 'stories_id': 17}, {'sentence': '“Through the sophisticated use of extreme violence as a commodity, from which the N.H.L. has generated billions of dollars, the N.H.L. has subjected and continues to subject its players to the imminent risk of head trauma and, as a result, devastating and long-term negative health consequences,” the lawsuit said.', 'stories_id': 15}, {'sentence': 'Mainly women and children, 45 in all, they crossed the narrow river on the smugglers’ rafts, scrambled up the bluff and turned themselves in, signaling a growing challenge for the immigration authorities.', 'stories_id': 16}, {'sentence': 'Mr. Colbert became the immediate front-runner for the position both because of an increasing recognition of his talent — his show won two Emmy Awards last year — and because he clearly wanted the job.', 'stories_id': 21}, {'sentence': 'Republicans were outraged, asserting that Ms. Lerner had effectively waived her Fifth Amendment right by commenting on the accusations against her in her statement and in other settings, including under questioning from the Justice Department.', 'stories_id': 18}, {'sentence': 'In an hour of arguments inside a packed courtroom, three judges from the Federal Court of Appeals for the 10th Circuit sparred with lawyers about how such bans affected the children of same-sex parents and whether preventing gay couples from marrying actually did anything to promote or strengthen heterosexual unions and families.', 'stories_id': 11}, {'sentence': 'When we do, half the time we’re talking about something other than TV.', 'stories_id': 20}, {'sentence': 'But the story of recent increases underscores the indispensable ingredient he so far lacks: a Republican leader strongly motivated to make a deal over the party’s philosophical objections.', 'stories_id': 14}, {'sentence': 'Instead, they showed a Cairo courtroom footage of family photographs, trotting horses and Somali refugees in Kenya.', 'stories_id': 19}, {'sentence': 'The committee determined last year, in a party-line vote, that Ms. Lerner had indeed waived her right to not testify.', 'stories_id': 18}, {'sentence': 'Related Coverage', 'stories_id': 11}, {'sentence': 'His representation had ensured that he would be available to CBS by syncing his recent contracts with Mr. Letterman’s.', 'stories_id': 21}, {'sentence': 'And social media conversation is far weaker than traditional factors, like TV commercials for new shows or our sheer laziness in changing channels, in prompting us to tune into each season’s new offerings.', 'stories_id': 20}, {'sentence': '“It is obvious the prosecutor has not even looked at our videos or the evidence,” one of the defendants, Mohamed Fadel Fahmy, shouted across the courtroom here.', 'stories_id': 19}, {'sentence': 'In 1989, it was a new Republican in the White House.', 'stories_id': 14}, {'sentence': 'At Parmys Persian Fusion , which opened in November in the East Village, lavash is the crust, scotched with tiny broken sunrays that turn out to be grains of rice, flattened and bronzed.', 'stories_id': 17}, {'sentence': 'After six years of steep declines across the Southwest, illegal crossings have soared in South Texas while remaining low elsewhere.', 'stories_id': 16}, {'sentence': 'The plaintiffs in the suit are the former players Dan LaCouture, 36; Dan Keczmer, 45; Jack Carlson, 59; Richard Brennan, 41; Brad Maxwell, 56; Michael Peluso, 48; Tom Younghans, 61; Allan Rourke, 34; and Scott Bailey, 41.', 'stories_id': 15}, {'sentence': 'On Thursday, it voted 21-12 to hold her in contempt and refer the matter to the full House of Representatives.', 'stories_id': 18}, {'sentence': '“The trial is a joke,” he said.', 'stories_id': 19}, {'sentence': 'President George Bush, while campaigning to succeed Ronald Reagan, had promised “a kinder, gentler America.”', 'stories_id': 14}, {'sentence': 'Some of the players were brawlers, like Carlson, who racked up 1,111 penalty minutes in the N.H.L. and the World Hockey Association.', 'stories_id': 15}, {'sentence': 'Those are among the crucial findings of a new study released Thursday by the Council for Research Excellence, a Nielsen-funded group that does in-depth research on how Americans use media that is shared with its member broadcasters, advertisers, publishers and social media companies.', 'stories_id': 20}, {'sentence': 'His current deal with Comedy Central will expire at the end of this year, making the timing ideal for him to leave for CBS.', 'stories_id': 21}, {'sentence': 'The Border Patrol made more than 90,700 apprehensions in the Rio Grande Valley in the past six months, a 69 percent increase over last year.', 'stories_id': 16}, {'sentence': 'U.S. to Recognize 1,300 Marriages Disputed by Utah JAN.', 'stories_id': 11}, {'sentence': 'They pop under the teeth.', 'stories_id': 17}, {'sentence': '10, 2014', 'stories_id': 11}, {'sentence': 'The council surveyed 1,665 respondents, ages 15 to 54, who were selected to be representative of the online population.', 'stories_id': 20}, {'sentence': 'Over this is poured gheimeh, a thick, deep red stew of beef, broken-down tomatoes and yellow split peas, saturated with the tang of limes boiled and sun-baked until black and imploding.', 'stories_id': 17}, {'sentence': 'The migrants are no longer primarily Mexican laborers.', 'stories_id': 16}, {'sentence': 'Mr. Taylor added: “Ms. Lerner did not waive her Fifth Amendment rights by proclaiming her innocence.', 'stories_id': 18}, {'sentence': 'In a statement on Thursday, he said: “I won’t be doing the new show in character, so we’ll all get to find out how much of him was me.', 'stories_id': 21}, {'sentence': '“This is arbitrary detention.”', 'stories_id': 19}, {'sentence': 'He was supposed to play the third Hanson brother in the 1977 movie “Slap Shot,” but was called up from the minors just before filming began.', 'stories_id': 15}, {'sentence': 'The Democrats then controlling both houses of Congress set out to take him up on it.', 'stories_id': 14}, {'sentence': 'Justices’ Halt to Gay Marriage Leaves Utah Couples in Limbo JAN.', 'stories_id': 11}, {'sentence': 'Peluso, who played in the N.H.L. from 1990 to 1998, led the league with 408 penalty minutes in 1991-92 and fought, according to the suit, 179 times in his nine-year career.', 'stories_id': 15}, {'sentence': 'There is not a court in this country that will hold Ms. Lerner in contempt of Congress.”', 'stories_id': 18}, {'sentence': 'The participants used a mobile app to report any time they saw, heard or communicated something about prime-time TV shows over the course of 21 days last fall, as the new season’s lineup of TV shows made their debuts.', 'stories_id': 20}, {'sentence': 'I’m looking forward to it.”', 'stories_id': 21}, {'sentence': 'Instead they are Central Americans, including many families with small children and youngsters without their parents, who risk a danger-filled journey across Mexico.', 'stories_id': 16}, {'sentence': 'The judge nonetheless rejected the journalists’ appeals to be released on bail and returned them to jail until the next court session, scheduled for April 22.', 'stories_id': 19}, {'sentence': 'Mr. Bush drove a hard bargain on the minimum wage.', 'stories_id': 14}, {'sentence': 'This is intended as an appetizer; the kitchen has overshot.', 'stories_id': 17}, {'sentence': 'The three defendants — Peter Greste, an Australian; Mr. Fahmy, a dual citizen of Egypt and Canada; and Baher Mohamed, an Egyptian — have been held since their arrest in December on charges that they conspired with the Muslim Brotherhood to broadcast false reports of unrest in order to bring down the military-backed government.', 'stories_id': 19}, {'sentence': '6, 2014', 'stories_id': 11}, {'sentence': 'He vetoed the first version Congress sent on grounds that it raised the wage by 30 cents an hour too much.', 'stories_id': 14}, {'sentence': 'In a 2011 interview with juniorhockey.com , Peluso said he was suffering concussion-related seizures and depression in retirement and complained about poor pension benefits and health insurance from the N.H.L. and the N.H.L. Players’ Association.', 'stories_id': 15}, {'sentence': 'Driven out by deepening poverty but also by rampant gang violence, increasing numbers of migrants caught here seek asylum, setting off lengthy legal procedures to determine whether they qualify.', 'stories_id': 16}, {'sentence': 'Turshi, a loose condiment of pickled vegetables that looks like salsa verde, arrives with the bread but is better reserved for the rice and meat.', 'stories_id': 17}, {'sentence': 'Representative John J. Duncan Jr., a Republican member of the committee from Tennessee and a former judge, said Thursday that Ms. Lerner could not be allowed to make a statement asserting her innocence and then invoke her Fifth Amendment right.', 'stories_id': 18}, {'sentence': 'Only 16.1 percent of the survey respondents said they had used social media while watching TV during prime time.', 'stories_id': 20}, {'sentence': 'Mr. Colbert, 49, had been subtly shifting away from the character in recent years, especially in on-air interviews.', 'stories_id': 21}, {'sentence': 'People close to him said he had for some time believed he would soon have to move beyond the satirical Colbert character — though not from the name.', 'stories_id': 21}, {'sentence': 'But he eventually accepted a two-stage increase to $4.25 an hour on the condition that lawmakers include a lower “training wage” for teenagers.', 'stories_id': 14}, {'sentence': 'Grilled eggplant is littered with dried mint and garlic chips fried nearly black, under a ring of kashk (whey) with a sourness past yogurt’s.', 'stories_id': 17}, {'sentence': '“If that was possible, every person, every defendant in any proceeding in this country would do that,” Mr. Duncan said.', 'stories_id': 18}, {'sentence': 'All three journalists worked for Al Jazeera’s English-language news channel.', 'stories_id': 19}, {'sentence': 'And less than half of the people using social media were actually discussing the show they were watching.', 'stories_id': 20}, {'sentence': 'The new migrant flow, largely from El Salvador, Guatemala and Honduras, is straining resources and confounding Obama administration security strategies that work effectively in other regions.', 'stories_id': 16}, {'sentence': '“ There is no question in my mind that brain injuries and depression are linked,” Peluso told the website.', 'stories_id': 15}, {'sentence': 'Federal Judge Rules That Same-Sex Marriage Is Legal in Utah DEC.', 'stories_id': 11}, {'sentence': 'In 1996, it was a new Republican Senate leader.', 'stories_id': 14}, {'sentence': 'He has used the French pronunciation of Colbert (Cole-BEAR, rather than COLE-burt) during his entire career in show business.', 'stories_id': 21}, {'sentence': '20, 2013', 'stories_id': 11}, {'sentence': '“However, I find it has more to do with low self-esteem.', 'stories_id': 15}, {'sentence': 'It is further complicating President Obama’s uphill push on immigration, fueling Republican arguments for more border security before any overhaul.', 'stories_id': 16}, {'sentence': 'Facebook was by far the most popular social network for people chatting during shows, used by about 11.4 percent of TV watchers, compared with 3.3 percent for Twitter.', 'stories_id': 20}, {'sentence': 'A fourth Al Jazeera journalist, Abdullah Elshamy, who worked for its main Arabic-language channel, has been held without charges since last August.', 'stories_id': 19}, {'sentence': '“They’d come in and testify and then plead the Fifth so they couldn’t be questioned, so they couldn’t be cross-examined, so that they couldn’t be held accountable.”', 'stories_id': 18}, {'sentence': 'Kuku sabzi, described on the menu as a pie, is closer to a frittata, moist yet springy, with almost more herbs than egg.', 'stories_id': 17}, {'sentence': '“To allow this,” Mr. Duncan said, “makes a mockery of our system.”', 'stories_id': 18}, {'sentence': 'Other creative details of the new show are still undetermined, CBS executives said, including whether the show will remain in New York or relocate to Los Angeles.', 'stories_id': 21}, {'sentence': 'With detention facilities, asylum offices and immigration courts overwhelmed, enough migrants have been released temporarily in the United States that back home in Central America people have heard that those who make it to American soil have a good chance of staying.', 'stories_id': 16}, {'sentence': 'The stews are dense and rich: ghormeh sabzi, underscored by bittersweet fenugreek and whole collapsing orbs of black limes; fesenjan, chicken sticky with pomegranate molasses and simmered with crushed walnuts, with an infusion of sweet potato purée for extra body; lamb shank slow-cooked with cinnamon and dunked in a ruddy broth that turns out to be the part everyone wants.', 'stories_id': 17}, {'sentence': 'The three judges — two appointed by Republican presidents and one by a Democrat — focused on fine-grained issues, like what level of judicial scrutiny to apply to the case, as well as more profound questions of how to define a marriage and whether state bans on same-sex nuptials were akin to those against polygamy, or instead fundamentally violated the constitutional rights of same-sex couples.', 'stories_id': 11}, {'sentence': 'Trent Lott took over after Bob Dole, then running for president against the incumbent Democrat, Bill Clinton, resigned his Senate seat.', 'stories_id': 14}, {'sentence': 'They have denied any connection to the Muslim Brotherhood.', 'stories_id': 19}, {'sentence': 'Former skilled guys suffered head injuries, and they don’t appear to be suicidal.”', 'stories_id': 15}, {'sentence': 'The research findings contradict the notion — peddled heavily by Twitter and Facebook in their pitches to producers — that conversations on Twitter and Facebook are a big factor driving people to tune into TV shows.', 'stories_id': 20}, {'sentence': 'But if you are Persian, you are here for the kebabs.', 'stories_id': 17}, {'sentence': 'While the decision will reverberate across Utah, it will hardly be the last word on whether same-sex couples have the same rights to marry as heterosexuals.', 'stories_id': 11}, {'sentence': 'But several executives connected to the negotiations pointed out that Mr. Colbert had established a settled family life in Montclair, N.J., and had never looked to move to Hollywood.', 'stories_id': 21}, {'sentence': '“Word has gotten out that we’re giving people permission and walking them out the door,” said Chris Cabrera, a Border Patrol agent who is vice president of the local of the National Border Patrol Council, the agents’ union.', 'stories_id': 16}, {'sentence': '“Social media did have an impact on viewing choice, but it was still relatively small compared to traditional promotion,” said Beth Rockwood, senior vice president for market resources at Discovery Communications , who is the chairwoman of the research group’s social media committee.', 'stories_id': 20}, {'sentence': 'Mr. Clinton, who had battled fiercely with the House speaker, Newt Gingrich, and Mr. Dole, emerged with the upper hand after a government shutdown.', 'stories_id': 14}, {'sentence': 'Multimedia Feature: Timeline of Turmoil in Egypt After Mubarak and Morsi', 'stories_id': 19}, {'sentence': 'Democrats accused Republican members of making a mockery of a citizen’s constitutional rights.', 'stories_id': 18}, {'sentence': 'LaCouture, who retired in 2009, was involved in 52 fights.', 'stories_id': 15}, {'sentence': '“So they’re coming across in droves.”', 'stories_id': 16}, {'sentence': 'Only 6.8 percent of the respondents said that something on a social network pushed them to tune into a new prime time show.', 'stories_id': 20}, {'sentence': 'Representative Elijah E. Cummings of Maryland, the ranking Democrat on the committee, compared the committee’s chairman, Representative Darrell Issa of California, to Joseph R. McCarthy, the Republican senator who used his subpoena power to accuse citizens of Communist sympathies in the 1950s.', 'stories_id': 18}, {'sentence': 'The lawsuit said he sustained a concussion at the hands of Robyn Regehr in 2004 and suffered from headaches, irritability, sensitivity to light, charge of personality and depression.', 'stories_id': 15}, {'sentence': 'Their case has attracted international attention because the journalists are experienced and highly regarded professionals.', 'stories_id': 19}, {'sentence': 'Next week, the same appeals court is scheduled to hear arguments over Oklahoma’s ban on same-sex marriage, which a federal judge declared unconstitutional in January.', 'stories_id': 11}, {'sentence': 'Mr. Lott sought to get the legislative wheels turning again to help Republicans preserve their Senate majority in that November’s elections.', 'stories_id': 14}, {'sentence': 'Also, CBS owns the Ed Sullivan Theater on Broadway, where Mr. Letterman has worked for the last 21 years.', 'stories_id': 21}, {'sentence': 'Best are the lamb chops sweetened with a red-wine-vinegar reduction; Cornish game hen soaked in saffron and lemon; and koobideh, a mash of beef ground three times and adrenalized with jalapeños.', 'stories_id': 17}, {'sentence': '“I cannot cast a vote that would place me on the same page of the history books as Senator Joseph McCarthy or the House Un-American Activities Committee,” Mr. Cummings said.', 'stories_id': 18}, {'sentence': 'Marriage cases in several other states, including Virginia and Texas, are percolating through the courts, and the Supreme Court is widely expected to tackle the issue.', 'stories_id': 11}, {'sentence': 'That shaker on the table is filled with sumac; now is the time to use it.', 'stories_id': 17}, {'sentence': 'Mr. Greste previously worked for the BBC, and Mr. Fahmy worked for CNN and was a reporting assistant for The New York Times.', 'stories_id': 19}, {'sentence': 'Other players in the lawsuit were journeymen who rarely fought, like Rourke, who, according to hockeyfights.com, had 17 fights in his 14-year major junior and professional career.', 'stories_id': 15}, {'sentence': 'In Mexican border cities like Reynosa, just across the river, migrants have become easy prey for Mexican drug cartels that have seized control of the human smuggling business, heightening perils for illegal crossers and security risks for the United States.', 'stories_id': 16}, {'sentence': '“The thing that was completely balled up in the spokes was the minimum wage,” Mr. Lott, now a lobbyist, recalled in an interview.', 'stories_id': 14}, {'sentence': 'Nearly 40 percent of respondents said TV commercials for a new show prompted them to tune in, and about one-third said they watched because it was a program they already watched regularly.', 'stories_id': 20}, {'sentence': 'It is the natural home for the new Colbert show, the executives said.', 'stories_id': 21}, {'sentence': 'Keczmer never accumulated more than 75 minutes in penalties in a season during his 10-year professional career and, according to the suit, had one N.H.L. fight.', 'stories_id': 15}, {'sentence': 'Both sides cited legal scholars who supported their interpretation of whether Ms. Lerner’s statements amounted to a waiver of her Fifth Amendment right.', 'stories_id': 18}, {'sentence': 'On my visits, I brought a tough crowd of Iranian descent.', 'stories_id': 17}, {'sentence': 'In December, Utah briefly became the 18th state to legalize same-sex marriage when a federal judge in the state tossed out a voter-approved ban on such nuptials, one of about 11 similar prohibitions that passed in 2004.', 'stories_id': 11}, {'sentence': 'Even the couch potato factor was more important than Twitter or Facebook: About one in 10 people said they checked out a new show because it was appearing on the channel they were already watching.', 'stories_id': 20}, {'sentence': 'Leslie Moonves, the chief executive of CBS, who was the primary mover in getting the deal done, said the negotiations moved at a breakneck pace beginning the day Mr. Letterman announced his plans.', 'stories_id': 21}, {'sentence': 'So he shepherded a two-stage rise to $5.15 an hour that included some tax breaks for businesses.', 'stories_id': 14}, {'sentence': 'At the Rio Grande that afternoon, the smugglers calculatedly sent the migrants across at a point where the water is too shallow for Border Patrol boats that might have turned them back safely at the midriver boundary between the United States and Mexico.', 'stories_id': 16}, {'sentence': 'But their case has also opened a window into the treatment of thousands of other Egyptians detained since last August in the sweeping crackdown on dissent that followed the military ouster of President Mohamed Morsi of the Muslim Brotherhood.', 'stories_id': 19}, {'sentence': 'A separate panel, the House Ways and Means Committee, voted along party lines on Wednesday to formally ask Attorney General Eric H. Holder Jr. to begin a criminal investigation of Ms. Lerner , accusing her of “willful misconduct.”', 'stories_id': 18}, {'sentence': 'Mr. Lott got what he wanted.', 'stories_id': 14}, {'sentence': 'The judge, who was appointed by President Obama with support from conservative Utah politicians, said the ban violated the “fundamental right” of same-sex couples to marry.', 'stories_id': 11}, {'sentence': 'My guests approved, but they were exacting about the kebabs.', 'stories_id': 17}, {'sentence': 'The case will be heard by United States District Judge Shira A. Scheindlin, who ruled that New York City’s stop-and-frisk policy should be overturned and then was removed from the case by the Court of Appeals.', 'stories_id': 15}, {'sentence': 'Several local university students also accused in the case stood alongside the three journalists on Thursday in the metal cage that holds defendants in Egyptian courtrooms.', 'stories_id': 19}, {'sentence': 'The researchers did find some groups that were big into social TV chatter.', 'stories_id': 20}, {'sentence': 'A Border Patrol chief, Raul Ortiz, watched in frustration from a helicopter overhead.', 'stories_id': 16}, {'sentence': 'Mr. Moonves said a “barrage of calls” immediately came in from representatives of comics seeking the job.', 'stories_id': 21}, {'sentence': 'Two days after signing the minimum-wage increase, Mr. Clinton signed a separate compromise overhauling the welfare system.', 'stories_id': 14}, {'sentence': 'This is the latest lawsuit involving violence in the N.H.L. In May, the family of Derek Boogaard filed a wrongful-death lawsuit against the N.H.L., saying the league was responsible for the physical trauma and brain damage Boogaard sustained in six seasons as one of the league’s top enforcers.', 'stories_id': 15}, {'sentence': '“Somebody probably told them they’re going to get released,” he said.', 'stories_id': 16}, {'sentence': 'One of them, Khaled Mohamed Abdel Raouf, fainted and police officers carried his limp body out of the courtroom.', 'stories_id': 19}, {'sentence': 'Generally, women, Hispanics and people aged 25 to 34 were more likely to watch and post.', 'stories_id': 20}, {'sentence': 'The steak is a little dry, they said.', 'stories_id': 17}, {'sentence': 'But when Mr. Colbert’s agent, James Dixon, called to express Mr. Colbert’s interest, the talks quickly became serious.', 'stories_id': 21}, {'sentence': 'It was a highly unusual step for the tax-writing committee.', 'stories_id': 18}, {'sentence': 'The prohibition, Judge Robert J. Shelby wrote, violated guarantees of due process and equal protection in the Constitution.', 'stories_id': 11}, {'sentence': 'Male, Asian and black viewers, as well as people aged 45 to 54, were less likely to chat about social TV.', 'stories_id': 20}, {'sentence': 'The other defendants said Mr. Raouf had been on a hunger strike to protest the conditions of his incarceration in the notorious wing of Tora prison known as the Scorpion.', 'stories_id': 19}, {'sentence': 'His ruling touched off 17 days of legal chaos as hundreds of same-sex couples poured into county clerks’ offices across the state to wed while Utah officials scrambled to stop them and put a halt to the marriages.', 'stories_id': 11}, {'sentence': 'Ms. Lerner was the head of the I.R.S.’s division on tax-exempt organizations when it flagged Tea Party-affiliated groups for special scrutiny, slowing down their approval.', 'stories_id': 18}, {'sentence': 'Republicans added two seats to their Senate majority in November.', 'stories_id': 14}, {'sentence': 'As agents booked them, the migrants waited quietly: a Guatemalan mother carrying a toddler with a baby bottle, another with an infant wrapped in blankets.', 'stories_id': 16}, {'sentence': 'Where is the saffron?', 'stories_id': 17}, {'sentence': 'The five-year deal was not difficult to conclude, Mr. Moonves said, because both sides were equally interested.', 'stories_id': 21}, {'sentence': 'Boogaard died of an accidental overdose of prescription painkillers and alcohol in 2011.', 'stories_id': 15}, {'sentence': 'The Treasury Department’s inspector general concluded that employees under Ms. Lerner had acted inappropriately but that there was no evidence to support Republicans’ accusations of political motivation.', 'stories_id': 18}, {'sentence': 'Eleven years later, President George W. Bush was the Republican in need.', 'stories_id': 14}, {'sentence': 'A 9-year-old girl said she was traveling by herself, hoping to rejoin her mother and two brothers in Louisiana.', 'stories_id': 16}, {'sentence': 'Also, the council said that about 22 percent of the whole survey group were “superconnectors,” defined as people who actively follow shows and actors on social media and comment or interact with them several times a day.', 'stories_id': 20}, {'sentence': 'But he said that Mr. Colbert had one special request: “He said, ‘I want to be sure Dave is on board.’ ” Mr. Moonves said he had already decided that “it was essential to me to get Dave’s blessing.”', 'stories_id': 21}, {'sentence': 'By the time the Supreme Court intervened and issued a stay in the case — effectively suspending the Utah judge’s ruling and temporarily reinstating the ban — more than 1,000 same-sex couples had married, and many had changed their names, signed up for spousal health insurance and taken steps to become legal parents of children they were raising.', 'stories_id': 11}, {'sentence': 'And why the wanton strewing of shredded onions?', 'stories_id': 17}, {'sentence': 'The students are being charged along with the journalists as part of the same conspiracy, but several of the students have said that they do not know the journalists or understand what is said to be their connection to the case.', 'stories_id': 19}, {'sentence': 'In November, a group of players who were in the league in the 1970s, ’80s and ’90s, filed a lawsuit in federal court in Washington, saying N.H.L. officials should have done more to address head injuries but instead celebrated a culture of speed and violence.', 'stories_id': 15}, {'sentence': 'The Oversight Committee, however, concluded last month that Ms. Lerner was motivated by political ideology.', 'stories_id': 18}, {'sentence': 'Neither the prosecutors nor the judge displayed any visible reaction to the startling lack of evidence.', 'stories_id': 19}, {'sentence': 'The long-delayed civil suit of the former Colorado Avalanche player Steve Moore against Todd Bertuzzi and the Vancouver Canucks is scheduled to be heard in an Ontario court in September.', 'stories_id': 15}, {'sentence': '(I nodded supportively, having found almost everything, apart from an unfortunate salmon skewer, delicious.)', 'stories_id': 17}, {'sentence': 'So he called and spoke to the star personally to let him know that the network was leaning toward hiring Mr. Colbert.', 'stories_id': 21}, {'sentence': 'On Thursday, Peggy A. Tomsic, a lawyer for the three same-sex couples who had gone to court against Utah, argued in the courtroom here that the state’s ban stigmatized same-sex couples, denying them a fundamental right for no valid reason.', 'stories_id': 11}, {'sentence': 'Those superconnectors were significantly more active on social media than other people, suggesting that advertisers and TV producers might want to find ways to better target those people with their social media promotions.', 'stories_id': 20}, {'sentence': 'But she did not know where in Louisiana they were.', 'stories_id': 16}, {'sentence': 'Raising the minimum wage did not come naturally to Mr. Bush, who promoted a more conservative, market-oriented economic agenda than his father.', 'stories_id': 14}, {'sentence': 'The Oversight Committee has collected thousands of pages of I.R.S. documents but has accused the agency of stonewalling its investigation.', 'stories_id': 18}, {'sentence': 'Bertuzzi attacked Moore during a game in March 2004, breaking three of Moore’s neck vertebrae and ending his career.', 'stories_id': 15}, {'sentence': '“The superconnectors are an important group to think about,” Ms. Rockwood said.', 'stories_id': 20}, {'sentence': 'But by 2007, he had suffered grievous political damage from setbacks in the Iraq war and his administration’s handling of Hurricane Katrina.', 'stories_id': 14}, {'sentence': 'After a two-week journey from Honduras, her only connection to them was one telephone number on a scrap of paper.', 'stories_id': 16}, {'sentence': '“Dave was very happy,” Mr. Moonves said.', 'stories_id': 21}, {'sentence': 'Gene C. Schaerr, a lawyer for Utah, argued that the state’s residents had the right to limit marriages to exclude same-sex couples, and said that redefining it as “genderless” posed risks to a traditional view of the institution.', 'stories_id': 11}, {'sentence': 'At one point, the judge ordered the courtroom technicians to display video footage contained on a small USB drive belonging to Mr. Greste, but it turned out to contain only material from his earlier work, in Nairobi.', 'stories_id': 19}, {'sentence': 'The restaurant feels roomy, with walls and pillars of exposed brick and curved mirrors.', 'stories_id': 17}, {'sentence': 'Even if the full House votes to find Ms. Lerner in contempt, it is not likely to have any practical effect.', 'stories_id': 18}, {'sentence': 'For a while the court watched a news conference held in English by a Kenyan official.', 'stories_id': 19}, {'sentence': 'A television murmurs distractingly behind the bar, often tuned to QVC.', 'stories_id': 17}, {'sentence': 'The judges fired a barrage of skeptical questions, questioning the state’s legal team on whether banning same-sex marriage was akin to outlawing interracial unions, and skeptically asking the plaintiffs whether the state was not entitled to set its own definitions of marriage.', 'stories_id': 11}, {'sentence': 'A Honduran woman said the group had followed the instructions of the Mexican smugglers.', 'stories_id': 16}, {'sentence': 'And live events, like awards shows, drew more social media chatter — an area that Twitter views as a particular strength.', 'stories_id': 20}, {'sentence': '“He was very supportive and said it was a great choice.”', 'stories_id': 21}, {'sentence': 'Bill Daly, deputy commissioner of the N.H.L., said that this week’s suit did not appear to be substantively different from the one filed in November.', 'stories_id': 15}, {'sentence': 'Democrats, who had recaptured both houses of Congress in the 2006 midterm elections, set out to end the longest period without a minimum wage increase since its inception during the administration of President Franklin D. Roosevelt.', 'stories_id': 14}, {'sentence': '“In short, we are completely satisfied with our record on Player Safety, including as it relates to head injuries and brain trauma,” he said in a statement.', 'stories_id': 15}, {'sentence': '“We were getting jammed,” said Tony Fratto, then a White House aide.', 'stories_id': 14}, {'sentence': 'The soundtrack vacillates between phantoms of the ’80s (“Careless Whisper,” “Lady in Red”) and Parsi pop.', 'stories_id': 17}, {'sentence': '“The Emmys were a real standout in the period we were surveying,” Ms. Rockwood said.', 'stories_id': 20}, {'sentence': 'In a statement, Mr. Letterman said: “Stephen has always been a real friend to me.', 'stories_id': 21}, {'sentence': 'A defense lawyer interrupted to tell the judge, who does not appear to speak English, that the news conference and other Kenyan material was irrelevant to the charges.', 'stories_id': 19}, {'sentence': 'Advertisement', 'stories_id': 11}, {'sentence': 'Mr. Holder was cited for contempt by the chamber in 2012 for failing to disclose documents related to the botched gunrunning investigation known as Operation Fast and Furious.', 'stories_id': 18}, {'sentence': '“They just told us to cross and start walking,” she said.', 'stories_id': 16}, {'sentence': 'Judge Paul J. Kelly, who was nominated by President George Bush, appeared more deferential to Utah’s voters and its Legislature while Judge Carlos F. Lucero, a Clinton appointee, asked pointed questions about whether Utah was stigmatizing children of gay couples.', 'stories_id': 11}, {'sentence': 'Mr. Bush ultimately accepted a two-stage increase to $7.25 an hour as part of a bill to finance his most urgent priority, the Iraq war.', 'stories_id': 14}, {'sentence': '“We do not believe the new Complaint provides any valid basis for liability or damages as against the National Hockey League and we intend to defend the case and others that may follow it vigorously.”', 'stories_id': 15}, {'sentence': 'Other migrants were trying to elude the Border Patrol, and within the hour Chief Ortiz saw his interdiction efforts working according to plan.', 'stories_id': 16}, {'sentence': 'Among the desserts, pomegranate sorbet and rose-petal gelato bear no trace of their alleged flavors, and both are striped, discordantly, with chocolate sauce.', 'stories_id': 17}, {'sentence': 'The criminal referral against Mr. Holder was sent to the Justice Department, which did not pursue it, as George W. Bush’s Justice Department declined to pursue contempt citations passed in 2008 against White House officials.', 'stories_id': 18}, {'sentence': 'But the judge nonetheless ordered the video to continue.', 'stories_id': 19}, {'sentence': 'Daily Report: As the Internet Grows, It Grows Less Secure', 'stories_id': 20}, {'sentence': 'I’m very excited for him, and I’m flattered that CBS chose him.', 'stories_id': 21}, {'sentence': 'Now Mr. Obama seeks a Republican partner.', 'stories_id': 14}, {'sentence': '(“There’s the fusion,” one disgruntled diner said.)', 'stories_id': 17}, {'sentence': 'Microsoft Touts Data Protection Approval in Europe; Eager for New Customers', 'stories_id': 20}, {'sentence': 'I also happen to know they wanted another guy with glasses.”', 'stories_id': 21}, {'sentence': 'Over the course of the court session, more than a half dozen video clips were screened, but they appeared to come from the BBC, Sky News, Al Arabiya, and Mr. Greste’s family vacation.', 'stories_id': 19}, {'sentence': 'Legal observers said the deciding vote appeared to belong to Judge Jerome A. Holmes, who was nominated by President George W. Bush, and lofted tough questions at both sides.', 'stories_id': 11}, {'sentence': 'A short way upriver in deeper water, agents radioed that they had turned back a raft with eight “bodies.”', 'stories_id': 16}, {'sentence': 'The N.H.L. set up a concussion study program in 1997, the first in North American major league sports, and has in recent years modified rules in response to increased concern about head trauma.', 'stories_id': 15}, {'sentence': 'None came from Al Jazeera or were related to the charges in this case.', 'stories_id': 19}, {'sentence': 'The minimum wage remains at $7.25 — in inflation-adjusted terms, more than $2 below where it stood 40 years ago.', 'stories_id': 14}, {'sentence': 'Mr. Colbert has made a name for pushing the edges of political satire, at times enraging voices on the right with his bumptious rendering of conservative positions.', 'stories_id': 21}, {'sentence': 'When your need to know is right now.', 'stories_id': 20}, {'sentence': 'But then comes zoolbia bamieh, a swirl of deep-fried dough coated with rosewater-infused honey, alongside the Persian equivalent of doughnut holes.', 'stories_id': 17}, {'sentence': 'But the lawsuit calls those moves “untimely and ineffective.”', 'stories_id': 15}, {'sentence': 'Moments later a surveillance blimp cruising nearby detected people lying under dense brush.', 'stories_id': 16}, {'sentence': '“Why does it matter who’s claiming the right?”', 'stories_id': 11}, {'sentence': 'Famously, he disturbed the media universe at the White House Correspondents’ Association dinner in 2006 when he gave no quarter in mocking then-President Bush.', 'stories_id': 21}, {'sentence': 'As the helicopter swooped low, the pilot spotted sneakers at the base of the trees.', 'stories_id': 16}, {'sentence': 'Despite calls from around the world for the release of the journalists, the judge ordered the prosecutors to sort through the video material before the next hearing.', 'stories_id': 19}, {'sentence': 'Judge Holmes asked a lawyer representing Utah.', 'stories_id': 11}, {'sentence': 'Lawyers representing the players did not return calls for comment about their strategy, though their complaint includes mentions of the Broad Street Bullies of the 1970s Philadelphia Flyers, movies like “The Last Gladiators” and “Mystery, Alaska,” and the recent bench-clearing brawl in a charity hockey game between New York City firefighters and policemen.', 'stories_id': 15}, {'sentence': 'But there is little sign that any critical mass of Republicans wants to make it happen, much less Speaker John A. Boehner or the Senate minority leader, Mitch McConnell.', 'stories_id': 14}, {'sentence': 'They are almost painfully sweet, which is the point.', 'stories_id': 17}, {'sentence': 'Download for quick access to up-to-the minute technology news.', 'stories_id': 20}, {'sentence': 'Agents on the ground flushed out nine migrants, all men.', 'stories_id': 16}, {'sentence': 'Though he has never openly endorsed Democrats or liberal positions (hardly what his conservative character would do), he did turn up seated next to Michelle Obama at a state dinner at the White House this year (and his character even bragged about it on the air).', 'stories_id': 21}, {'sentence': 'When they appeared, my companions, for the first time all evening, said not a word.', 'stories_id': 17}, {'sentence': 'Their position does not surprise Democrats in Congress and the White House.', 'stories_id': 14}, {'sentence': '(The complaint also mistakenly said that Gordie Howe, who is still alive, died in 2009.)', 'stories_id': 15}, {'sentence': '“It’s a fundamental right, and why does it matter the participants in that enterprise?', 'stories_id': 11}, {'sentence': 'Why does it matter?”', 'stories_id': 11}, {'sentence': 'While the complaint seeks a jury trial, legal experts said the players would prefer to settle out of court, no doubt aware of the $765 million proposed settlement between the N.F.L. and retired football players.', 'stories_id': 15}, {'sentence': 'Illegal Crossings in Rio Grande Valley', 'stories_id': 16}, {'sentence': 'Then one of them smiled.', 'stories_id': 17}, {'sentence': 'National polling and midterm election geography point toward a larger House Republican majority, and perhaps a new majority in the Senate.', 'stories_id': 14}, {'sentence': 'The news of Mr. Colbert’s appointment inflamed conservative commentators like Rush Limbaugh who said CBS had “declared war on the heartland of America.”', 'stories_id': 21}, {'sentence': 'The suits brought by thousands of retired N.F.L. players were originally filed in states around the country over many months.', 'stories_id': 15}, {'sentence': '“I’m happy now,” she said.', 'stories_id': 17}, {'sentence': 'Republicans see more to lose among conservative core supporters and business donors with a wage increase than there might be to gain among swing voters who may not show up at the polls.', 'stories_id': 14}, {'sentence': 'Thursday’s arguments signaled the first time an appeals court had considered the issue since the Supreme Court handed two major victories to gay-rights supporters last summer, striking down a law that denied federal benefits to same-sex couples and clearing the way for same-sex marriages across California.', 'stories_id': 11}, {'sentence': 'Migrants crossing the South Texas border illegally in recent years are no longer primarily from Mexico, but from other countries, mainly in Central America.', 'stories_id': 16}, {'sentence': 'But CBS executives made it clear that they expected Mr. Colbert to broaden his appeal when he moved to the medium of late night on a network.', 'stories_id': 21}, {'sentence': '122,501', 'stories_id': 16}, {'sentence': 'Midterm election turnout is sure to be much lower than in a presidential year.', 'stories_id': 14}, {'sentence': 'They were eventually consolidated and heard in federal court in Philadelphia.', 'stories_id': 15}, {'sentence': 'Mr. Colbert has demonstrated that he can do more than political satire.', 'stories_id': 21}, {'sentence': 'It was a day freighted with emotion for gay-rights supporters and same-sex couples in Utah.', 'stories_id': 11}, {'sentence': 'Dozens flew to Denver from Utah to attend the arguments, lining up early Thursday morning for a seat in the courtroom.', 'stories_id': 11}, {'sentence': 'Advertisement', 'stories_id': 14}, {'sentence': 'That is a possible course of action for the N.H.L. cases, which so far only involve a few dozen players.', 'stories_id': 15}, {'sentence': '96,829', 'stories_id': 16}, {'sentence': 'He won a Grammy Award for his musical Christmas special, “A Colbert Christmas,” in 2009, and starred as Harry in a 2011 production of “Company” by the New York Philharmonic.', 'stories_id': 21}, {'sentence': 'His Comedy Central show has won three Emmy Awards for best writing for a variety show and two Peabody Awards.', 'stories_id': 21}, {'sentence': 'The two most recent complaints allege that the league was negligent in not doing more to warn players of the dangers of concussions and committed fraud by deliberately hiding information it did have about those dangers.', 'stories_id': 15}, {'sentence': '“They believe they’ll recapture the majority,” said Ed Pagano, who recently left a White House job as Senate liaison for Mr. Obama.', 'stories_id': 14}, {'sentence': 'Spike caused mostly by a large influx of Brazilians.', 'stories_id': 16}, {'sentence': 'A conservative state lawmaker was one of a handful of supporters of the ban to attend the hearing.', 'stories_id': 11}, {'sentence': 'Proving those allegations, though, could be just as difficult as it has been for the retired N.F.L. players.', 'stories_id': 15}, {'sentence': 'He is also a favorite of a wide range of other comedians, including the two men who will be his direct competitors.', 'stories_id': 21}, {'sentence': '“Our lives are on the line here,” said Derek Kitchen, the plaintiff who lent his last name to the case — Kitchen v. Herbert.', 'stories_id': 11}, {'sentence': '“Why would they want to upset the status quo?”', 'stories_id': 14}, {'sentence': 'MEXICO', 'stories_id': 16}, {'sentence': '57,624', 'stories_id': 16}, {'sentence': 'Republicans cite substantive reasons for holding back, too.', 'stories_id': 14}, {'sentence': 'Gary R. Herbert is Utah’s Republican governor.', 'stories_id': 11}, {'sentence': 'The N.H.L. players will have to provide evidence that the league purposely hid information about the dangers of fighting and hockey.', 'stories_id': 15}, {'sentence': 'Jimmy Fallon, the new host of NBC’s “Tonight” show, has described Mr. Colbert (who had a cameo on the premiere of Mr. Fallon’s show this year) as “a genius, the funniest man alive.”', 'stories_id': 21}, {'sentence': 'Jimmy Kimmel, who hosts ABC’s show, (and shares Mr. Dixon as an agent) posted on Twitter on Thursday: “a finer or funnier man I do not know.”', 'stories_id': 21}, {'sentence': 'The players will also have to show that injuries they received in the N.H.L. led to their current ailments.', 'stories_id': 15}, {'sentence': 'OTHER', 'stories_id': 16}, {'sentence': 'As Mr. Kitchen and the other plaintiffs chatted and exchanged reassuring pats on the shoulder in the courtroom, they were approached by Utah’s attorney general, Sean Reyes, whose office has taken the lead role in defending the same-sex marriage ban.', 'stories_id': 11}, {'sentence': 'Since the labor market is still soft and the economic recovery tepid, Republicans say they remain concerned about job losses from a higher minimum wage.', 'stories_id': 14}, {'sentence': 'Shaking hands and greeting the plaintiffs, Mr. Reyes crouched down and told them: “I’m sorry that we’re causing you pain.', 'stories_id': 11}, {'sentence': 'At the same time, many of the most populous states — including New York, California, New Jersey, Illinois and Ohio — have raised the minimum wage on their own.', 'stories_id': 14}, {'sentence': 'Mr. Colbert has been comfortable as a product pitchman on his show, integrating products ranging from Halls cough drops to Budweiser beer.', 'stories_id': 21}, {'sentence': 'COUNTRIES', 'stories_id': 16}, {'sentence': 'The plaintiffs “would like to have a settlement within the contours of what you’ve found in the attempted settlement with the N.F.L.,” said Mark Conrad, the director of the Sports Business Program at Fordham University.', 'stories_id': 15}, {'sentence': 'Sometime after the case is over, I hope we can sit down.”', 'stories_id': 11}, {'sentence': '10,742', 'stories_id': 16}, {'sentence': 'Although Mr. Obama has encouraged that trend, it paradoxically lessens the urgency of his call for Congress to act.', 'stories_id': 14}, {'sentence': '“But I wouldn’t underestimate the league’s ability to fight this.”', 'stories_id': 15}, {'sentence': 'Occasionally, he has segments that seem connected to branded entertainment deals, but actually parody the conventions of late-night television.', 'stories_id': 21}, {'sentence': 'Frequently those segments have been about Doritos snack chips.', 'stories_id': 21}, {'sentence': '“The administration imagines a pressure on Republicans that simply does not exist,” said Representative Tom Cole of Oklahoma.', 'stories_id': 14}, {'sentence': 'After the hearing, Mr. Reyes said he had told the plaintiffs that the legal confrontation was not personal, and that he knew that the plaintiffs’ families were as important to them as his own was to him.', 'stories_id': 11}, {'sentence': '’00', 'stories_id': 16}, {'sentence': 'Mr. Colbert also recently became a pitchman in actual commercials , for Wonderful pistachios.', 'stories_id': 21}, {'sentence': '’02', 'stories_id': 16}, {'sentence': 'But he said it was unclear what would happen to the unions and benefits of Utah’s newly married same-sex couples if the state prevailed in its appeals.', 'stories_id': 11}, {'sentence': '“It may exist in their coalition, but not ours.”', 'stories_id': 14}, {'sentence': '“Technology, air operations, ground units, that’s the complete package,” Chief Ortiz said.', 'stories_id': 16}, {'sentence': 'The Democratic coalition itself represents another political obstacle.', 'stories_id': 14}, {'sentence': 'The first two commercials were shown in February during the Super Bowl.', 'stories_id': 21}, {'sentence': 'Utah has previously raised the possibility that those marriages could be dissolved.', 'stories_id': 11}, {'sentence': 'Separately, in Indiana on Thursday, a federal judge ruled that the state must, for now, recognize the same-sex marriage of a woman who is terminally ill.', 'stories_id': 11}, {'sentence': 'In presidential years, Mr. Obama’s party relies heavily on young voters, minorities and unmarried women, all of whom would be disproportionately affected by a minimum-wage increase.', 'stories_id': 14}, {'sentence': 'The selection of Mr. Colbert will most likely push several rows of dominoes into action in late night.', 'stories_id': 21}, {'sentence': 'The new migrants head for South Texas because it is the shortest distance from Central America.', 'stories_id': 16}, {'sentence': 'Many young people ride across Mexico on top of freight trains, jumping off in Reynosa.', 'stories_id': 16}, {'sentence': 'The need to persuade more of those voters to turn out for this fall’s midterm elections encourages Democratic leaders to hold out for the president’s requested $2.85-an-hour increase rather than compromise, while pushing state-level minimum-wage ballot measures.', 'stories_id': 14}, {'sentence': 'Nikole Quasney and Amy Sandler have two children and joined one of five lawsuits challenging the state’s ban on same-sex marriage last month, citing the need to have their relationship legally recognized in order to access benefits for surviving family members.', 'stories_id': 11}, {'sentence': 'Comedy Central will need a host for its 11:31 p.m. show.', 'stories_id': 21}, {'sentence': 'Ms. Quasney received a diagnosis of ovarian cancer in 2009; the couple married in Massachusetts last year.', 'stories_id': 11}, {'sentence': 'The White House could yet succeed in bludgeoning Republican lawmakers to give in.', 'stories_id': 14}, {'sentence': 'The Rio Grande twists and winds, and those who make it across can quickly hide in sugar cane fields and orchards.', 'stories_id': 16}, {'sentence': 'Chris Hardwick, who is hosting a new late-night show on the channel, “@Midnight,” will surely be among those mentioned as a possibility to move up a half-hour.', 'stories_id': 21}, {'sentence': 'In many places it is a short sprint to shopping malls and suburban streets where smugglers pick up migrants to continue north.', 'stories_id': 16}, {'sentence': 'If Mr. Obama can inch up poll ratings for himself and for his health care law, Mr. Boehner and Mr. McConnell — who is up for re-election in Kentucky — may find accomplishing something with a Democratic president as useful as Mr. Lott and Mr. Gingrich once did.', 'stories_id': 14}, {'sentence': 'But that cable channel has recently added a number of hit shows with new performers, some of whom — Daniel Tosh, the team of Key and Peele, and Amy Schumer — could qualify for Mr. Colbert’s old post.', 'stories_id': 21}, {'sentence': 'If selected, Ms. Schumer could quell some of the criticism of late-night shows being too much a male preserve, just as Key and Peele might answer critics who charge it is too white.', 'stories_id': 21}, {'sentence': 'If the White House does not succeed, Mr. Obama could yet find a different, postelection path to a minimum-wage increase.', 'stories_id': 14}, {'sentence': 'Border Patrol officials said apprehensions were higher partly because they were catching many more of the illegal crossers.', 'stories_id': 16}, {'sentence': 'About 3,000 agents in the Rio Grande Valley — 495 new this year — patrol in helicopters and boats, on all-terrain vehicles and horseback.', 'stories_id': 16}, {'sentence': 'CBS will face questions about its own host-in-waiting, Craig Ferguson, whose contract concludes at the end of this year.', 'stories_id': 21}, {'sentence': 'The common thread of the last three wage increases is a president of one party forging agreement with a Congress controlled by the other.', 'stories_id': 14}, {'sentence': '“Our system works best” for bipartisan compromise with that alignment, Mr. Fratto said.', 'stories_id': 14}, {'sentence': 'If Mr. Ferguson decides to leave, the network will be seeking another host for its 12:35 a.m. show.', 'stories_id': 21}, {'sentence': 'Drones and aerostat blimps are watching from the sky.', 'stories_id': 16}, {'sentence': 'Under a new strategy, border agencies are working with federal drug agents, the F.B.I. and Texas police to break up Mexican smuggling organizations by prosecuting operatives on this side of the border.', 'stories_id': 16}, {'sentence': '“No decision has been made about 12:35,” Mr. Moonves said.', 'stories_id': 21}, {'sentence': 'If Republicans emerge in 2015 with control of both the House and the Senate, they might even find a motivation to make a deal.', 'stories_id': 14}, {'sentence': 'But whereas Mexicans can be swiftly returned by the Border Patrol, migrants from noncontiguous countries must be formally deported and flown home by other agencies.', 'stories_id': 16}, {'sentence': '“We’re in discussions.', 'stories_id': 21}, {'sentence': 'Even though federal flights are leaving South Texas every day, Central Americans are often detained longer.', 'stories_id': 16}, {'sentence': 'Our pat answer is, Let us deal with one hour at a time.”', 'stories_id': 21}, {'sentence': 'Women with children are detained separately.', 'stories_id': 16}, {'sentence': 'The main hour is dealt with for the long term, Mr. Moonves said.', 'stories_id': 21}, {'sentence': '“This is like a 20-year decision.', 'stories_id': 21}, {'sentence': 'But because the nearest facility for “family units” is in Pennsylvania, families apprehended in the Rio Grande Valley are likely to be released while their cases proceed, a senior deportations official said.', 'stories_id': 16}, {'sentence': 'I’m confident I made the right one.”', 'stories_id': 21}, {'sentence': 'Minors without parents are turned over to the Department of Health and Human Services, which holds them in shelters that provide medical care and schooling and tries to send them to relatives in the United States.', 'stories_id': 16}, {'sentence': 'The authorities here are expecting 35,000 unaccompanied minors this year, triple the number two years ago.', 'stories_id': 16}, {'sentence': 'Under asylum law, border agents are required to ask migrants if they are afraid of returning to their countries.', 'stories_id': 16}, {'sentence': 'If the answer is yes, migrants must be detained until an immigration officer interviews them to determine if the fear is credible.', 'stories_id': 16}, {'sentence': 'If the officer concludes it is, the migrant can petition for asylum.', 'stories_id': 16}, {'sentence': 'An immigration judge will decide whether there is a “well-founded fear of persecution” based on race, religion, nationality, political opinion or “membership in a particular social group.”', 'stories_id': 16}, {'sentence': 'Immigration officials said they had set the bar intentionally low for the initial “credible fear” test, to avoid turning away a foreigner in danger.', 'stories_id': 16}, {'sentence': 'In 2013, 85 percent of fear claims were found to be credible, according to federal figures.', 'stories_id': 16}, {'sentence': 'As more Central Americans have come, fear claims have spiked, more than doubling in 2013 to 36,026 from 13,931 in 2012.', 'stories_id': 16}, {'sentence': 'united states', 'stories_id': 16}, {'sentence': 'TEXAS', 'stories_id': 16}, {'sentence': 'Rio Grande', 'stories_id': 16}, {'sentence': 'Hidalgo', 'stories_id': 16}, {'sentence': 'Mexico', 'stories_id': 16}, {'sentence': 'Honduras', 'stories_id': 16}, {'sentence': 'Guatemala', 'stories_id': 16}, {'sentence': 'El Salvador', 'stories_id': 16}, {'sentence': '500 miles', 'stories_id': 16}, {'sentence': 'The chances have not improved much to win asylum in the end, however.', 'stories_id': 16}, {'sentence': 'In 2012, immigration courts approved 34 percent of asylum petitions from migrants facing deportation — 2,888 cases nationwide.', 'stories_id': 16}, {'sentence': 'Many Central Americans say they are fleeing extortion or forced recruitment by criminal gangs.', 'stories_id': 16}, {'sentence': 'But immigration courts have rarely recognized those threats as grounds for asylum.', 'stories_id': 16}, {'sentence': 'Yet because of immense backlogs in the courts — with the average wait for a hearing currently at about 19 months — claiming fear of return has allowed some Central Americans to prolong their time in the United States.', 'stories_id': 16}, {'sentence': 'At the big immigration detention center at Port Isabel, which serves much of the Rio Grande Valley, half of about 1,100 detainees at any given time are asylum seekers, officials said.', 'stories_id': 16}, {'sentence': 'With the asylum system already stretched, the nearest officers are in Houston, doing interviews by video conference.', 'stories_id': 16}, {'sentence': 'In 2013, the closest immigration court, in Harlingen, was swamped with new cases, becoming even more backlogged.', 'stories_id': 16}, {'sentence': 'Detention beds fill up, and migrants deemed to present no security risk are released under supervision, officials said, with their next court hearing often more than a year away.', 'stories_id': 16}, {'sentence': 'At their now teeming front-line stations along the river, Border Patrol officials readily admit they are not set up to hold migrants for long.', 'stories_id': 16}, {'sentence': 'Agents and migrants alike refer to the cells there as “hieleras” — freezers.', 'stories_id': 16}, {'sentence': 'In cinder-block rooms with concrete benches and a toilet in the corner, there are no chairs, beds, showers or hot food.', 'stories_id': 16}, {'sentence': 'On a recent day, migrants caked in river mud were packed shoulder to shoulder, many on the floor, trying to warm up in space blankets the Border Patrol provides.', 'stories_id': 16}, {'sentence': 'Some held their fingers to their lips to signal hunger.', 'stories_id': 16}, {'sentence': 'But agents said they have accelerated their work so more migrants are deported directly from Border Patrol stations in as little as two days.', 'stories_id': 16}, {'sentence': 'Officials said few migrants — only 4 percent — claim fear of returning when they are with the Border Patrol.', 'stories_id': 16}, {'sentence': 'Rather, migrants are claiming fear after they are sent to longer-term detention centers like Port Isabel, leading officials to suspect they have been coached by other detainees.', 'stories_id': 16}, {'sentence': 'But lawyers for asylum seekers said migrants frequently report that Border Patrol agents never asked them about their concerns, or that they were too exhausted or intimidated to express them in the hours after being caught.', 'stories_id': 16}, {'sentence': '“A lot of times these people had very real, legitimate fears,” said Kimi Jackson, director of the South Texas Pro Bono Asylum Representation Project, known as ProBAR.', 'stories_id': 16}, {'sentence': '“But it seems to them they were not asked the questions by the Border Patrol in the type of situation where they could talk freely.”', 'stories_id': 16}, {'sentence': 'On a helicopter with its pilot and a Border Patrol chief, a reporter and photographer for The New York Times watched migrants crossing the Rio Grande and turning themselves in.', 'stories_id': 16}, {'sentence': 'Lawyers said officials had started to make it far harder for migrants to win release by requiring many more to post bond, with rates rising to as high as $10,000.', 'stories_id': 16}, {'sentence': 'That news had not reached migrants at a shelter run by nuns in Reynosa.', 'stories_id': 16}, {'sentence': 'Several said they were heading to the United States to seek “asilo.”', 'stories_id': 16}, {'sentence': 'They could say truthfully they were afraid to go home.', 'stories_id': 16}, {'sentence': 'Luis Fernando Herrera Perdomo, 19, said he fled Honduras after gang members shot and killed a brother who was sleeping in the bed next to his.', 'stories_id': 16}, {'sentence': 'A 29-year-old former soldier from El Salvador, who asked to be identified only as Jesús, said he left his wife and three children to escape a gang that came gunning for him because he arrested some of its members while in the army.', 'stories_id': 16}, {'sentence': 'In Reynosa, the dangers had only multiplied.', 'stories_id': 16}, {'sentence': 'José Rubén Hernández, 32, said he had been kidnapped for two weeks while Mexican smugglers extorted $10,000 in ransom from his frantic family in Honduras.', 'stories_id': 16}, {'sentence': '“We are a gold mine for the cartels,” he said.', 'stories_id': 16}, {'sentence': 'Other migrants had been imprisoned in a smugglers’ stash house until Mexican military troops stormed it to free them.', 'stories_id': 16}, {'sentence': 'Two Hondurans who had just arrived at the shelter displayed new bruises, saying they had been beaten that morning in a rail yard by smugglers associated with the Zetas, a brutal Mexican cartel.', 'stories_id': 16}, {'sentence': 'But the migrants still intended to hire new smugglers and try to cross.', 'stories_id': 16}, {'sentence': '“I’m still alive and I have faith in God, so I will try to make it over to the other side,” Mr. Herrera said.', 'stories_id': 16}, {'sentence': 'Chief Ortiz said agents were speeding deportations to change the message reaching Central America.', 'stories_id': 16}, {'sentence': '“It cost the migrant an awful lot of money and time and effort to get here,” he said.', 'stories_id': 16}, {'sentence': '“If I send somebody back to Guatemala or Honduras, chances are they’re going to sit there and say, ‘You know what, I don’t think I’m going to try this again.’ ”', 'stories_id': 16}, {'sentence': '“The word may get out,” he said.', 'stories_id': 16}] \ No newline at end of file diff --git a/mediacloud/mediawords/util/topic_modeling/sample_stories_5.txt b/mediacloud/mediawords/util/topic_modeling/sample_stories_5.txt new file mode 100644 index 0000000000..4cdb270304 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/sample_stories_5.txt @@ -0,0 +1 @@ +[{'stories_id': 15, 'sentence': 'Nine former professional hockey players have filed a lawsuit against the N.H.L. that says the league “intentionally created, fostered and promoted a culture of extreme violence.”'}, {'stories_id': 13, 'sentence': 'Sebelius Resigns After Troubles Over Health Site.'}, {'stories_id': 14, 'sentence': 'WASHINGTON — Each of the three previous presidents — two Republicans, one Democrat — signed an increase in the federal minimum wage.'}, {'stories_id': 16, 'sentence': 'Smugglers sent migrants across the Rio Grande at a point where the water is too shallow for Border Patrol boats that might have turned them back safely at the midriver boundary between the United States and Mexico.'}, {'stories_id': 11, 'sentence': 'U.S. | U.S. Court Seems Split on Utah Gay Marriage Ban'}, {'stories_id': 13, 'sentence': 'Ending a stormy five-year tenure marred by the disastrous rollout of President Obama’s signature legislative achievement, Kathleen Sebelius is resigning as secretary of health and human services.'}, {'stories_id': 15, 'sentence': 'The suit, which was filed Wednesday in federal court in Manhattan, is the latest in a growing string of challenges to the N.H.L. Similar to suits brought by retired N.F.L. players , the complaint said that the N.H.L. failed to take adequate steps to warn the players of the dangers of the sport and deliberately promoted violence for profit.'}, {'stories_id': 16, 'sentence': 'HIDALGO, Tex. — Border Patrol agents in olive uniforms stood in broad daylight on the banks of the Rio Grande, while on the Mexican side smugglers pulled up in vans and unloaded illegal migrants.'}, {'stories_id': 11, 'sentence': 'U.S. Court Seems Split on Utah Gay Marriage Ban'}, {'stories_id': 14, 'sentence': 'Why not President Obama?'}, {'stories_id': 11, 'sentence': 'DENVER — The push for same-sex marriage, which has celebrated victory after victory in courtrooms across the country, entered an uncertain stage on Thursday as a federal appeals court appeared divided about whether the socially conservative state of Utah could limit marriage to a man and a woman.'}, {'stories_id': 16, 'sentence': 'The agents were clearly visible on that recent afternoon, but the migrants were undeterred.'}, {'stories_id': 15, 'sentence': 'The complaint is more graphic than other suits brought by former hockey players, highlighting the role of enforcers in the N.H.L. over many years and mentioning movies that celebrated fighting in hockey.'}, {'stories_id': 14, 'sentence': 'Given Mr. Obama’s emphasis on income inequality, and the popularity of an increase in opinion polls, you would think he would.'}, {'stories_id': 16, 'sentence': 'Mainly women and children, 45 in all, they crossed the narrow river on the smugglers’ rafts, scrambled up the bluff and turned themselves in, signaling a growing challenge for the immigration authorities.'}, {'stories_id': 14, 'sentence': 'But the story of recent increases underscores the indispensable ingredient he so far lacks: a Republican leader strongly motivated to make a deal over the party’s philosophical objections.'}, {'stories_id': 11, 'sentence': 'In an hour of arguments inside a packed courtroom, three judges from the Federal Court of Appeals for the 10th Circuit sparred with lawyers about how such bans affected the children of same-sex parents and whether preventing gay couples from marrying actually did anything to promote or strengthen heterosexual unions and families.'}, {'stories_id': 15, 'sentence': '“Through the sophisticated use of extreme violence as a commodity, from which the N.H.L. has generated billions of dollars, the N.H.L. has subjected and continues to subject its players to the imminent risk of head trauma and, as a result, devastating and long-term negative health consequences,” the lawsuit said.'}, {'stories_id': 14, 'sentence': 'In 1989, it was a new Republican in the White House.'}, {'stories_id': 15, 'sentence': 'The plaintiffs in the suit are the former players Dan LaCouture, 36; Dan Keczmer, 45; Jack Carlson, 59; Richard Brennan, 41; Brad Maxwell, 56; Michael Peluso, 48; Tom Younghans, 61; Allan Rourke, 34; and Scott Bailey, 41.'}, {'stories_id': 11, 'sentence': 'Related Coverage'}, {'stories_id': 16, 'sentence': 'After six years of steep declines across the Southwest, illegal crossings have soared in South Texas while remaining low elsewhere.'}, {'stories_id': 14, 'sentence': 'President George Bush, while campaigning to succeed Ronald Reagan, had promised “a kinder, gentler America.”'}, {'stories_id': 16, 'sentence': 'The Border Patrol made more than 90,700 apprehensions in the Rio Grande Valley in the past six months, a 69 percent increase over last year.'}, {'stories_id': 15, 'sentence': 'Some of the players were brawlers, like Carlson, who racked up 1,111 penalty minutes in the N.H.L. and the World Hockey Association.'}, {'stories_id': 11, 'sentence': 'U.S. to Recognize 1,300 Marriages Disputed by Utah JAN.'}, {'stories_id': 15, 'sentence': 'He was supposed to play the third Hanson brother in the 1977 movie “Slap Shot,” but was called up from the minors just before filming began.'}, {'stories_id': 16, 'sentence': 'The migrants are no longer primarily Mexican laborers.'}, {'stories_id': 14, 'sentence': 'The Democrats then controlling both houses of Congress set out to take him up on it.'}, {'stories_id': 11, 'sentence': '10, 2014'}, {'stories_id': 16, 'sentence': 'Instead they are Central Americans, including many families with small children and youngsters without their parents, who risk a danger-filled journey across Mexico.'}, {'stories_id': 11, 'sentence': 'Justices’ Halt to Gay Marriage Leaves Utah Couples in Limbo JAN.'}, {'stories_id': 15, 'sentence': 'Peluso, who played in the N.H.L. from 1990 to 1998, led the league with 408 penalty minutes in 1991-92 and fought, according to the suit, 179 times in his nine-year career.'}, {'stories_id': 14, 'sentence': 'Mr. Bush drove a hard bargain on the minimum wage.'}, {'stories_id': 15, 'sentence': 'In a 2011 interview with juniorhockey.com , Peluso said he was suffering concussion-related seizures and depression in retirement and complained about poor pension benefits and health insurance from the N.H.L. and the N.H.L. Players’ Association.'}, {'stories_id': 16, 'sentence': 'Driven out by deepening poverty but also by rampant gang violence, increasing numbers of migrants caught here seek asylum, setting off lengthy legal procedures to determine whether they qualify.'}, {'stories_id': 11, 'sentence': '6, 2014'}, {'stories_id': 14, 'sentence': 'He vetoed the first version Congress sent on grounds that it raised the wage by 30 cents an hour too much.'}, {'stories_id': 15, 'sentence': '“ There is no question in my mind that brain injuries and depression are linked,” Peluso told the website.'}, {'stories_id': 11, 'sentence': 'Federal Judge Rules That Same-Sex Marriage Is Legal in Utah DEC.'}, {'stories_id': 16, 'sentence': 'The new migrant flow, largely from El Salvador, Guatemala and Honduras, is straining resources and confounding Obama administration security strategies that work effectively in other regions.'}, {'stories_id': 14, 'sentence': 'But he eventually accepted a two-stage increase to $4.25 an hour on the condition that lawmakers include a lower “training wage” for teenagers.'}, {'stories_id': 15, 'sentence': '“However, I find it has more to do with low self-esteem.'}, {'stories_id': 16, 'sentence': 'It is further complicating President Obama’s uphill push on immigration, fueling Republican arguments for more border security before any overhaul.'}, {'stories_id': 14, 'sentence': 'In 1996, it was a new Republican Senate leader.'}, {'stories_id': 11, 'sentence': '20, 2013'}, {'stories_id': 16, 'sentence': 'With detention facilities, asylum offices and immigration courts overwhelmed, enough migrants have been released temporarily in the United States that back home in Central America people have heard that those who make it to American soil have a good chance of staying.'}, {'stories_id': 14, 'sentence': 'Trent Lott took over after Bob Dole, then running for president against the incumbent Democrat, Bill Clinton, resigned his Senate seat.'}, {'stories_id': 15, 'sentence': 'Former skilled guys suffered head injuries, and they don’t appear to be suicidal.”'}, {'stories_id': 11, 'sentence': 'The three judges — two appointed by Republican presidents and one by a Democrat — focused on fine-grained issues, like what level of judicial scrutiny to apply to the case, as well as more profound questions of how to define a marriage and whether state bans on same-sex nuptials were akin to those against polygamy, or instead fundamentally violated the constitutional rights of same-sex couples.'}, {'stories_id': 14, 'sentence': 'Mr. Clinton, who had battled fiercely with the House speaker, Newt Gingrich, and Mr. Dole, emerged with the upper hand after a government shutdown.'}, {'stories_id': 15, 'sentence': 'LaCouture, who retired in 2009, was involved in 52 fights.'}, {'stories_id': 11, 'sentence': 'While the decision will reverberate across Utah, it will hardly be the last word on whether same-sex couples have the same rights to marry as heterosexuals.'}, {'stories_id': 16, 'sentence': '“Word has gotten out that we’re giving people permission and walking them out the door,” said Chris Cabrera, a Border Patrol agent who is vice president of the local of the National Border Patrol Council, the agents’ union.'}, {'stories_id': 15, 'sentence': 'The lawsuit said he sustained a concussion at the hands of Robyn Regehr in 2004 and suffered from headaches, irritability, sensitivity to light, charge of personality and depression.'}, {'stories_id': 11, 'sentence': 'Next week, the same appeals court is scheduled to hear arguments over Oklahoma’s ban on same-sex marriage, which a federal judge declared unconstitutional in January.'}, {'stories_id': 14, 'sentence': 'Mr. Lott sought to get the legislative wheels turning again to help Republicans preserve their Senate majority in that November’s elections.'}, {'stories_id': 16, 'sentence': '“So they’re coming across in droves.”'}, {'stories_id': 16, 'sentence': 'In Mexican border cities like Reynosa, just across the river, migrants have become easy prey for Mexican drug cartels that have seized control of the human smuggling business, heightening perils for illegal crossers and security risks for the United States.'}, {'stories_id': 15, 'sentence': 'Other players in the lawsuit were journeymen who rarely fought, like Rourke, who, according to hockeyfights.com, had 17 fights in his 14-year major junior and professional career.'}, {'stories_id': 14, 'sentence': '“The thing that was completely balled up in the spokes was the minimum wage,” Mr. Lott, now a lobbyist, recalled in an interview.'}, {'stories_id': 11, 'sentence': 'Marriage cases in several other states, including Virginia and Texas, are percolating through the courts, and the Supreme Court is widely expected to tackle the issue.'}, {'stories_id': 14, 'sentence': 'So he shepherded a two-stage rise to $5.15 an hour that included some tax breaks for businesses.'}, {'stories_id': 15, 'sentence': 'Keczmer never accumulated more than 75 minutes in penalties in a season during his 10-year professional career and, according to the suit, had one N.H.L. fight.'}, {'stories_id': 16, 'sentence': 'At the Rio Grande that afternoon, the smugglers calculatedly sent the migrants across at a point where the water is too shallow for Border Patrol boats that might have turned them back safely at the midriver boundary between the United States and Mexico.'}, {'stories_id': 11, 'sentence': 'In December, Utah briefly became the 18th state to legalize same-sex marriage when a federal judge in the state tossed out a voter-approved ban on such nuptials, one of about 11 similar prohibitions that passed in 2004.'}, {'stories_id': 11, 'sentence': 'The judge, who was appointed by President Obama with support from conservative Utah politicians, said the ban violated the “fundamental right” of same-sex couples to marry.'}, {'stories_id': 16, 'sentence': 'A Border Patrol chief, Raul Ortiz, watched in frustration from a helicopter overhead.'}, {'stories_id': 14, 'sentence': 'Mr. Lott got what he wanted.'}, {'stories_id': 15, 'sentence': 'The case will be heard by United States District Judge Shira A. Scheindlin, who ruled that New York City’s stop-and-frisk policy should be overturned and then was removed from the case by the Court of Appeals.'}, {'stories_id': 11, 'sentence': 'The prohibition, Judge Robert J. Shelby wrote, violated guarantees of due process and equal protection in the Constitution.'}, {'stories_id': 14, 'sentence': 'Two days after signing the minimum-wage increase, Mr. Clinton signed a separate compromise overhauling the welfare system.'}, {'stories_id': 16, 'sentence': '“Somebody probably told them they’re going to get released,” he said.'}, {'stories_id': 15, 'sentence': 'This is the latest lawsuit involving violence in the N.H.L. In May, the family of Derek Boogaard filed a wrongful-death lawsuit against the N.H.L., saying the league was responsible for the physical trauma and brain damage Boogaard sustained in six seasons as one of the league’s top enforcers.'}, {'stories_id': 16, 'sentence': 'As agents booked them, the migrants waited quietly: a Guatemalan mother carrying a toddler with a baby bottle, another with an infant wrapped in blankets.'}, {'stories_id': 15, 'sentence': 'Boogaard died of an accidental overdose of prescription painkillers and alcohol in 2011.'}, {'stories_id': 14, 'sentence': 'Republicans added two seats to their Senate majority in November.'}, {'stories_id': 11, 'sentence': 'His ruling touched off 17 days of legal chaos as hundreds of same-sex couples poured into county clerks’ offices across the state to wed while Utah officials scrambled to stop them and put a halt to the marriages.'}, {'stories_id': 15, 'sentence': 'In November, a group of players who were in the league in the 1970s, ’80s and ’90s, filed a lawsuit in federal court in Washington, saying N.H.L. officials should have done more to address head injuries but instead celebrated a culture of speed and violence.'}, {'stories_id': 11, 'sentence': 'By the time the Supreme Court intervened and issued a stay in the case — effectively suspending the Utah judge’s ruling and temporarily reinstating the ban — more than 1,000 same-sex couples had married, and many had changed their names, signed up for spousal health insurance and taken steps to become legal parents of children they were raising.'}, {'stories_id': 14, 'sentence': 'Eleven years later, President George W. Bush was the Republican in need.'}, {'stories_id': 16, 'sentence': 'A 9-year-old girl said she was traveling by herself, hoping to rejoin her mother and two brothers in Louisiana.'}, {'stories_id': 16, 'sentence': 'But she did not know where in Louisiana they were.'}, {'stories_id': 11, 'sentence': 'On Thursday, Peggy A. Tomsic, a lawyer for the three same-sex couples who had gone to court against Utah, argued in the courtroom here that the state’s ban stigmatized same-sex couples, denying them a fundamental right for no valid reason.'}, {'stories_id': 14, 'sentence': 'Raising the minimum wage did not come naturally to Mr. Bush, who promoted a more conservative, market-oriented economic agenda than his father.'}, {'stories_id': 15, 'sentence': 'The long-delayed civil suit of the former Colorado Avalanche player Steve Moore against Todd Bertuzzi and the Vancouver Canucks is scheduled to be heard in an Ontario court in September.'}, {'stories_id': 11, 'sentence': 'Gene C. Schaerr, a lawyer for Utah, argued that the state’s residents had the right to limit marriages to exclude same-sex couples, and said that redefining it as “genderless” posed risks to a traditional view of the institution.'}, {'stories_id': 14, 'sentence': 'But by 2007, he had suffered grievous political damage from setbacks in the Iraq war and his administration’s handling of Hurricane Katrina.'}, {'stories_id': 15, 'sentence': 'Bertuzzi attacked Moore during a game in March 2004, breaking three of Moore’s neck vertebrae and ending his career.'}, {'stories_id': 16, 'sentence': 'After a two-week journey from Honduras, her only connection to them was one telephone number on a scrap of paper.'}, {'stories_id': 14, 'sentence': 'Democrats, who had recaptured both houses of Congress in the 2006 midterm elections, set out to end the longest period without a minimum wage increase since its inception during the administration of President Franklin D. Roosevelt.'}, {'stories_id': 11, 'sentence': 'The judges fired a barrage of skeptical questions, questioning the state’s legal team on whether banning same-sex marriage was akin to outlawing interracial unions, and skeptically asking the plaintiffs whether the state was not entitled to set its own definitions of marriage.'}, {'stories_id': 15, 'sentence': 'Bill Daly, deputy commissioner of the N.H.L., said that this week’s suit did not appear to be substantively different from the one filed in November.'}, {'stories_id': 16, 'sentence': 'A Honduran woman said the group had followed the instructions of the Mexican smugglers.'}, {'stories_id': 15, 'sentence': '“In short, we are completely satisfied with our record on Player Safety, including as it relates to head injuries and brain trauma,” he said in a statement.'}, {'stories_id': 11, 'sentence': 'Advertisement'}, {'stories_id': 16, 'sentence': '“They just told us to cross and start walking,” she said.'}, {'stories_id': 14, 'sentence': '“We were getting jammed,” said Tony Fratto, then a White House aide.'}, {'stories_id': 16, 'sentence': 'Other migrants were trying to elude the Border Patrol, and within the hour Chief Ortiz saw his interdiction efforts working according to plan.'}, {'stories_id': 14, 'sentence': 'Mr. Bush ultimately accepted a two-stage increase to $7.25 an hour as part of a bill to finance his most urgent priority, the Iraq war.'}, {'stories_id': 15, 'sentence': '“We do not believe the new Complaint provides any valid basis for liability or damages as against the National Hockey League and we intend to defend the case and others that may follow it vigorously.”'}, {'stories_id': 11, 'sentence': 'Judge Paul J. Kelly, who was nominated by President George Bush, appeared more deferential to Utah’s voters and its Legislature while Judge Carlos F. Lucero, a Clinton appointee, asked pointed questions about whether Utah was stigmatizing children of gay couples.'}, {'stories_id': 16, 'sentence': 'A short way upriver in deeper water, agents radioed that they had turned back a raft with eight “bodies.”'}, {'stories_id': 15, 'sentence': 'The N.H.L. set up a concussion study program in 1997, the first in North American major league sports, and has in recent years modified rules in response to increased concern about head trauma.'}, {'stories_id': 14, 'sentence': 'Now Mr. Obama seeks a Republican partner.'}, {'stories_id': 11, 'sentence': 'Legal observers said the deciding vote appeared to belong to Judge Jerome A. Holmes, who was nominated by President George W. Bush, and lofted tough questions at both sides.'}, {'stories_id': 15, 'sentence': 'But the lawsuit calls those moves “untimely and ineffective.”'}, {'stories_id': 11, 'sentence': '“Why does it matter who’s claiming the right?”'}, {'stories_id': 14, 'sentence': 'The minimum wage remains at $7.25 — in inflation-adjusted terms, more than $2 below where it stood 40 years ago.'}, {'stories_id': 16, 'sentence': 'Moments later a surveillance blimp cruising nearby detected people lying under dense brush.'}, {'stories_id': 14, 'sentence': 'But there is little sign that any critical mass of Republicans wants to make it happen, much less Speaker John A. Boehner or the Senate minority leader, Mitch McConnell.'}, {'stories_id': 16, 'sentence': 'As the helicopter swooped low, the pilot spotted sneakers at the base of the trees.'}, {'stories_id': 15, 'sentence': 'Lawyers representing the players did not return calls for comment about their strategy, though their complaint includes mentions of the Broad Street Bullies of the 1970s Philadelphia Flyers, movies like “The Last Gladiators” and “Mystery, Alaska,” and the recent bench-clearing brawl in a charity hockey game between New York City firefighters and policemen.'}, {'stories_id': 11, 'sentence': 'Judge Holmes asked a lawyer representing Utah.'}, {'stories_id': 11, 'sentence': '“It’s a fundamental right, and why does it matter the participants in that enterprise?'}, {'stories_id': 14, 'sentence': 'Their position does not surprise Democrats in Congress and the White House.'}, {'stories_id': 15, 'sentence': '(The complaint also mistakenly said that Gordie Howe, who is still alive, died in 2009.)'}, {'stories_id': 16, 'sentence': 'Agents on the ground flushed out nine migrants, all men.'}, {'stories_id': 16, 'sentence': 'Illegal Crossings in Rio Grande Valley'}, {'stories_id': 11, 'sentence': 'Why does it matter?”'}, {'stories_id': 15, 'sentence': 'While the complaint seeks a jury trial, legal experts said the players would prefer to settle out of court, no doubt aware of the $765 million proposed settlement between the N.F.L. and retired football players.'}, {'stories_id': 14, 'sentence': 'National polling and midterm election geography point toward a larger House Republican majority, and perhaps a new majority in the Senate.'}, {'stories_id': 14, 'sentence': 'Republicans see more to lose among conservative core supporters and business donors with a wage increase than there might be to gain among swing voters who may not show up at the polls.'}, {'stories_id': 11, 'sentence': 'Thursday’s arguments signaled the first time an appeals court had considered the issue since the Supreme Court handed two major victories to gay-rights supporters last summer, striking down a law that denied federal benefits to same-sex couples and clearing the way for same-sex marriages across California.'}, {'stories_id': 16, 'sentence': 'Migrants crossing the South Texas border illegally in recent years are no longer primarily from Mexico, but from other countries, mainly in Central America.'}, {'stories_id': 15, 'sentence': 'The suits brought by thousands of retired N.F.L. players were originally filed in states around the country over many months.'}, {'stories_id': 16, 'sentence': '122,501'}, {'stories_id': 14, 'sentence': 'Midterm election turnout is sure to be much lower than in a presidential year.'}, {'stories_id': 15, 'sentence': 'They were eventually consolidated and heard in federal court in Philadelphia.'}, {'stories_id': 11, 'sentence': 'It was a day freighted with emotion for gay-rights supporters and same-sex couples in Utah.'}, {'stories_id': 16, 'sentence': '96,829'}, {'stories_id': 11, 'sentence': 'Dozens flew to Denver from Utah to attend the arguments, lining up early Thursday morning for a seat in the courtroom.'}, {'stories_id': 14, 'sentence': 'Advertisement'}, {'stories_id': 15, 'sentence': 'That is a possible course of action for the N.H.L. cases, which so far only involve a few dozen players.'}, {'stories_id': 11, 'sentence': 'A conservative state lawmaker was one of a handful of supporters of the ban to attend the hearing.'}, {'stories_id': 16, 'sentence': 'Spike caused mostly by a large influx of Brazilians.'}, {'stories_id': 15, 'sentence': 'The two most recent complaints allege that the league was negligent in not doing more to warn players of the dangers of concussions and committed fraud by deliberately hiding information it did have about those dangers.'}, {'stories_id': 14, 'sentence': '“They believe they’ll recapture the majority,” said Ed Pagano, who recently left a White House job as Senate liaison for Mr. Obama.'}, {'stories_id': 14, 'sentence': '“Why would they want to upset the status quo?”'}, {'stories_id': 15, 'sentence': 'Proving those allegations, though, could be just as difficult as it has been for the retired N.F.L. players.'}, {'stories_id': 16, 'sentence': 'MEXICO'}, {'stories_id': 11, 'sentence': '“Our lives are on the line here,” said Derek Kitchen, the plaintiff who lent his last name to the case — Kitchen v. Herbert.'}, {'stories_id': 15, 'sentence': 'The N.H.L. players will have to provide evidence that the league purposely hid information about the dangers of fighting and hockey.'}, {'stories_id': 14, 'sentence': 'Republicans cite substantive reasons for holding back, too.'}, {'stories_id': 11, 'sentence': 'Gary R. Herbert is Utah’s Republican governor.'}, {'stories_id': 16, 'sentence': '57,624'}, {'stories_id': 15, 'sentence': 'The players will also have to show that injuries they received in the N.H.L. led to their current ailments.'}, {'stories_id': 11, 'sentence': 'As Mr. Kitchen and the other plaintiffs chatted and exchanged reassuring pats on the shoulder in the courtroom, they were approached by Utah’s attorney general, Sean Reyes, whose office has taken the lead role in defending the same-sex marriage ban.'}, {'stories_id': 14, 'sentence': 'Since the labor market is still soft and the economic recovery tepid, Republicans say they remain concerned about job losses from a higher minimum wage.'}, {'stories_id': 16, 'sentence': 'OTHER'}, {'stories_id': 15, 'sentence': 'The plaintiffs “would like to have a settlement within the contours of what you’ve found in the attempted settlement with the N.F.L.,” said Mark Conrad, the director of the Sports Business Program at Fordham University.'}, {'stories_id': 14, 'sentence': 'At the same time, many of the most populous states — including New York, California, New Jersey, Illinois and Ohio — have raised the minimum wage on their own.'}, {'stories_id': 16, 'sentence': 'COUNTRIES'}, {'stories_id': 11, 'sentence': 'Shaking hands and greeting the plaintiffs, Mr. Reyes crouched down and told them: “I’m sorry that we’re causing you pain.'}, {'stories_id': 16, 'sentence': '10,742'}, {'stories_id': 15, 'sentence': '“But I wouldn’t underestimate the league’s ability to fight this.”'}, {'stories_id': 14, 'sentence': 'Although Mr. Obama has encouraged that trend, it paradoxically lessens the urgency of his call for Congress to act.'}, {'stories_id': 11, 'sentence': 'Sometime after the case is over, I hope we can sit down.”'}, {'stories_id': 14, 'sentence': '“The administration imagines a pressure on Republicans that simply does not exist,” said Representative Tom Cole of Oklahoma.'}, {'stories_id': 16, 'sentence': '’00'}, {'stories_id': 11, 'sentence': 'After the hearing, Mr. Reyes said he had told the plaintiffs that the legal confrontation was not personal, and that he knew that the plaintiffs’ families were as important to them as his own was to him.'}, {'stories_id': 14, 'sentence': '“It may exist in their coalition, but not ours.”'}, {'stories_id': 16, 'sentence': '’02'}, {'stories_id': 11, 'sentence': 'But he said it was unclear what would happen to the unions and benefits of Utah’s newly married same-sex couples if the state prevailed in its appeals.'}, {'stories_id': 11, 'sentence': 'Utah has previously raised the possibility that those marriages could be dissolved.'}, {'stories_id': 16, 'sentence': '“Technology, air operations, ground units, that’s the complete package,” Chief Ortiz said.'}, {'stories_id': 14, 'sentence': 'The Democratic coalition itself represents another political obstacle.'}, {'stories_id': 14, 'sentence': 'In presidential years, Mr. Obama’s party relies heavily on young voters, minorities and unmarried women, all of whom would be disproportionately affected by a minimum-wage increase.'}, {'stories_id': 16, 'sentence': 'The new migrants head for South Texas because it is the shortest distance from Central America.'}, {'stories_id': 11, 'sentence': 'Separately, in Indiana on Thursday, a federal judge ruled that the state must, for now, recognize the same-sex marriage of a woman who is terminally ill.'}, {'stories_id': 14, 'sentence': 'The need to persuade more of those voters to turn out for this fall’s midterm elections encourages Democratic leaders to hold out for the president’s requested $2.85-an-hour increase rather than compromise, while pushing state-level minimum-wage ballot measures.'}, {'stories_id': 16, 'sentence': 'Many young people ride across Mexico on top of freight trains, jumping off in Reynosa.'}, {'stories_id': 11, 'sentence': 'Nikole Quasney and Amy Sandler have two children and joined one of five lawsuits challenging the state’s ban on same-sex marriage last month, citing the need to have their relationship legally recognized in order to access benefits for surviving family members.'}, {'stories_id': 11, 'sentence': 'Ms. Quasney received a diagnosis of ovarian cancer in 2009; the couple married in Massachusetts last year.'}, {'stories_id': 14, 'sentence': 'The White House could yet succeed in bludgeoning Republican lawmakers to give in.'}, {'stories_id': 16, 'sentence': 'The Rio Grande twists and winds, and those who make it across can quickly hide in sugar cane fields and orchards.'}, {'stories_id': 14, 'sentence': 'If Mr. Obama can inch up poll ratings for himself and for his health care law, Mr. Boehner and Mr. McConnell — who is up for re-election in Kentucky — may find accomplishing something with a Democratic president as useful as Mr. Lott and Mr. Gingrich once did.'}, {'stories_id': 16, 'sentence': 'In many places it is a short sprint to shopping malls and suburban streets where smugglers pick up migrants to continue north.'}, {'stories_id': 16, 'sentence': 'Border Patrol officials said apprehensions were higher partly because they were catching many more of the illegal crossers.'}, {'stories_id': 14, 'sentence': 'If the White House does not succeed, Mr. Obama could yet find a different, postelection path to a minimum-wage increase.'}, {'stories_id': 14, 'sentence': 'The common thread of the last three wage increases is a president of one party forging agreement with a Congress controlled by the other.'}, {'stories_id': 16, 'sentence': 'About 3,000 agents in the Rio Grande Valley — 495 new this year — patrol in helicopters and boats, on all-terrain vehicles and horseback.'}, {'stories_id': 14, 'sentence': '“Our system works best” for bipartisan compromise with that alignment, Mr. Fratto said.'}, {'stories_id': 16, 'sentence': 'Drones and aerostat blimps are watching from the sky.'}, {'stories_id': 14, 'sentence': 'If Republicans emerge in 2015 with control of both the House and the Senate, they might even find a motivation to make a deal.'}, {'stories_id': 16, 'sentence': 'Under a new strategy, border agencies are working with federal drug agents, the F.B.I. and Texas police to break up Mexican smuggling organizations by prosecuting operatives on this side of the border.'}, {'stories_id': 16, 'sentence': 'But whereas Mexicans can be swiftly returned by the Border Patrol, migrants from noncontiguous countries must be formally deported and flown home by other agencies.'}, {'stories_id': 16, 'sentence': 'Even though federal flights are leaving South Texas every day, Central Americans are often detained longer.'}, {'stories_id': 16, 'sentence': 'Women with children are detained separately.'}, {'stories_id': 16, 'sentence': 'But because the nearest facility for “family units” is in Pennsylvania, families apprehended in the Rio Grande Valley are likely to be released while their cases proceed, a senior deportations official said.'}, {'stories_id': 16, 'sentence': 'Minors without parents are turned over to the Department of Health and Human Services, which holds them in shelters that provide medical care and schooling and tries to send them to relatives in the United States.'}, {'stories_id': 16, 'sentence': 'The authorities here are expecting 35,000 unaccompanied minors this year, triple the number two years ago.'}, {'stories_id': 16, 'sentence': 'Under asylum law, border agents are required to ask migrants if they are afraid of returning to their countries.'}, {'stories_id': 16, 'sentence': 'If the answer is yes, migrants must be detained until an immigration officer interviews them to determine if the fear is credible.'}, {'stories_id': 16, 'sentence': 'If the officer concludes it is, the migrant can petition for asylum.'}, {'stories_id': 16, 'sentence': 'An immigration judge will decide whether there is a “well-founded fear of persecution” based on race, religion, nationality, political opinion or “membership in a particular social group.”'}, {'stories_id': 16, 'sentence': 'Immigration officials said they had set the bar intentionally low for the initial “credible fear” test, to avoid turning away a foreigner in danger.'}, {'stories_id': 16, 'sentence': 'In 2013, 85 percent of fear claims were found to be credible, according to federal figures.'}, {'stories_id': 16, 'sentence': 'As more Central Americans have come, fear claims have spiked, more than doubling in 2013 to 36,026 from 13,931 in 2012.'}, {'stories_id': 16, 'sentence': 'united states'}, {'stories_id': 16, 'sentence': 'TEXAS'}, {'stories_id': 16, 'sentence': 'Rio Grande'}, {'stories_id': 16, 'sentence': 'Hidalgo'}, {'stories_id': 16, 'sentence': 'Mexico'}, {'stories_id': 16, 'sentence': 'Honduras'}, {'stories_id': 16, 'sentence': 'Guatemala'}, {'stories_id': 16, 'sentence': 'El Salvador'}, {'stories_id': 16, 'sentence': '500 miles'}, {'stories_id': 16, 'sentence': 'The chances have not improved much to win asylum in the end, however.'}, {'stories_id': 16, 'sentence': 'In 2012, immigration courts approved 34 percent of asylum petitions from migrants facing deportation — 2,888 cases nationwide.'}, {'stories_id': 16, 'sentence': 'Many Central Americans say they are fleeing extortion or forced recruitment by criminal gangs.'}, {'stories_id': 16, 'sentence': 'But immigration courts have rarely recognized those threats as grounds for asylum.'}, {'stories_id': 16, 'sentence': 'Yet because of immense backlogs in the courts — with the average wait for a hearing currently at about 19 months — claiming fear of return has allowed some Central Americans to prolong their time in the United States.'}, {'stories_id': 16, 'sentence': 'At the big immigration detention center at Port Isabel, which serves much of the Rio Grande Valley, half of about 1,100 detainees at any given time are asylum seekers, officials said.'}, {'stories_id': 16, 'sentence': 'With the asylum system already stretched, the nearest officers are in Houston, doing interviews by video conference.'}, {'stories_id': 16, 'sentence': 'In 2013, the closest immigration court, in Harlingen, was swamped with new cases, becoming even more backlogged.'}, {'stories_id': 16, 'sentence': 'Detention beds fill up, and migrants deemed to present no security risk are released under supervision, officials said, with their next court hearing often more than a year away.'}, {'stories_id': 16, 'sentence': 'At their now teeming front-line stations along the river, Border Patrol officials readily admit they are not set up to hold migrants for long.'}, {'stories_id': 16, 'sentence': 'Agents and migrants alike refer to the cells there as “hieleras” — freezers.'}, {'stories_id': 16, 'sentence': 'In cinder-block rooms with concrete benches and a toilet in the corner, there are no chairs, beds, showers or hot food.'}, {'stories_id': 16, 'sentence': 'On a recent day, migrants caked in river mud were packed shoulder to shoulder, many on the floor, trying to warm up in space blankets the Border Patrol provides.'}, {'stories_id': 16, 'sentence': 'Some held their fingers to their lips to signal hunger.'}, {'stories_id': 16, 'sentence': 'But agents said they have accelerated their work so more migrants are deported directly from Border Patrol stations in as little as two days.'}, {'stories_id': 16, 'sentence': 'Officials said few migrants — only 4 percent — claim fear of returning when they are with the Border Patrol.'}, {'stories_id': 16, 'sentence': 'Rather, migrants are claiming fear after they are sent to longer-term detention centers like Port Isabel, leading officials to suspect they have been coached by other detainees.'}, {'stories_id': 16, 'sentence': 'But lawyers for asylum seekers said migrants frequently report that Border Patrol agents never asked them about their concerns, or that they were too exhausted or intimidated to express them in the hours after being caught.'}, {'stories_id': 16, 'sentence': '“A lot of times these people had very real, legitimate fears,” said Kimi Jackson, director of the South Texas Pro Bono Asylum Representation Project, known as ProBAR.'}, {'stories_id': 16, 'sentence': '“But it seems to them they were not asked the questions by the Border Patrol in the type of situation where they could talk freely.”'}, {'stories_id': 16, 'sentence': 'On a helicopter with its pilot and a Border Patrol chief, a reporter and photographer for The New York Times watched migrants crossing the Rio Grande and turning themselves in.'}, {'stories_id': 16, 'sentence': 'Lawyers said officials had started to make it far harder for migrants to win release by requiring many more to post bond, with rates rising to as high as $10,000.'}, {'stories_id': 16, 'sentence': 'That news had not reached migrants at a shelter run by nuns in Reynosa.'}, {'stories_id': 16, 'sentence': 'Several said they were heading to the United States to seek “asilo.”'}, {'stories_id': 16, 'sentence': 'They could say truthfully they were afraid to go home.'}, {'stories_id': 16, 'sentence': 'Luis Fernando Herrera Perdomo, 19, said he fled Honduras after gang members shot and killed a brother who was sleeping in the bed next to his.'}, {'stories_id': 16, 'sentence': 'A 29-year-old former soldier from El Salvador, who asked to be identified only as Jesús, said he left his wife and three children to escape a gang that came gunning for him because he arrested some of its members while in the army.'}, {'stories_id': 16, 'sentence': 'In Reynosa, the dangers had only multiplied.'}, {'stories_id': 16, 'sentence': 'José Rubén Hernández, 32, said he had been kidnapped for two weeks while Mexican smugglers extorted $10,000 in ransom from his frantic family in Honduras.'}, {'stories_id': 16, 'sentence': '“We are a gold mine for the cartels,” he said.'}, {'stories_id': 16, 'sentence': 'Other migrants had been imprisoned in a smugglers’ stash house until Mexican military troops stormed it to free them.'}, {'stories_id': 16, 'sentence': 'Two Hondurans who had just arrived at the shelter displayed new bruises, saying they had been beaten that morning in a rail yard by smugglers associated with the Zetas, a brutal Mexican cartel.'}, {'stories_id': 16, 'sentence': 'But the migrants still intended to hire new smugglers and try to cross.'}, {'stories_id': 16, 'sentence': '“I’m still alive and I have faith in God, so I will try to make it over to the other side,” Mr. Herrera said.'}, {'stories_id': 16, 'sentence': 'Chief Ortiz said agents were speeding deportations to change the message reaching Central America.'}, {'stories_id': 16, 'sentence': '“It cost the migrant an awful lot of money and time and effort to get here,” he said.'}, {'stories_id': 16, 'sentence': '“If I send somebody back to Guatemala or Honduras, chances are they’re going to sit there and say, ‘You know what, I don’t think I’m going to try this again.’ ”'}, {'stories_id': 16, 'sentence': '“The word may get out,” he said.'}] diff --git a/mediacloud/mediawords/util/topic_modeling/test_model_gensim.py b/mediacloud/mediawords/util/topic_modeling/test_model_gensim.py new file mode 100644 index 0000000000..f0ec9ffdce --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/test_model_gensim.py @@ -0,0 +1,95 @@ +import unittest + + +# from mediawords.db import connect_to_db +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.util.topic_modeling.token_pool import TokenPool +from mediawords.util.topic_modeling.model_gensim import ModelGensim +from typing import Dict, List + + +class TestModelGensim(unittest.TestCase): + """ + Test the methods in ..model_gensim.py + """ + + def setUp(self): + """ + Prepare the token pool + """ + self.LIMIT = 5 + self.OFFSET = 1 + token_pool = TokenPool(SampleHandler()) + # self._story_tokens = token_pool.output_tokens(limit=self.LIMIT, offset=self.OFFSET) + self._story_tokens = token_pool.output_tokens() + self._flat_story_tokens = self._flatten_story_tokens() + self._lda_model = ModelGensim() + self._lda_model.add_stories(self._story_tokens) + self._topics = self._lda_model.summarize_topic() + + def _flatten_story_tokens(self) -> Dict[int, List[str]]: + """ + Flatten all tokens of a story into a single dimension list + :return: A dictionary of {story_id : [all tokens of that story]} + """ + flat_story_tokens = {} + for story in self._story_tokens.items(): + story_id = story[0] + grouped_tokens = story[1] + flat_story_tokens[story_id] \ + = [tokens for sentence_tokens in grouped_tokens for tokens in sentence_tokens] + return flat_story_tokens + + def test_one_to_one_relationship(self): + """ + Test if there is one-to-one relationship for articles and topics + (i.e. no mysteries topic id or missing article id) + """ + topic_ids = self._topics.keys() + story_ids = self._story_tokens.keys() + + for topic_id in topic_ids: + unittest.TestCase.assertTrue( + self=self, + expr=(topic_id in story_ids), + msg="Mysteries topic id: {}".format(topic_id)) + + for article_id in story_ids: + unittest.TestCase.assertTrue( + self=self, + expr=(article_id in topic_ids), + msg="Missing article id: {}".format(article_id)) + + def test_story_contains_topic_word(self): + """ + Test if each story contains at least one of the topic words + """ + + story_ids = self._story_tokens.keys() + + for story_id in story_ids: + exist = False + for topic in iter(self._topics.get(story_id)): + for word in topic: + exist = word in self._flat_story_tokens.get(story_id) + if exist: + break + if not exist: + raise ValueError("Story {id} does not contain any of its topic words: {topic}" + .format(id=story_id, topic=self._topics.get(story_id))) + + def test_default_topic_params(self): + default_topic_num = 1 + default_word_num = 4 + for topics in self._topics.values(): + unittest.TestCase.assertEqual( + self=self, first=default_topic_num, second=len(topics), + msg="topics = {}".format(topics)) + for topic in topics: + unittest.TestCase.assertEqual( + self=self, first=default_word_num, second=len(topic), + msg="topic = {}".format(topic)) + + +if __name__ == '__main__': + unittest.main() diff --git a/mediacloud/mediawords/util/topic_modeling/test_model_lda.py b/mediacloud/mediawords/util/topic_modeling/test_model_lda.py new file mode 100644 index 0000000000..431576e355 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/test_model_lda.py @@ -0,0 +1,149 @@ +import unittest +import logging + +# from mediawords.db import connect_to_db +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.util.topic_modeling.token_pool import TokenPool +from mediawords.util.topic_modeling.model_lda import ModelLDA +from typing import Dict, List + + +class TestModelLDA(unittest.TestCase): + """ + Test the methods in model_lda.py + """ + @classmethod + def setUpClass(cls): + """ + Setting up the whole class (i.e. only need to run once) + """ + cls.setup_test_data() + + @classmethod + def setup_test_data(cls): + """ + Prepare the token pool and other data + """ + # token_pool = TokenPool(connect_to_db()) + token_pool = TokenPool(SampleHandler()) + + cls._story_tokens = token_pool.output_tokens() + cls._flat_story_tokens = cls._flatten_story_tokens(self=cls()) + cls._lda_model = ModelLDA() + cls._lda_model.add_stories(cls._story_tokens) + cls._optimal_topic_num_poly = cls._lda_model.tune_with_polynomial() + + cls._topics_via_poly \ + = cls._lda_model.summarize_topic(total_topic_num=cls._optimal_topic_num_poly) + + logging.getLogger("lda").setLevel(logging.WARNING) + logging.getLogger("gensim").setLevel(logging.WARNING) + + def _flatten_story_tokens(self) -> Dict[int, List[str]]: + """ + Flatten all tokens of a story into a single dimension list + :return: A dictionary of {story_id : [all tokens of that story]} + """ + flat_story_tokens = {} + for story in self._story_tokens.items(): + story_id = story[0] + grouped_tokens = story[1] + flat_story_tokens[story_id] \ + = [tokens for sentence_tokens in grouped_tokens for tokens in sentence_tokens] + return flat_story_tokens + + def test_one_to_one_relationship(self): + """ + Pass topics generated by both methods to _check_one_to_one_relationship() + """ + self._check_one_to_one_relationship(topics=self._topics_via_poly) + + def _check_one_to_one_relationship(self, topics: Dict[int, List]): + """ + Test if there is one-to-one relationship for articles and topics + (i.e. no mysteries topic id or missing article id) + """ + topic_ids = topics.keys() + story_ids = self._story_tokens.keys() + + for topic_id in topic_ids: + unittest.TestCase.assertTrue( + self=self, + expr=(topic_id in story_ids), + msg="Mysteries topic id: {}".format(topic_id)) + + for article_id in story_ids: + unittest.TestCase.assertTrue( + self=self, + expr=(article_id in topic_ids), + msg="Missing article id: {}".format(article_id)) + + def _check_story_contains_topic_word(self, topics: Dict[int, List]): + """ + Test if each story contains at least one of the topic words + """ + story_ids = self._story_tokens.keys() + + for story_id in story_ids: + # Due to the nature of this algorithm, if a story is too short, the words in it might + # not repeat enough times to be considered as a valid topic. Hence + if len(self._flat_story_tokens.get(story_id)) < 25: + return + exist = False + for topic in iter(topics.get(story_id)): + exist = topic in self._flat_story_tokens.get(story_id) or exist + if exist: + break + if not exist: + raise ValueError("Story {id} does not contain any of its topic words: {topic}\n" + "Story tokens:\n {tokens}" + .format(id=story_id, topic=topics.get(story_id), + tokens=self._flat_story_tokens.get(story_id))) + + def test_default_topic_params(self): + """ + Pass topics generated by both methods to _check_default_topic_params() + """ + self._check_default_topic_params(topics=self._topics_via_poly) + + def _check_default_topic_params(self, topics: Dict[int, List[str]]): + """ + Test if the correct number of words for each topic is returned + """ + default_word_num = 4 + for topics in topics.values(): + unittest.TestCase.assertEqual( + self=self, first=default_word_num, second=len(topics), + msg="Default word number ({}) != word number ({})\nTopic = {}" + .format(default_word_num, len(topics), topics)) + + def test_highest_likelihood(self): + """ + Pass topic_num and the name of tuning method to _check_highest_likelihood + Designed in this way to allow extensibility + (i.e. append more topic_num-name_of_tuning pair) + """ + self._check_highest_likelihood(num=self._optimal_topic_num_poly, name="Polynomial") + + def _check_highest_likelihood(self, num: int, name: str): + """ + Test if the result is the most accurate one + :param num: optimal topic_num found by polynomial + :param name: the name of training method used + """ + optimal_likelihood = self._lda_model.evaluate(topic_num=num)[1] + other_nums = [0, 1, num-1, num+1, num*2] + + for other_num in other_nums: + if (other_num == num) or num < 0: + continue + other_likelihood = self._lda_model.evaluate(topic_num=other_num)[1] + unittest.TestCase.assertGreaterEqual( + self=self, + a=optimal_likelihood, + b=other_likelihood, + msg="Topic num {} has a better likelihood {} than {} with {}:{}" + .format(other_num, other_likelihood, name, num, optimal_likelihood)) + +if __name__ == '__main__': + unittest.main() diff --git a/mediacloud/mediawords/util/topic_modeling/test_model_nmf.py b/mediacloud/mediawords/util/topic_modeling/test_model_nmf.py new file mode 100644 index 0000000000..625271bfb5 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/test_model_nmf.py @@ -0,0 +1,93 @@ +import unittest + +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.util.topic_modeling.token_pool import TokenPool +from mediawords.util.topic_modeling.model_nmf import ModelNMF +from typing import Dict, List + + +class TestModelNMF(unittest.TestCase): + """ + Test the methods in ..model_gensim.py + """ + + def setUp(self): + """ + Prepare the token pool + """ + self.LIMIT = 5 + self.OFFSET = 1 + + token_pool = TokenPool(SampleHandler()) + # self._story_tokens = token_pool.output_tokens(limit=self.LIMIT, offset=self.OFFSET) + self._story_tokens = token_pool.output_tokens() + + self._flat_story_tokens = self._flatten_story_tokens() + self._nmf_model = ModelNMF() + self._nmf_model.add_stories(self._story_tokens) + self._topics = self._nmf_model.summarize_topic() + + def _flatten_story_tokens(self) -> Dict[int, List[str]]: + """ + Flatten all tokens of a story into a single dimension list + :return: A dictionary of {story_id : [all tokens of that story]} + """ + flat_story_tokens = {} + for story in self._story_tokens.items(): + story_id = story[0] + grouped_tokens = story[1] + flat_story_tokens[story_id] \ + = [tokens for sentence_tokens in grouped_tokens for tokens in sentence_tokens] + return flat_story_tokens + + def test_one_to_one_relationship(self): + """ + Test if there is one-to-one relationship for articles and topics + (i.e. no mysteries topic id or missing article id) + """ + topic_ids = self._topics.keys() + story_ids = self._story_tokens.keys() + + for topic_id in topic_ids: + unittest.TestCase.assertTrue( + self=self, + expr=(topic_id in story_ids), + msg="Mysteries topic id: {}".format(topic_id)) + + for article_id in story_ids: + unittest.TestCase.assertTrue( + self=self, + expr=(article_id in topic_ids), + msg="Missing article id: {}".format(article_id)) + + def test_story_contains_topic_word(self): + """ + Test if each story contains at least one of the topic words + """ + + story_ids = self._story_tokens.keys() + + for story_id in story_ids: + exist = False + for topic in iter(self._topics.get(story_id)): + for word in topic: + exist = word in self._flat_story_tokens.get(story_id) + if exist: + break + if not exist: + raise ValueError("Story {id} does not contain any of its topic words: {topic}" + .format(id=story_id, topic=self._topics.get(story_id))) + + def test_default_topic_params(self): + default_topic_num = 1 + default_word_num = 4 + for topics in self._topics.values(): + unittest.TestCase.assertEqual( + self=self, first=default_topic_num, second=len(topics)) + for topic in topics: + unittest.TestCase.assertEqual( + self=self, first=default_word_num, second=len(topic)) + + +if __name__ == '__main__': + unittest.main() diff --git a/mediacloud/mediawords/util/topic_modeling/test_token_pool.py b/mediacloud/mediawords/util/topic_modeling/test_token_pool.py new file mode 100644 index 0000000000..016b958eb7 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/test_token_pool.py @@ -0,0 +1,66 @@ +import unittest +import os + +# from mediawords.db import connect_to_db +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.util.paths import mc_root_path +from mediawords.util.topic_modeling.token_pool import TokenPool + + +class TestTokenPool(unittest.TestCase): + """ + Test the methods in ..token_pool.py + """ + + def setUp(self): + """ + Prepare the token pool + """ + self._LIMIT = 0 + self._OFFSET = 0 + + token_pool = TokenPool(SampleHandler()) + # self._article_tokens = token_pool.output_tokens(limit=self._LIMIT, offset=self._OFFSET) + self._article_tokens = token_pool.output_tokens() + self._STOP_WORDS \ + = os.path.join(mc_root_path(), "lib/MediaWords/Languages/resources/en_stopwords.txt") + + def test_lower_case(self): + """ + Test if all tokens are in lower cases + """ + for sentence_tokens in list(self._article_tokens.values()): + for tokens in sentence_tokens: + for token in tokens: + unittest.TestCase.assertTrue( + self=self, + expr=any(char.isdigit() for char in token) or token.islower(), + msg=token) + + def test_no_stop_words(self): + """ + Test if there is no stop words in the tokens + """ + with open(self._STOP_WORDS) as stop_words_file: + stop_words = stop_words_file.readlines() + stop_words_file.close() + + for sentence_tokens in list(self._article_tokens.values()): + for tokens in sentence_tokens: + for token in tokens: + unittest.TestCase.assertTrue( + self=self, + expr=token not in stop_words, + msg=token) + + def test_correct_limit(self): + """ + Test if the correct number of stories are tokenized + """ + if self._LIMIT: + unittest.TestCase.assertEqual( + self=self, first=self._LIMIT, second=len(self._article_tokens)) + + +if __name__ == '__main__': + unittest.main() diff --git a/mediacloud/mediawords/util/topic_modeling/token_pool.py b/mediacloud/mediawords/util/topic_modeling/token_pool.py new file mode 100644 index 0000000000..5730abf705 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/token_pool.py @@ -0,0 +1,164 @@ +import os + +# from mediawords.db import connect_to_db +from mediawords.util.topic_modeling.sample_handler import SampleHandler +from mediawords.db import handler +from mediawords.util.paths import mc_root_path +from nltk.stem import WordNetLemmatizer +from nltk import word_tokenize +from typing import Dict, List, Union +import warnings + + +class TokenPool: + """ Fetch the sentences and break it down to words.""" + _LANGUAGE = 'english' + _STORY_SENTENCE_TABLE = 'story_sentences' + _STORY_TABLE = 'stories' + _MAIN_QUERY \ + = """SELECT story_sentences.stories_id, story_sentences.sentence FROM stories + INNER JOIN story_sentences ON stories.stories_id = story_sentences.stories_id + WHERE stories.language = 'en' + ORDER BY stories.stories_id, + story_sentences.sentence_number""" + + # # An alternative SQL + # # the intention was trying to use LIMIT and OFFSET to allow better customization + # _MAIN_QUERY \ + # = """SELECT story_sentences.stories_id, story_sentences.sentence FROM story_sentences + # INNER JOIN stories ON stories.stories_id = story_sentences.stories_id + # WHERE stories.language = 'en' + # AND story_sentences.stories_id IN + # (SELECT stories_id FROM story_sentences + # ORDER BY story_sentences.stories_id) -- nested SELECT statement to cooperate with LIMIT + # ORDER BY story_sentences.sentence_number""" + + _STOP_WORDS \ + = os.path.join(mc_root_path(), "lib/MediaWords/Languages/resources/en_stopwords.txt") + _MIN_TOKEN_LEN = 1 + + def __init__(self, db: Union[handler.DatabaseHandler, SampleHandler]) -> None: + """Initialisations""" + self._stopwords = self._fetch_stopwords() + self._db = db + + # parameter limit and offset cannot fit in the current SQL query + # def _fetch_sentence_dictionaries(self, limit: int, offset: int) -> list: + def _fetch_sentence_dictionaries(self) -> list: + """ + Fetch the sentence from DB + # :param limit: the number of stories to be output, 0 means no limit + :return: the sentences in json format + """ + + # insert LIMIT and OFFSET if needed, but cannot fit in the current SQL query + # query_cmd \ + # = self._MAIN_QUERY[:-51] \ + # + ' LIMIT {} OFFSET {}'.format(limit, offset) \ + # + self._MAIN_QUERY[-51:] \ + # if limit else self._MAIN_QUERY + + query_cmd = self._MAIN_QUERY + + sentence_dictionaries = self._db.query(query_cmd).hashes() \ + if type(self._db) == handler.DatabaseHandler \ + else self._db.query() + + return sentence_dictionaries + + def _bind_stories(self, sentences: list) -> Dict[int, list]: + """ + Break the sentence down into tokens and group them by story ID + :param sentences: a json containing sentences and their story id + :return: a dictionary of stories and words in them + """ + stories = {} + + for sentence in sentences: + processed_sentence = self._process_sentences(sentence=sentence) + + if not processed_sentence: + continue + + if sentence['stories_id'] not in stories.keys(): + stories[sentence['stories_id']] = [] + + stories[sentence['stories_id']].append(processed_sentence) + + return stories + + def _process_sentences(self, sentence: dict) -> list: + """ + Eliminate symbols and stopwords + :param sentence: a raw sentence from story + :return: a cleaned up sentence + """ + sentence_tokens = self._tokenize_sentence(story_sentence=sentence['sentence']) + + # First elimination: save time in lemmatization + useful_tokens = self._eliminate_stopwords(sentence_tokens=sentence_tokens) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + lemmatized_tokens \ + = [WordNetLemmatizer().lemmatize(word=token.lower()) for token in useful_tokens] + + del useful_tokens + + # Second elimination: + # remove the words that are exact match of stop words after lemmatization + useful_tokens = self._eliminate_stopwords(sentence_tokens=lemmatized_tokens) + + return useful_tokens + + def _tokenize_sentence(self, story_sentence: str) -> list: + """ + Remove symbols in the given list of words in story + :param story_sentence: a sentence in an story + :return: a list of non-symbol tokens + """ + sliced_sentence = word_tokenize(text=story_sentence, language=self._LANGUAGE) + + return sliced_sentence + + def _fetch_stopwords(self) -> list: + """ + Fetch the stopwords from file en_stopwords.txt + :return: all stopwords in the file + """ + stop_words_file = open(self._STOP_WORDS) + predefined_stopwords = [element[:-1] for element in stop_words_file.readlines()] + stop_words_file.close() + + return predefined_stopwords + + def _eliminate_stopwords(self, sentence_tokens: list) -> list: + """ + Remove stopwords in the given list of words in story + :param sentence_tokens: a list containing all tokens in a sentence + :return: a list of all the useful words + """ + useful_sentence_tokens \ + = [token for token in sentence_tokens + if ((len(token) > self._MIN_TOKEN_LEN) and (token.lower() not in self._stopwords))] + + return useful_sentence_tokens + + # def output_tokens(self, limit: int = 0, offset: int = 0) -> Dict[int, List[List[str]]]: + def output_tokens(self) -> Dict[int, List[List[str]]]: + """ + Go though each step to output the tokens of stories + :return: a dictionary with key as the id of each story and value as the useful tokens + """ + # sentence_dictionaries = self._fetch_sentence_dictionaries(limit=limit, offset=offset) + sentence_dictionaries = self._fetch_sentence_dictionaries() + processed_stories = self._bind_stories(sentences=sentence_dictionaries) + + return processed_stories + + +# A sample output +if __name__ == '__main__': + # pool = TokenPool(connect_to_db()) + pool = TokenPool(SampleHandler()) + print(pool.output_tokens()) diff --git a/mediacloud/mediawords/util/topic_modeling/topic_model.py b/mediacloud/mediawords/util/topic_modeling/topic_model.py new file mode 100644 index 0000000000..c992b951a7 --- /dev/null +++ b/mediacloud/mediawords/util/topic_modeling/topic_model.py @@ -0,0 +1,33 @@ +from abc import ABC, abstractmethod +from typing import Dict + + +class BaseTopicModel(ABC): + """ + An abstract base topic model class for all topic models + """ + _model = None + + @abstractmethod + def add_stories(self, stories: dict) -> None: + """ + Adding new stories into the model + :param stories: a dictionary of new stories + """ + pass + + @abstractmethod + def summarize_topic(self) -> Dict[int, list]: + """ + summarize the topic of each story based on the frequency of occurrence of each word + :return: a dictionary of article_id : topics + """ + pass + + @abstractmethod + def evaluate(self) -> str: + """ + evaluate the accuracy of models + :return: total number of topics followed by a score/likelihood + """ + pass diff --git a/mediacloud/requirements.txt b/mediacloud/requirements.txt index 520e750c03..cdb25d5be3 100644 --- a/mediacloud/requirements.txt +++ b/mediacloud/requirements.txt @@ -9,6 +9,13 @@ # Unit test coverage coverage +# LDA models +gensim + +# To eliminate the 'module' object has no attribute 'plugin' problem +# while importing gensim +google-compute-engine + # Stemming Hausa words hausastemmer @@ -18,6 +25,9 @@ jieba # Parsing email templates Jinja2 +# LDA models +lda + # Japanese language tokenizer, stemmer, etc. mecab-python3 @@ -51,8 +61,12 @@ readability-lxml==0.6.2 # Making HTTP requests requests +# To apply non-negative matrix factorization +scikit-learn + # math package for forceatlas implementation scipy # Normalizing URLs url_normalize +