diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 02e29d4..df42a5c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -75,6 +75,7 @@ Texthero is there for the NLP-community. If you have an idea on how we can impro 1. Before writing a new function or make any changes, look at similar code for inspiration and to learn about the code format and style. 1. The maximal docstring line length should be 75 characters. This should be manually done as `black` formatting does not enforce limits on docstring line length. 1. Use American English instead of British English (e.g. categorize instead of categorise) when writing comments and documenting docstrings. +1. Whenever possible, use quotes or sentences from superhero comics or movies in the examples, e.g. "HULK SMASH!", "I am Groot!", "I am vengeance, I am the night, I am BATMAN!", "With great power comes great responsibility.", etc. 1. For default argument values, use the defaults from the underlying library if applicable (e.g. the default arguments from sklearn if using a sklearn algorithm). If other values are used, add a small comment explaining why. Additionally, look for similar functions and use their default values. 1. Default values are defined as follows: `x : int, optional, default=2` diff --git a/texthero/nlp.py b/texthero/nlp.py index 9e8cfbe..1f502f3 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -55,10 +55,11 @@ def named_entities(s: TextSeries, package="spacy") -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Yesterday I was in NY with Bill de Blasio") + >>> s = pd.Series("Yesterday, Spider-Man met Daredevil in Queens, New York.") >>> hero.named_entities(s)[0] # doctest: +NORMALIZE_WHITESPACE - [('Yesterday', 'DATE', 0, 9), ('NY', 'GPE', 19, 21), - ('Bill de Blasio', 'PERSON', 27, 41)] + [('Yesterday', 'DATE', 0, 9), ('Spider-Man', 'PERSON', 11, 21), + ('Daredevil', 'GPE', 26, 35), ('Queens', 'GPE', 39, 45), + ('New York', 'GPE', 47, 55)] """ entities = [] @@ -93,9 +94,9 @@ def noun_chunks(s: TextSeries) -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("The spotted puppy is sleeping.") + >>> s = pd.Series("A little spider just bit me!") >>> hero.noun_chunks(s) - 0 [(The spotted puppy, NP, 0, 17)] + 0 [(A little spider, NP, 0, 15), (me, NP, 25, 27)] dtype: object """ @@ -130,8 +131,8 @@ def count_sentences(s: TextSeries) -> pd.Series: >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series( - ... ["Yesterday I was in NY with Bill de Blasio. Great story...", - ... "This is the F.B.I.! What? Open up!"]) + ... ["Yesterday, Spider-Man met Daredevil in Queens, New York. Great story...", + ... "This is the S.H.I.E.L.D! What? Open up!"]) >>> hero.count_sentences(s) 0 2 1 3 @@ -166,7 +167,7 @@ def pos_tag(s: TextSeries) -> pd.Series: coarse-grained POS has a NOUN value, then the refined POS will give more details about the type of the noun, whether it is singular, plural and/or proper. - + You can use the spacy `explain` function to find out which fine-grained POS it is. 
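For readers who do not know the tag names by heart, `spacy.explain` turns a fine-grained tag into a human-readable description — a minimal illustrative sketch, assuming spaCy is installed (not part of the patch):

>>> import spacy
>>> spacy.explain("PDT")  # a fine-grained tag from the example below
'predeterminer'
>>> spacy.explain("VBZ")
'verb, 3rd person singular present'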
@@ -204,11 +205,11 @@ def pos_tag(s: TextSeries) -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Today is such a beautiful day") + >>> s = pd.Series("Today is such a marvelous day") >>> print(hero.pos_tag(s)[0]) # doctest: +NORMALIZE_WHITESPACE - [('Today', 'NOUN', 'NN', 0, 5), ('is', 'AUX', 'VBZ', 6, 8), ('such', 'DET', - 'PDT', 9, 13), ('a', 'DET', 'DT', 14, 15), ('beautiful', 'ADJ', 'JJ', 16, - 25), ('day', 'NOUN', 'NN', 26, 29)] + [('Today', 'NOUN', 'NN', 0, 5), ('is', 'AUX', 'VBZ', 6, 8), + ('such', 'DET', 'PDT', 9, 13), ('a', 'DET', 'DT', 14, 15), + ('marvelous', 'ADJ', 'JJ', 16, 25), ('day', 'NOUN', 'NN', 26, 29)] """ pos_tags = [] @@ -264,9 +265,9 @@ def stem(s: TextSeries, stem="snowball", language="english") -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("I used to go \t\n running.") + >>> s = pd.Series("I used to go \t\n flying.") >>> hero.stem(s) - 0 i use to go running. + 0 i use to go flying. dtype: object """ diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 88820a1..290f2ef 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -31,18 +31,18 @@ def fillna(s: TextSeries, replace_string="") -> TextSeries: >>> import texthero as hero >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(["I'm", np.NaN, pd.NA, "You're"]) + >>> s = pd.Series(["I'm", np.NaN, pd.NA, "BATMAN!"]) >>> hero.fillna(s) - 0 I'm - 1 - 2 - 3 You're + 0 I'm + 1 + 2 + 3 BATMAN! dtype: object >>> hero.fillna(s, "Missing") 0 I'm 1 Missing 2 Missing - 3 You're + 3 BATMAN! dtype: object """ @@ -54,14 +57,13 @@ def lowercase(s: TextSeries) -> TextSeries: """ Lowercase all texts in a series. - Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("This is NeW YoRk wIth upPer letters") + >>> s = pd.Series("BE thE Best you Can Be!") >>> hero.lowercase(s) - 0 this is new york with upper letters + 0 be the best you can be! dtype: object """ return s.str.lower() @@ -130,12 +132,12 @@ def remove_digits(s: TextSeries, only_blocks=True) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("7ex7hero is fun 1111") + >>> s = pd.Series("Here comes5The Fantastic 4!") >>> hero.preprocessing.remove_digits(s) - 0 7ex7hero is fun + 0 Here comes5The Fantastic ! dtype: object >>> hero.preprocessing.remove_digits(s, only_blocks=False) - 0 ex hero is fun + 0 Here comes The Fantastic ! dtype: object """ @@ -148,8 +150,8 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: Replace all punctuation with a given symbol. Replace all punctuation from the given - Pandas Series with a custom symbol. - It considers as punctuation characters all :data:`string.punctuation` + Pandas Series with a custom symbol. + It considers as punctuation characters all :data:`string.punctuation` symbols `!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~).` @@ -158,15 +160,15 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: s : :class:`texthero._types.TextSeries` symbol : str, optional, default=" " - Symbol to use as replacement for all string punctuation. + Symbol to use as replacement for all string punctuation. 
Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Finnaly.") + >>> s = pd.Series("I.am.Groot!") >>> hero.replace_punctuation(s, " ") - 0 Finnaly + 0 I am Groot dtype: object """ @@ -189,9 +191,9 @@ def remove_punctuation(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Finnaly.") + >>> s = pd.Series("I.am.Groot!") >>> hero.remove_punctuation(s) - 0 Finnaly + 0 I am Groot dtype: object """ return replace_punctuation(s, " ") @@ -205,9 +207,9 @@ def _remove_diacritics(text: str) -> str: -------- >>> from texthero.preprocessing import _remove_diacritics >>> import pandas as pd - >>> text = "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس" + >>> text = "bédéphile, über, 12.89, la Guêpe, 889, shônen, اِس, اُس" >>> _remove_diacritics(text) - 'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس' + 'bedephile, uber, 12.89, la Guepe, 889, shonen, اس, اس' """ nfkd_form = unicodedata.normalize("NFKD", text) # unicodedata.combining(char) checks if the character is in @@ -229,10 +231,9 @@ def remove_diacritics(s: TextSeries) -> TextSeries: >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series( - ... "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس") + ... "bédéphile, über, 12.89, la Guêpe, 889, shônen, اِس, اُس") >>> hero.remove_diacritics(s)[0] - 'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس' - + 'bedephile, uber, 12.89, la Guepe, 889, shonen, اس, اس' """ return s.astype("unicode").apply(_remove_diacritics) @@ -252,9 +253,9 @@ def remove_whitespace(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Title \n Subtitle \t ...") + >>> s = pd.Series("I am vengeance,\n I am the night,\n I am BATMAN!") >>> hero.remove_whitespace(s) - 0 Title Subtitle ... + 0 I am vengeance, I am the night, I am BATMAN! dtype: object """ @@ -278,18 +279,18 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: Examples -------- >>> from texthero.preprocessing import _replace_stopwords - >>> s = "the book of the jungle" + >>> s = "Oh my God, Batman!" >>> symbol = "$" - >>> stopwords = ["the", "of"] + >>> stopwords = ["my"] >>> _replace_stopwords(s, stopwords, symbol) - '$ book $ $ jungle' + 'Oh $ God, Batman!' """ pattern = r"""(?x) # Set flag to allow verbose regexps - \w+(?:-\w+)* # Words with optional internal hyphens + \w+(?:-\w+)* # Words with optional internal hyphens | \s* # Any space - | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol + | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol """ return "".join(t if t not in words else symbol for t in re.findall(pattern, text)) @@ -313,15 +314,15 @@ def replace_stopwords( stopwords : Set[str], optional, default=None Set of stopwords string to remove. If not passed, - by default uses NLTK English stopwords. + by default uses NLTK English stopwords. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("the book of the jungle") - >>> hero.replace_stopwords(s, "X") - 0 X book X X jungle + >>> s = pd.Series("Oh my God, Batman!") + >>> hero.replace_stopwords(s, "$") + 0 Oh $ God, Batman! dtype: object """ @@ -357,9 +358,9 @@ def remove_stopwords( >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero is not only for the heroes") + >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s) - 0 Texthero heroes + 0 I power! 
dtype: object Add custom words into the default list of stopwords: >>> import texthero as hero >>> from texthero import stopwords >>> import pandas as pd >>> default_stopwords = stopwords.DEFAULT - >>> custom_stopwords = default_stopwords.union(set(["heroes"])) - >>> s = pd.Series("Texthero is not only for the heroes") + >>> custom_stopwords = default_stopwords.union(set(["power"])) + >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s, custom_stopwords) - 0 Texthero + 0 I ! dtype: object
@@ -432,7 +433,7 @@ def clean(s: TextSeries, pipeline=None) -> TextSeries: of functions taking as input and returning as output a Pandas Series. If None, the default pipeline is used. - + Examples -------- For the default pipeline:
@@ -443,6 +444,7 @@ def clean(s: TextSeries, pipeline=None) -> TextSeries: >>> hero.clean(s) 0 uper 9dig aou dtype: object + """ if not pipeline:
@@ -462,12 +464,13 @@ def has_content(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["content", np.nan, "\t\n", " "]) + >>> s = pd.Series(["Flame", np.nan, "on!", "\t\n", " "]) >>> hero.has_content(s) 0 True 1 False - 2 False + 2 True 3 False + 4 False dtype: bool """
@@ -486,9 +489,10 @@ def drop_no_content(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["content", np.nan, "\t\n", " "]) + >>> s = pd.Series(["Flame", np.nan, "on!", "\t\n", " "]) >>> hero.drop_no_content(s) - 0 content + 0 Flame + 2 on! dtype: object """
@@ -505,11 +509,12 @@ def remove_round_brackets(s: TextSeries) -> TextSeries: >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero (is not a superhero!)") + >>> s = pd.Series("HULK (SMASH!)") >>> hero.remove_round_brackets(s) - 0 Texthero + 0 HULK dtype: object + See also -------- :meth:`remove_brackets`
@@ -531,11 +536,12 @@ def remove_curly_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero {is not a superhero!}") + >>> s = pd.Series("HULK {SMASH!}") >>> hero.remove_curly_brackets(s) - 0 Texthero + 0 HULK dtype: object + See also -------- :meth:`remove_brackets`
@@ -557,11 +563,12 @@ def remove_square_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero [is not a superhero!]") + >>> s = pd.Series("HULK [SMASH!]") >>> hero.remove_square_brackets(s) - 0 Texthero + 0 HULK dtype: object + See also -------- :meth:`remove_brackets`
@@ -584,11 +591,12 @@ def remove_angle_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero <is not a superhero!>") + >>> s = pd.Series("HULK <SMASH!>") >>> hero.remove_angle_brackets(s) - 0 Texthero + 0 HULK dtype: object + See also -------- :meth:`remove_brackets`
@@ -611,11 +619,12 @@ def remove_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero (round) [square] {curly} <angle>") + >>> s = pd.Series("HULK (S) [M] {A} <S> (H)") >>> hero.remove_brackets(s) - 0 Texthero + 0 HULK dtype: object + See also -------- :meth:`remove_round_brackets`
@@ -646,9 +655,9 @@ def remove_html_tags(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("<html><h1>Title</h1></html>") + >>> s = pd.Series("<html><h1>HULK</h1><p>SMASH!</p></html>") >>> hero.remove_html_tags(s) - 0 Title + 0 HULKSMASH! dtype: object """
@@ -676,11 +685,10 @@ def tokenize(s: TextSeries) -> TokenSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Today you're looking great!"]) + >>> s = pd.Series(["I am the LAW!"]) >>> hero.tokenize(s) - 0 [Today, you're, looking, great, !] + 0 [I, am, the, LAW, !] dtype: object - """ punct = string.punctuation.replace("_", "")
@@ -716,24 +724,26 @@ def phrases( Parameters ---------- s : :class:`texthero._types.TokenSeries` - + min_count : int, optional, default=5 Ignore tokens with frequency less than this. - + threshold : int, optional, default=10 Ignore tokens with a score under that threshold. - + symbol : str, optional, default="_" Character used to join collocation words. Examples -------- >>> import texthero as hero - >>> s = pd.Series([['New', 'York', 'is', 'a', 'beautiful', 'city'], - ... ['Look', ':', 'New', 'York', '!']]) + >>> s = pd.Series([['I', 'have', 'the', 'power', '!'], + ... ['I', 'am', 'Groot', '!'], + ... ['I', 'am', 'the', 'LAW', '.']]) >>> hero.phrases(s, min_count=1, threshold=1) - 0 [New_York, is, a, beautiful, city] - 1 [Look, :, New_York, !] + 0 [I, have, the, power, !] + 1 [I_am, Groot, !] + 2 [I_am, the, LAW, .] dtype: object Reference
@@ -741,7 +751,6 @@ `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_ - """ if not isinstance(s.iloc[0], list):
@@ -772,15 +781,14 @@ def replace_urls(s: TextSeries, symbol: str) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Go to: https://example.com") + >>> s = pd.Series("Find me on https://www.marvel.com/") >>> hero.replace_urls(s, "") - 0 Go to: + 0 Find me on dtype: object See also -------- :meth:`texthero.preprocessing.remove_urls` - """ pattern = r"http\S+"
@@ -798,15 +806,14 @@ def remove_urls(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Go to: https://example.com") + >>> s = pd.Series("Find me on https://www.marvel.com/") >>> hero.remove_urls(s) - 0 Go to: + 0 Find me on dtype: object See also -------- :meth:`texthero.preprocessing.replace_urls` - """ return replace_urls(s, " ")
@@ -817,7 +824,7 @@ def replace_tags(s: TextSeries, symbol: str) -> TextSeries: """Replace all tags from a given Pandas Series with symbol. A tag is a string formed by @ concatenated with a sequence of characters - and digits. Example: @texthero123. + and digits. Example: @spiderparker59. Parameters ----------
@@ -830,11 +837,10 @@ def replace_tags(s: TextSeries, symbol: str) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi @texthero123, we will replace you") + >>> s = pd.Series("Hi @spiderparker59, we will replace you") >>> hero.replace_tags(s, symbol='TAG') 0 Hi TAG, we will replace you dtype: object - """ pattern = r"@[a-zA-Z0-9]+"
@@ -847,13 +853,13 @@ def remove_tags(s: TextSeries) -> TextSeries: """Remove all tags from a given Pandas Series. A tag is a string formed by @ concatenated with a sequence of characters - and digits. Example: @texthero123. Tags are replaceb by an empty space ` `. + and digits. Example: @spiderparker59. Tags are replaced by an empty space ` `. 
Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi @tag, we will remove you") + >>> s = pd.Series("Hi @spiderparker59, we will remove you") >>> hero.remove_tags(s) 0 Hi , we will remove you dtype: object @@ -863,6 +869,7 @@ def remove_tags(s: TextSeries) -> TextSeries: :meth:`texthero.preprocessing.replace_tags` for replacing a tag with a custom symbol. """ + return replace_tags(s, " ") @@ -871,7 +878,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: """Replace all hashtags from a Pandas Series with symbol A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #spiderparker_59. Parameters ---------- @@ -879,17 +886,17 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: symbol : str Symbol to replace hashtags with. - + Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi #texthero_123, we will replace you.") + >>> s = pd.Series("Hi #spiderparker_59, we will replace you.") >>> hero.replace_hashtags(s, symbol='HASHTAG') 0 Hi HASHTAG, we will replace you. dtype: object - """ + pattern = r"#[a-zA-Z0-9_]+" return s.str.replace(pattern, symbol) @@ -899,13 +906,13 @@ def remove_hashtags(s: TextSeries) -> TextSeries: """Remove all hashtags from a given Pandas Series A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #spiderparker_59. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi #texthero_123, we will remove you.") + >>> s = pd.Series("Hi #spiderparker_59, we will remove you.") >>> hero.remove_hashtags(s) 0 Hi , we will remove you. dtype: object @@ -915,4 +922,5 @@ def remove_hashtags(s: TextSeries) -> TextSeries: :meth:`texthero.preprocessing.replace_hashtags` for replacing a hashtag with a custom symbol. """ + return replace_hashtags(s, " ") diff --git a/texthero/representation.py b/texthero/representation.py index 8bbfead..37d5fda 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1,5 +1,5 @@ """ -Map words into vectors using different algorithms such as +Map words into vectors using different algorithms such as TF-IDF, word2vec or GloVe. """ @@ -60,7 +60,7 @@ def count( """ Represent a text-based Pandas Series using count. - Rows of the returned DataFrame represent documents whereas + Rows of the returned DataFrame represent documents whereas columns are terms. The value in the cell document-term is the number of the term in this document. The output is sparse. TODO add tutorial link @@ -78,7 +78,7 @@ def count( min_df : float in range [0.0, 1.0] or int, optional, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. 
@@ -96,12 +96,12 @@ def count( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) + >>> s = pd.Series(["Batman is not evil", "Joker is evil"]).pipe(hero.tokenize) >>> hero.count(s) # doctest: +SKIP - Sentence one two - 0 1 1 0 - 1 1 0 1 - + Batman Joker evil is not + 0 1 0 1 1 1 + 1 0 1 1 1 0 + See Also -------- @@ -154,7 +154,7 @@ def term_frequency( min_df : float in range [0.0, 1.0] or int, optional, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. @@ -169,13 +169,13 @@ def term_frequency( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Text Text of doc one", "Text of of doc two", "Aha hi bnd one"]).pipe(hero.tokenize) + >>> s = pd.Series(["Batman is not evil", "Joker is evil", "Bane is evil too"]).pipe(hero.tokenize) >>> hero.term_frequency(s) # doctest: +SKIP - term_frequency - Aha Text bnd doc hi of one two - 0 0.00 0.4 0.00 0.2 0.00 0.2 0.20 0.0 - 1 0.00 0.2 0.00 0.2 0.00 0.4 0.00 0.2 - 2 0.25 0.0 0.25 0.0 0.25 0.0 0.25 0.0 + term_frequency + Bane Batman Joker evil is not too + 0 0.00 0.25 0.000000 0.250000 0.250000 0.25 0.00 + 1 0.00 0.00 0.333333 0.333333 0.333333 0.00 0.00 + 2 0.25 0.00 0.000000 0.250000 0.250000 0.00 0.25 See Also -------- @@ -217,8 +217,8 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram Different from the `sklearn-implementation of tfidf <https://scikit-learn.org/stable/modules/generated/sklearn.feature_ - extraction.text.TfidfVectorizer.html>`, this function does *not* - normalize the output in any way, so the result is exactly what you + extraction.text.TfidfVectorizer.html>`, this function does *not* + normalize the output in any way, so the result is exactly what you get applying the formula described above. The input Series should already be tokenized. If not, it will @@ -233,15 +233,15 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram min_df : float in range [0.0, 1.0] or int, optional, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. - If float, the parameter represents a proportion of documents, + If float, the parameter represents a proportion of documents, integer absolute counts. max_df : float in range [0.0, 1.0] or int, default=1.0 Ignore terms that have a document frequency (number of documents they appear in) frequency strictly higher than the given threshold. - This arguments basically permits to remove corpus-specific stop + This argument basically permits removing corpus-specific stop words. If float, the parameter represents a proportion of documents, integer absolute counts. 
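As a minimal sketch of how max_df prunes corpus-wide terms (illustrative only, not part of the patch; the exact scores follow the formula above, so the output is skipped):

>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["I am Groot", "I am Groot", "We are Groot"]).pipe(hero.tokenize)
>>> hero.tfidf(s, max_df=2)  # doctest: +SKIP

Here "Groot" has a document frequency of 3, strictly higher than max_df=2, so it is dropped from the vocabulary as a corpus-specific stopword, while "I" and "am" (document frequency 2) are kept.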
@@ -249,11 +249,11 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) # doctest: +SKIP - Bye Hi Test - 0 1.0 1.405465 0.000000 - 1 2.0 0.000000 1.405465 + >>> s = pd.Series(["I am the LAW", "I am Groot"]).pipe(hero.tokenize) + >>> hero.tfidf(s) # doctest: +SKIP + Groot I LAW am the + 0 0.000000 1.0 1.405465 1.0 1.405465 + 1 1.405465 1.0 0.000000 1.0 0.000000 See Also -------- @@ -296,18 +296,18 @@ def pca( Perform principal component analysis on the given input. Principal Component Analysis (PCA) is a statistical method that is - used to reveal where the variance in a dataset comes from. For - textual data, one could for example first represent a Series of + used to reveal where the variance in a dataset comes from. For + textual data, one could for example first represent a Series of documents using :meth:`texthero.representation.tfidf` to get a vector - representation of each document. Then, PCA can generate new vectors + representation of each document. Then, PCA can generate new vectors from the tfidf representation that showcase the differences among the documents most strongly in fewer dimensions. For example, the tfidf vectors will have length 100 if hero.tfidf was - called on a large corpus with max_features=100. Visualizing 100 + called on a large corpus with max_features=100. Visualizing 100 dimensions is hard! Using PCA with n_components=3, every document will now get a vector of length 3, and the vectors will be chosen so that - the document differences are easily visible. The corpus can now be + the document differences are easily visible. The corpus can now be visualized in 3D and we can get a good first view of the data! In general, *pca* should be called after the text has already been @@ -340,20 +340,20 @@ def pca( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football is great", - ... "Hi, I'm Texthero, who are you? Tell me!"]) + >>> s = pd.Series(["Where is my cape?!", - ... "Hi, I'm Mary Jane, your new neighbor!"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> # Attention, your results might differ due to >>> # the randomness in PCA! >>> hero.pca(s) # doctest: +SKIP document - 0 [1.5713577608669735, 1.1102230246251565e-16] - 1 [-1.5713577608669729, 1.1102230246251568e-16] + 0 [-1.7213361830752993, 2.482534153247273e-16] + 1 [1.7213361830752996, 2.4825341532472726e-16] dtype: object See also -------- - `PCA on Wikipedia + `PCA on Wikipedia <https://en.wikipedia.org/wiki/Principal_component_analysis>`_ """ @@ -378,12 +378,12 @@ def nmf( natural language processing to find clusters of similar texts (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage - of technical terms; see the example below). + of technical terms; see the example below). Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first - representation function that assigns a scalar (a weight) to each + :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each word), NMF will find n_components many topics (clusters) and calculate a vector for each document that places it correctly among the topics. 
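To go from those document vectors to a hard topic label, a common follow-up is an argmax over each vector — a minimal sketch, not part of the patch, assuming nmf's default number of components:

>>> import numpy as np
>>> import pandas as pd
>>> import texthero as hero
>>> s = pd.Series(["Saber, Weapon, trident", "Cape, Costume, Mask"])
>>> doc_topic = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency).pipe(hero.nmf)  # doctest: +SKIP
>>> doc_topic.apply(np.argmax)  # index of the dominant topic per document  # doctest: +SKIP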
@@ -404,27 +404,27 @@ def nmf( Returns ------- - Pandas Series with the vector calculated by NMF for the document in + Pandas Series with the vector calculated by NMF for the document in every cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... "Music, Violin, Orchestra", "Football, Music"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "Cape, Costume, Mask", "Saber, Cape"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe( ... hero.term_frequency ... ) >>> hero.nmf(s) # doctest: +SKIP - 0 [0.9080190347553924, 0.0] - 1 [0.0, 0.771931061231598] - 2 [0.3725409073202516, 0.31656880119331093] + 0 [0.27766260921934044, 0.0] + 1 [0.0, 0.44747079529871103] + 2 [0.10074274753704408, 0.5022865803893911] dtype: object >>> # As we can see, the third document, which >>> # is a mix of sports and music, is placed >>> # between the two axes (the topics) while - >>> # the other documents are placed right on + >>> # the other documents are placed right on >>> # one topic axis each. See also @@ -464,7 +464,7 @@ def tsne( t-distributed Stochastic Neighbor Embedding (t-SNE) is a machine learning algorithm used to visualize high-dimensional data in fewer dimensions. In natural language processing, the - high-dimensional data is usually a document-term matrix (so in + high-dimensional data is usually a document-term matrix (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf` or some other first representation function that assigns a scalar (a weight) to each word) @@ -520,13 +520,13 @@ def tsne( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... "Music, Violin, Orchestra", "Football, Music"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "Cape, Costume, Mask", "Saber, Cape"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.tsne(s, random_state=42) # doctest: +SKIP - 0 [-18.833383560180664, -276.800537109375] - 1 [-210.60179138183594, 143.00535583496094] - 2 [-478.27984619140625, -232.97410583496094] + 0 [-18.833384, -276.80054] + 1 [-210.60179, 143.00536] + 2 [-478.27985, -232.9741] dtype: object See also @@ -575,17 +575,17 @@ def kmeans( Performs K-means clustering algorithm on the given input. K-means clustering is used in natural language processing - to separate texts into k clusters (groups) + to separate texts into k clusters (groups) (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage of technical terms; the K-means algorithm uses this - to separate them into two clusters). + to separate them into two clusters). Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first + :meth:`texthero.representation.tfidf` or some other first representation function that assigns a scalar (a weight) to each - word), K-means will find k topics (clusters) and assign a topic to + word), K-means will find k topics (clusters) and assign a topic to each document. Kmeans can directly handle sparse input, so when calling kmeans on a @@ -619,16 +619,16 @@ def kmeans( Returns ------- - Pandas Series with the cluster the document was assigned to in each + Pandas Series with the cluster the document was assigned to in each cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... 
"music, violin, orchestra", - ... "football, fun, sports", "music, fun, guitar"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "cape, costume, mask", + ... "saber, power, weapon", "cape, power, trident"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe( ... hero.term_frequency ... ) @@ -644,7 +644,7 @@ def kmeans( See also -------- - `kmeans on Wikipedia + `kmeans on Wikipedia `_ """ @@ -693,8 +693,8 @@ def dbscan( Given a document-term matrix (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf` or some other first - representation function that assigns a scalar (a weight) to each - word), DBSCAN will find topics (clusters) and assign a topic to + representation function that assigns a scalar (a weight) to each + word), DBSCAN will find topics (clusters) and assign a topic to each document. DBSCAN can directly handle sparse input, so when calling dbscan on a @@ -708,7 +708,7 @@ def dbscan( The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most - important DBSCAN parameter to choose appropriately for your data + important DBSCAN parameter to choose appropriately for your data set and distance function. min_samples : int, optional, default=5 @@ -717,7 +717,7 @@ def dbscan( metric : string or callable, optional, default='euclidean' The metric to use when calculating distance between instances in a - feature array. Use + feature array. Use `sorted(sklearn.neighbors.VALID_METRICS['brute'])` to see valid options. @@ -743,17 +743,17 @@ def dbscan( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... "music, violin, orchestra", - ... "football, fun, sports", "music, enjoy, guitar"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "cape, costume, mask", + ... "saber, power, weapon", "cape, power, trident"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) - >>> hero.dbscan(s, min_samples=1, eps=4) + >>> hero.dbscan(s, min_samples=1, eps=3) 0 0 1 1 2 0 - 3 1 + 3 2 dtype: category - Categories (2, int64): [0, 1] + Categories (3, int64): [0, 1, 2] >>> # As we can see, the documents are correctly >>> # separated into topics / clusters by the algorithm >>> # and we didn't even have to say how many topics there are! @@ -825,14 +825,14 @@ def meanshift( If not given, the bandwidth is estimated. Estimating takes time at least quadratic in the number of samples - (i.e. documents). For large datasets, it’s wise to set the + (i.e. documents). For large datasets, it’s wise to set the bandwidth to a small value. bin_seeding : bool, optional, default=False If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness - corresponds to the bandwidth. Setting this option to True will + corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized. min_bin_freq : int, optional, default=1 @@ -927,9 +927,9 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser >>> import texthero as hero >>> import pandas as pd >>> col = ["a","b","c", "d"] - >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], + >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], ... 
columns=col).astype("Sparse") - >>> hero.normalize(s, norm="max") # doctest: +SKIP + >>> hero.normalize(s, norm="max") # doctest: +SKIP a b c d 0 0.250000 0.500000 0.75 1.000000 1 0.571429 0.285714 1.00 0.714286 diff --git a/texthero/visualization.py b/texthero/visualization.py index a7b2b83..5c336ef 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -64,9 +64,9 @@ def scatterplot( -------- >>> import texthero as hero >>> import pandas as pd - >>> df = pd.DataFrame(["Football, Sports, Soccer", - ... "music, violin, orchestra", "football, fun, sports", - ... "music, fun, guitar"], columns=["texts"]) + >>> df = pd.DataFrame(["Saber, Weapon, trident", + ... "cape, costume, mask", "saber, power, weapon", + ... "cape, power, trident"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) >>> df["pca"] = ( ... hero.tfidf(df["texts"]) @@ -276,13 +276,19 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: -------- >>> import pandas as pd >>> import texthero as hero - >>> s = pd.Series("one two two three three three") - >>> hero.top_words(s) - three 3 - two 2 - one 1 + >>> s = pd.Series("I believe in second chances, I believe in redemption, but, mostly, I believe in my friends.") + >>> hero.top_words(s).sort_index() # sort by index, as the order of equal counts can differ across machines + I 3 + believe 3 + but 1 + chances 1 + friends 1 + in 3 + mostly 1 + my 1 + redemption 1 + second 1 dtype: int64 - """ - # Replace all punctuation that are NOT in-between chacarters + # Replace all punctuation that is NOT in-between characters
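A usage sketch for the normalize flag in top_words' signature above (illustrative, not part of the patch; assuming it follows pandas value_counts semantics, each count is divided by the total number of words):

>>> hero.top_words(s, normalize=True).sort_index()  # doctest: +SKIP

For the Series above this would yield relative frequencies, e.g. 3/16 = 0.1875 for "believe" instead of the raw count 3.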