From d07f6b81976bb1ce881427c7020d0c36d8d094ea Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Fri, 4 Dec 2020 18:35:28 +0100 Subject: [PATCH 01/16] Replacement in preprocessing.py --- texthero/preprocessing.py | 169 +++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 86 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 88820a1..a5a871d 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -31,18 +31,18 @@ def fillna(s: TextSeries, replace_string="") -> TextSeries: >>> import texthero as hero >>> import pandas as pd >>> import numpy as np - >>> s = pd.Series(["I'm", np.NaN, pd.NA, "You're"]) + >>> s = pd.Series(["I'm", np.NaN, pd.NA, "BATMAN!"]) >>> hero.fillna(s) 0 I'm - 1 - 2 - 3 You're + 1 + 2 + 3 BATMAN! dtype: object >>> hero.fillna(s, "Missing") 0 I'm 1 Missing 2 Missing - 3 You're + 3 BATMAN! dtype: object """ @@ -54,14 +54,13 @@ def lowercase(s: TextSeries) -> TextSeries: """ Lowercase all texts in a series. - Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("This is NeW YoRk wIth upPer letters") + >>> s = pd.Series("BE thE Best you Can Be!") >>> hero.lowercase(s) - 0 this is new york with upper letters + 0 be the best you can be! dtype: object """ return s.str.lower() @@ -130,12 +129,12 @@ def remove_digits(s: TextSeries, only_blocks=True) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("7ex7hero is fun 1111") + >>> s = pd.Series("Here comes5The Fantastic 4!") >>> hero.preprocessing.remove_digits(s) - 0 7ex7hero is fun + 0 Here comes5The Fantastic ! dtype: object >>> hero.preprocessing.remove_digits(s, only_blocks=False) - 0 ex hero is fun + 0 Here comes The Fantastic ! dtype: object """ @@ -148,8 +147,8 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: Replace all punctuation with a given symbol. 
Replace all punctuation from the given - Pandas Series with a custom symbol. - It considers as punctuation characters all :data:`string.punctuation` + Pandas Series with a custom symbol. + It considers as punctuation characters all :data:`string.punctuation` symbols `!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~).` @@ -158,15 +157,15 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: s : :class:`texthero._types.TextSeries` symbol : str, optional, default=" " - Symbol to use as replacement for all string punctuation. + Symbol to use as replacement for all string punctuation. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Finnaly.") + >>> s = pd.Series("I.am.Groot!") >>> hero.replace_punctuation(s, " ") - 0 Finnaly + 0 I am Groot dtype: object """ @@ -189,9 +188,9 @@ def remove_punctuation(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Finnaly.") + >>> s = pd.Series("I.am.Groot!") >>> hero.remove_punctuation(s) - 0 Finnaly + 0 I am Groot dtype: object """ return replace_punctuation(s, " ") @@ -205,9 +204,9 @@ def _remove_diacritics(text: str) -> str: -------- >>> from texthero.preprocessing import _remove_diacritics >>> import pandas as pd - >>> text = "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس" + >>> text = "bédéphile, über, 12.89, la Guêpe, 889, shônen, اِس, اُس" >>> _remove_diacritics(text) - 'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس' + 'bedephile, uber, 12.89, la Guepe, 889, shonen, اس, اس' """ nfkd_form = unicodedata.normalize("NFKD", text) # unicodedata.combining(char) checks if the character is in @@ -229,10 +228,9 @@ def remove_diacritics(s: TextSeries) -> TextSeries: >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series( - ... "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس") + ... 
"bédéphile, über, 12.89, la Guêpe, 889, shônen, اِس, اُس") >>> hero.remove_diacritics(s)[0] - 'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس' - + 'bedephile, uber, 12.89, la Guepe, 889, shonen, اس, اس' """ return s.astype("unicode").apply(_remove_diacritics) @@ -252,9 +250,9 @@ def remove_whitespace(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Title \n Subtitle \t ...") + >>> s = pd.Series("I am the vengeance,\n I am the night,\n I am BATMAN!") >>> hero.remove_whitespace(s) - 0 Title Subtitle ... + 0 I am the vengeance, I am the night, I am BATMAN! dtype: object """ @@ -278,18 +276,18 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: Examples -------- >>> from texthero.preprocessing import _replace_stopwords - >>> s = "the book of the jungle" + >>> s = "Oh my God, Batman!" >>> symbol = "$" - >>> stopwords = ["the", "of"] + >>> stopwords = ["my"] >>> _replace_stopwords(s, stopwords, symbol) - '$ book $ $ jungle' + 'Oh $ God, Batman!' """ pattern = r"""(?x) # Set flag to allow verbose regexps - \w+(?:-\w+)* # Words with optional internal hyphens + \w+(?:-\w+)* # Words with optional internal hyphens | \s* # Any space - | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol + | [][!"#$%&'*+,-./:;<=>?@\\^():_`{|}~] # Any symbol """ return "".join(t if t not in words else symbol for t in re.findall(pattern, text)) @@ -313,15 +311,15 @@ def replace_stopwords( stopwords : Set[str], optional, default=None Set of stopwords string to remove. If not passed, - by default uses NLTK English stopwords. + by default uses NLTK English stopwords. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("the book of the jungle") - >>> hero.replace_stopwords(s, "X") - 0 X book X X jungle + >>> s = pd.Series("Oh my God, Batman!") + >>> hero.replace_stopwords(s, "$") + 0 Oh $ God, Batman! 
dtype: object """ @@ -357,9 +355,9 @@ def remove_stopwords( >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero is not only for the heroes") + >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s) - 0 Texthero heroes + 0 I power! dtype: object Add custom words into the default list of stopwords: @@ -368,10 +366,10 @@ def remove_stopwords( >>> from texthero import stopwords >>> import pandas as pd >>> default_stopwords = stopwords.DEFAULT - >>> custom_stopwords = default_stopwords.union(set(["heroes"])) - >>> s = pd.Series("Texthero is not only for the heroes") + >>> custom_stopwords = default_stopwords.union(set(["power"])) + >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s, custom_stopwords) - 0 Texthero + 0 I ! dtype: object @@ -432,7 +430,7 @@ def clean(s: TextSeries, pipeline=None) -> TextSeries: of functions taking as input and returning as output a Pandas Series. If None, the default pipeline is used. - + Examples -------- For the default pipeline: @@ -462,14 +460,14 @@ def has_content(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["content", np.nan, "\t\n", " "]) + >>> s = pd.Series(["Flame", np.nan, "on!", "\t\n", " "]) >>> hero.has_content(s) 0 True 1 False - 2 False + 2 True 3 False + 4 False dtype: bool - """ return (s.pipe(remove_whitespace) != "") & (~s.isna()) @@ -486,11 +484,11 @@ def drop_no_content(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["content", np.nan, "\t\n", " "]) + >>> s = pd.Series(["Flame", np.nan, "on!", "\t\n", " "]) >>> hero.drop_no_content(s) - 0 content + 0 Flame + 2 on! 
dtype: object - """ return s[has_content(s)] @@ -505,9 +503,9 @@ def remove_round_brackets(s: TextSeries) -> TextSeries: >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero (is not a superhero!)") + >>> s = pd.Series("HULK (SMASH!)") >>> hero.remove_round_brackets(s) - 0 Texthero + 0 HULK dtype: object See also @@ -531,9 +529,9 @@ def remove_curly_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero {is not a superhero!}") + >>> s = pd.Series("HULK {SMASH!}") >>> hero.remove_curly_brackets(s) - 0 Texthero + 0 HULK dtype: object See also @@ -557,9 +555,9 @@ def remove_square_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero [is not a superhero!]") + >>> s = pd.Series("HULK [SMASH!]") >>> hero.remove_square_brackets(s) - 0 Texthero + 0 HULK dtype: object See also @@ -584,9 +582,9 @@ def remove_angle_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero ") + >>> s = pd.Series("HULK ") >>> hero.remove_angle_brackets(s) - 0 Texthero + 0 HULK dtype: object See also @@ -611,9 +609,9 @@ def remove_brackets(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Texthero (round) [square] [curly] [angle]") + >>> s = pd.Series("HULK (S) [M] (A) [S] (H)") >>> hero.remove_brackets(s) - 0 Texthero + 0 HULK dtype: object See also @@ -646,9 +644,9 @@ def remove_html_tags(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("

Title

") + >>> s = pd.Series("

HULK

SMASH!

") >>> hero.remove_html_tags(s) - 0 Title + 0 HULKSMASH! dtype: object """ @@ -676,11 +674,10 @@ def tokenize(s: TextSeries) -> TokenSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Today you're looking great!"]) + >>> s = pd.Series(["I am the LAW!"]) >>> hero.tokenize(s) - 0 [Today, you're, looking, great, !] + 0 [I, am, the, LAW, !] dtype: object - """ punct = string.punctuation.replace("_", "") @@ -716,24 +713,26 @@ def phrases( Parameters ---------- s : :class:`texthero._types.TokenSeries` - + min_count : int, optional, default=5 Ignore tokens with frequency less than this. - + threshold : int, optional, default=10 Ignore tokens with a score under that threshold. - + symbol : str, optional, default="_" Character used to join collocation words. Examples -------- >>> import texthero as hero - >>> s = pd.Series([['New', 'York', 'is', 'a', 'beautiful', 'city'], - ... ['Look', ':', 'New', 'York', '!']]) + >>> s = pd.Series([['I', 'have', 'the', 'power', '!'], + ... ['I', 'am', 'Groot', '!'], + ['I', 'am', 'the', 'LAW', '.']]) >>> hero.phrases(s, min_count=1, threshold=1) - 0 [New_York, is, a, beautiful, city] - 1 [Look, :, New_York, !] + 0 [I, have, the, power, !] + 1 [I_am, Groot, !] + 2 [I_am, the, LAW, .] dtype: object Reference @@ -741,7 +740,6 @@ def phrases( `Mikolov, et. 
al: "Distributed Representations of Words and Phrases and their Compositionality" `_ - """ if not isinstance(s.iloc[0], list): @@ -772,15 +770,14 @@ def replace_urls(s: TextSeries, symbol: str) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Go to: https://example.com") + >>> s = pd.Series("Find me on https://www.marvel.com/") >>> hero.replace_urls(s, "") - 0 Go to: + 0 Find me on dtype: object See also -------- :meth:`texthero.preprocessing.remove_urls` - """ pattern = r"http\S+" @@ -798,15 +795,14 @@ def remove_urls(s: TextSeries) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Go to: https://example.com") + >>> s = pd.Series("Find me on https://www.marvel.com/") >>> hero.remove_urls(s) - 0 Go to: + 0 Find me on dtype: object See also -------- :meth:`texthero.preprocessing.replace_urls` - """ return replace_urls(s, " ") @@ -817,7 +813,7 @@ def replace_tags(s: TextSeries, symbol: str) -> TextSeries: """Replace all tags from a given Pandas Series with symbol. A tag is a string formed by @ concatenated with a sequence of characters - and digits. Example: @texthero123. + and digits. Example: @spiderparker59. Parameters ---------- @@ -830,11 +826,10 @@ def replace_tags(s: TextSeries, symbol: str) -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi @texthero123, we will replace you") + >>> s = pd.Series("Hi @spiderparker59, we will replace you") >>> hero.replace_tags(s, symbol='TAG') 0 Hi TAG, we will replace you dtype: object - """ pattern = r"@[a-zA-Z0-9]+" @@ -847,13 +842,13 @@ def remove_tags(s: TextSeries) -> TextSeries: """Remove all tags from a given Pandas Series. A tag is a string formed by @ concatenated with a sequence of characters - and digits. Example: @texthero123. Tags are replaceb by an empty space ` `. + and digits. Example: @spiderparker59. Tags are replaceb by an empty space ` `. 
Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi @tag, we will remove you") + >>> s = pd.Series("Hi @spiderparker59, we will remove you") >>> hero.remove_tags(s) 0 Hi , we will remove you dtype: object @@ -863,6 +858,7 @@ def remove_tags(s: TextSeries) -> TextSeries: :meth:`texthero.preprocessing.replace_tags` for replacing a tag with a custom symbol. """ + return replace_tags(s, " ") @@ -871,7 +867,7 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: """Replace all hashtags from a Pandas Series with symbol A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #spiderparker_59. Parameters ---------- @@ -879,17 +875,17 @@ def replace_hashtags(s: TextSeries, symbol: str) -> TextSeries: symbol : str Symbol to replace hashtags with. - + Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi #texthero_123, we will replace you.") + >>> s = pd.Series("Hi #spiderparker_59, we will replace you.") >>> hero.replace_hashtags(s, symbol='HASHTAG') 0 Hi HASHTAG, we will replace you. dtype: object - """ + pattern = r"#[a-zA-Z0-9_]+" return s.str.replace(pattern, symbol) @@ -899,13 +895,13 @@ def remove_hashtags(s: TextSeries) -> TextSeries: """Remove all hashtags from a given Pandas Series A hashtag is a string formed by # concatenated with a sequence of - characters, digits and underscores. Example: #texthero_123. + characters, digits and underscores. Example: #spiderparker_59. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Hi #texthero_123, we will remove you.") + >>> s = pd.Series("Hi #spiderparker_59, we will remove you.") >>> hero.remove_hashtags(s) 0 Hi , we will remove you. 
dtype: object @@ -915,4 +911,5 @@ def remove_hashtags(s: TextSeries) -> TextSeries: :meth:`texthero.preprocessing.replace_hashtags` for replacing a hashtag with a custom symbol. """ + return replace_hashtags(s, " ") From f7e3c56c2e9c5d600aea42745f1cd18d8e714435 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Fri, 4 Dec 2020 18:53:45 +0100 Subject: [PATCH 02/16] Replacement in visualization.py --- texthero/visualization.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index a7b2b83..91bf7e7 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -64,9 +64,9 @@ def scatterplot( -------- >>> import texthero as hero >>> import pandas as pd - >>> df = pd.DataFrame(["Football, Sports, Soccer", - ... "music, violin, orchestra", "football, fun, sports", - ... "music, fun, guitar"], columns=["texts"]) + >>> df = pd.DataFrame(["spider, scorpion, bat", + ... "cape, costume, armor", "mask, eye-liner, earphones", + ... "cape, whip, ant"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) >>> df["pca"] = ( ... 
hero.tfidf(df["texts"]) @@ -276,13 +276,19 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: -------- >>> import pandas as pd >>> import texthero as hero - >>> s = pd.Series("one two two three three three") + >>> s = pd.Series("I believe in second chances, I believe in redemption, but, mostly, I believe in my friends.") >>> hero.top_words(s) - three 3 - two 2 - one 1 + believe 3 + I 3 + in 3 + chances 1 + my 1 + friends 1 + mostly 1 + redemption 1 + but 1 + second 1 dtype: int64 - """ # Replace all punctuation that are NOT in-between chacarters From 3eb1a41c09454e91c1d5a6e98c4cebd5c7c20936 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Fri, 4 Dec 2020 19:01:30 +0100 Subject: [PATCH 03/16] Replacement in nlp.py --- texthero/nlp.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/texthero/nlp.py b/texthero/nlp.py index 9e8cfbe..1f502f3 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -55,10 +55,11 @@ def named_entities(s: TextSeries, package="spacy") -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Yesterday I was in NY with Bill de Blasio") + >>> s = pd.Series("Yesterday, Spider-Man met Daredevil in Queens, New-York.") >>> hero.named_entities(s)[0] # doctest: +NORMALIZE_WHITESPACE - [('Yesterday', 'DATE', 0, 9), ('NY', 'GPE', 19, 21), - ('Bill de Blasio', 'PERSON', 27, 41)] + [('Yesterday', 'DATE', 0, 9), ('Spider-Man', 'PERSON', 11, 21), + ('Daredevil', 'GPE', 26, 35), ('Queens', 'GPE', 39, 45), + ('New-York', 'GPE', 47, 55)] """ entities = [] @@ -93,9 +94,9 @@ def noun_chunks(s: TextSeries) -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("The spotted puppy is sleeping.") + >>> s = pd.Series("A little spider just bite me!") >>> hero.noun_chunks(s) - 0 [(The spotted puppy, NP, 0, 17)] + 0 [(A little spider, NP, 0, 15), (me, NP, 26, 28)] dtype: object """ @@ -130,8 +131,8 @@ def count_sentences(s: 
TextSeries) -> pd.Series: >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series( - ... ["Yesterday I was in NY with Bill de Blasio. Great story...", - ... "This is the F.B.I.! What? Open up!"]) + ... ["Yesterday, Spider-Man met Daredevil in Queens, New-York. Great story...", + ... "This is the S.H.I.E.L.D! What? Open up!"]) >>> hero.count_sentences(s) 0 2 1 3 @@ -166,7 +167,7 @@ def pos_tag(s: TextSeries) -> pd.Series: coarse-grained POS has a NOUN value, then the refined POS will give more details about the type of the noun, whether it is singular, plural and/or proper. - + You can use the spacy `explain` function to find out which fine-grained POS it is. @@ -204,11 +205,11 @@ def pos_tag(s: TextSeries) -> pd.Series: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("Today is such a beautiful day") + >>> s = pd.Series("Today is such a marvelous day") >>> print(hero.pos_tag(s)[0]) # doctest: +NORMALIZE_WHITESPACE - [('Today', 'NOUN', 'NN', 0, 5), ('is', 'AUX', 'VBZ', 6, 8), ('such', 'DET', - 'PDT', 9, 13), ('a', 'DET', 'DT', 14, 15), ('beautiful', 'ADJ', 'JJ', 16, - 25), ('day', 'NOUN', 'NN', 26, 29)] + [('Today', 'NOUN', 'NN', 0, 5), ('is', 'AUX', 'VBZ', 6, 8), + ('such', 'DET', 'PDT', 9, 13), ('a', 'DET', 'DT', 14, 15), + ('marvelous', 'ADJ', 'JJ', 16, 25), ('day', 'NOUN', 'NN', 26, 29)] """ pos_tags = [] @@ -264,9 +265,9 @@ def stem(s: TextSeries, stem="snowball", language="english") -> TextSeries: -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series("I used to go \t\n running.") + >>> s = pd.Series("I used to go \t\n flying.") >>> hero.stem(s) - 0 i use to go running. + 0 i use to go flying. 
dtype: object """ From cdb0d72d3473dde8c49ed80d3656fd773264e964 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Fri, 4 Dec 2020 19:06:16 +0100 Subject: [PATCH 04/16] Explained the matching content in CONTRIBUTING.md --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 02e29d4..df42a5c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -75,6 +75,7 @@ Texthero is there for the NLP-community. If you have an idea on how we can impro 1. Before writing a new function or make any changes, look at similar code for inspiration and to learn about the code format and style. 1. The maximal docstring line length should be 75 characters. This should be manually done as `black` formatting does not enforce limits on docstring line length. 1. Use American English instead of British English (e.g. categorize instead of categorise) when writing comments and documenting docstrings. +1. Whenever possible, use quotes or sentences from superhero comics or movies in examples, such as "HULK SMASH!", "I am Groot!", "I am the vengeance, I am the night, I am BATMAN!", or "With great power comes great responsibility." 1. For default argument values, use the defaults from the underlying library if applicable (e.g. the default arguments from sklearn if using a sklearn algorithm). If other values are used, add a small comment explaining why. Additionally, look for similar functions and use their default values. 1. 
Default values are defined as follows: `x : int, optional, default=2` From be92bb04cce84d08456e63d1c15760cd29627562 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Fri, 4 Dec 2020 20:06:32 +0100 Subject: [PATCH 05/16] Updated representation.py to match the doctests --- texthero/representation.py | 148 ++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 8bbfead..44c9b64 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1,5 +1,5 @@ """ -Map words into vectors using different algorithms such as +Map words into vectors using different algorithms such as TF-IDF, word2vec or GloVe. """ @@ -60,7 +60,7 @@ def count( """ Represent a text-based Pandas Series using count. - Rows of the returned DataFrame represent documents whereas + Rows of the returned DataFrame represent documents whereas columns are terms. The value in the cell document-term is the number of the term in this document. The output is sparse. TODO add tutorial link @@ -78,7 +78,7 @@ def count( min_df : float in range [0.0, 1.0] or int, optional, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. 
@@ -96,12 +96,12 @@ def count( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize) + >>> s = pd.Series(["Batman is not evil", "Joker is evil"]).pipe(hero.tokenize) >>> hero.count(s) # doctest: +SKIP - Sentence one two - 0 1 1 0 - 1 1 0 1 - + Batman Joker evil is not + 0 1 0 1 1 1 + 1 0 1 1 1 0 + See Also -------- @@ -154,7 +154,7 @@ def term_frequency( min_df : float in range [0.0, 1.0] or int, optional, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. If float, the parameter represents a proportion of documents, integer absolute counts. @@ -169,13 +169,13 @@ def term_frequency( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Text Text of doc one", "Text of of doc two", "Aha hi bnd one"]).pipe(hero.tokenize) + >>> s = pd.Series(["Batman is not evil", "Joker is evil", "Bane is evil too"]).pipe(hero.tokenize) >>> hero.term_frequency(s) # doctest: +SKIP - term_frequency - Aha Text bnd doc hi of one two - 0 0.00 0.4 0.00 0.2 0.00 0.2 0.20 0.0 - 1 0.00 0.2 0.00 0.2 0.00 0.4 0.00 0.2 - 2 0.25 0.0 0.25 0.0 0.25 0.0 0.25 0.0 + term_frequency + Bane Batman Joker evil is not too + 0 0.00 0.25 0.000000 0.250000 0.250000 0.25 0.00 + 1 0.00 0.00 0.333333 0.333333 0.333333 0.00 0.00 + 2 0.25 0.00 0.000000 0.250000 0.250000 0.00 0.25 See Also -------- @@ -217,8 +217,8 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram Different from the `sklearn-implementation of tfidf `, this function does *not* - normalize the output in any way, so the result is exactly what you + extraction.text.TfidfVectorizer.html>`, this function does *not* + normalize the output in any way, so the result is exactly what you get applying the formula described above. 
The input Series should already be tokenized. If not, it will @@ -233,15 +233,15 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram min_df : float in range [0.0, 1.0] or int, optional, default=1 When building the vocabulary ignore terms that have a document - frequency (number of documents they appear in) strictly + frequency (number of documents they appear in) strictly lower than the given threshold. - If float, the parameter represents a proportion of documents, + If float, the parameter represents a proportion of documents, integer absolute counts. max_df : float in range [0.0, 1.0] or int, default=1.0 Ignore terms that have a document frequency (number of documents they appear in) frequency strictly higher than the given threshold. - This arguments basically permits to remove corpus-specific stop + This arguments basically permits to remove corpus-specific stop words. If float, the parameter represents a proportion of documents, integer absolute counts. @@ -249,11 +249,11 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFram -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize) - >>> hero.tfidf(s) # doctest: +SKIP - Bye Hi Test - 0 1.0 1.405465 0.000000 - 1 2.0 0.000000 1.405465 + >>> s = pd.Series(["I am the LAW", "I am Groot"]).pipe(hero.tokenize) + >>> hero.tfidf(s) # doctest: +SKIP + Groot I LAW am the + 0 0.000000 1.0 1.405465 1.0 1.405465 + 1 1.405465 1.0 0.000000 1.0 0.000000 See Also -------- @@ -296,18 +296,18 @@ def pca( Perform principal component analysis on the given input. Principal Component Analysis (PCA) is a statistical method that is - used to reveal where the variance in a dataset comes from. For - textual data, one could for example first represent a Series of + used to reveal where the variance in a dataset comes from. 
For + textual data, one could for example first represent a Series of documents using :meth:`texthero.representation.tfidf` to get a vector - representation of each document. Then, PCA can generate new vectors + representation of each document. Then, PCA can generate new vectors from the tfidf representation that showcase the differences among the documents most strongly in fewer dimensions. For example, the tfidf vectors will have length 100 if hero.tfidf was - called on a large corpus with max_features=100. Visualizing 100 + called on a large corpus with max_features=100. Visualizing 100 dimensions is hard! Using PCA with n_components=3, every document will now get a vector of length 3, and the vectors will be chosen so that - the document differences are easily visible. The corpus can now be + the document differences are easily visible. The corpus can now be visualized in 3D and we can get a good first view of the data! In general, *pca* should be called after the text has already been @@ -340,20 +340,20 @@ def pca( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football is great", - ... "Hi, I'm Texthero, who are you? Tell me!"]) + >>> s = pd.Series(["Where is my cape?!", + ... "Hi, I'm Marie-Jane, you're new neighbour!"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) >>> # Attention, your results might differ due to >>> # the randomness in PCA! >>> hero.pca(s) # doctest: +SKIP document - 0 [1.5713577608669735, 1.1102230246251565e-16] - 1 [-1.5713577608669729, 1.1102230246251568e-16] + 0 [-1.7213361830752993, 2.482534153247273e-16] + 1 [1.7213361830752996, 2.4825341532472726e-16] dtype: object See also -------- - `PCA on Wikipedia + `PCA on Wikipedia `_ """ @@ -378,12 +378,12 @@ def nmf( natural language processing to find clusters of similar texts (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage - of technical terms; see the example below). 
+ of technical terms; see the example below). Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first - representation function that assigns a scalar (a weight) to each + :meth:`texthero.representation.tfidf` or some other first + representation function that assigns a scalar (a weight) to each word), NMF will find n_components many topics (clusters) and calculate a vector for each document that places it correctly among the topics. @@ -404,27 +404,27 @@ def nmf( Returns ------- - Pandas Series with the vector calculated by NMF for the document in + Pandas Series with the vector calculated by NMF for the document in every cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... "Music, Violin, Orchestra", "Football, Music"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "Cape, Costume, Mask", "Saber, Cape"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe( ... hero.term_frequency ... ) >>> hero.nmf(s) # doctest: +SKIP - 0 [0.9080190347553924, 0.0] - 1 [0.0, 0.771931061231598] - 2 [0.3725409073202516, 0.31656880119331093] + 0 [0.27766260921934044, 0.0] + 1 [0.0, 0.44747079529871103] + 2 [0.10074274753704408, 0.5022865803893911] dtype: object >>> # As we can see, the third document, which >>> # is a mix of sports and music, is placed >>> # between the two axes (the topics) while - >>> # the other documents are placed right on + >>> # the other documents are placed right on >>> # one topic axis each. See also @@ -464,7 +464,7 @@ def tsne( t-distributed Stochastic Neighbor Embedding (t-SNE) is a machine learning algorithm used to visualize high-dimensional data in fewer dimensions. 
In natural language processing, the - high-dimensional data is usually a document-term matrix (so in + high-dimensional data is usually a document-term matrix (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf` or some other first representation function that assigns a scalar (a weight) to each word) @@ -520,13 +520,13 @@ def tsne( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... "Music, Violin, Orchestra", "Football, Music"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "Cape, Costume, Mask", "Saber, Cape"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency) >>> hero.tsne(s, random_state=42) # doctest: +SKIP - 0 [-18.833383560180664, -276.800537109375] - 1 [-210.60179138183594, 143.00535583496094] - 2 [-478.27984619140625, -232.97410583496094] + 0 [-18.833384, -276.80054] + 1 [-210.60179, 143.00536] + 2 [-478.27985, -232.9741] dtype: object See also @@ -575,17 +575,17 @@ def kmeans( Performs K-means clustering algorithm on the given input. K-means clustering is used in natural language processing - to separate texts into k clusters (groups) + to separate texts into k clusters (groups) (e.g. some texts in a corpus might be about sports and some about music, so they will differ in the usage of technical terms; the K-means algorithm uses this - to separate them into two clusters). + to separate them into two clusters). Given a document-term matrix (so in texthero usually a Series after applying - :meth:`texthero.representation.tfidf` or some other first + :meth:`texthero.representation.tfidf` or some other first representation function that assigns a scalar (a weight) to each - word), K-means will find k topics (clusters) and assign a topic to + word), K-means will find k topics (clusters) and assign a topic to each document. 
Kmeans can directly handle sparse input, so when calling kmeans on a @@ -619,16 +619,16 @@ def kmeans( Returns ------- - Pandas Series with the cluster the document was assigned to in each + Pandas Series with the cluster the document was assigned to in each cell. Examples -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... "music, violin, orchestra", - ... "football, fun, sports", "music, fun, guitar"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "cape, costume, mask", + ... "saber, power, weapon", "cape, power, trident"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe( ... hero.term_frequency ... ) @@ -644,7 +644,7 @@ def kmeans( See also -------- - `kmeans on Wikipedia + `kmeans on Wikipedia `_ """ @@ -693,8 +693,8 @@ def dbscan( Given a document-term matrix (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf` or some other first - representation function that assigns a scalar (a weight) to each - word), DBSCAN will find topics (clusters) and assign a topic to + representation function that assigns a scalar (a weight) to each + word), DBSCAN will find topics (clusters) and assign a topic to each document. DBSCAN can directly handle sparse input, so when calling dbscan on a @@ -708,7 +708,7 @@ def dbscan( The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most - important DBSCAN parameter to choose appropriately for your data + important DBSCAN parameter to choose appropriately for your data set and distance function. min_samples : int, optional, default=5 @@ -717,7 +717,7 @@ def dbscan( metric : string or callable, optional, default='euclidean' The metric to use when calculating distance between instances in a - feature array. Use + feature array. 
Use `sorted(sklearn.neighbors.VALID_METRICS['brute'])` to see valid options. @@ -743,17 +743,17 @@ def dbscan( -------- >>> import texthero as hero >>> import pandas as pd - >>> s = pd.Series(["Football, Sports, Soccer", - ... "music, violin, orchestra", - ... "football, fun, sports", "music, enjoy, guitar"]) + >>> s = pd.Series(["Saber, Weapon, trident", + ... "cape, costume, mask", + ... "saber, power, weapon", "cape, power, trident"]) >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf) - >>> hero.dbscan(s, min_samples=1, eps=4) + >>> hero.dbscan(s, min_samples=1, eps=3) 0 0 1 1 2 0 - 3 1 + 3 2 dtype: category - Categories (2, int64): [0, 1] + Categories (3, int64): [0, 1] >>> # As we can see, the documents are correctly >>> # separated into topics / clusters by the algorithm >>> # and we didn't even have to say how many topics there are! @@ -825,14 +825,14 @@ def meanshift( If not given, the bandwidth is estimated. Estimating takes time at least quadratic in the number of samples - (i.e. documents). For large datasets, it’s wise to set the + (i.e. documents). For large datasets, it’s wise to set the bandwidth to a small value. bin_seeding : bool, optional, default=False If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness - corresponds to the bandwidth. Setting this option to True will + corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized. min_bin_freq : int, optional, default=1 @@ -927,9 +927,9 @@ def normalize(input_matrix: Union[pd.DataFrame, pd.Series], norm="l2") -> pd.Ser >>> import texthero as hero >>> import pandas as pd >>> col = ["a","b","c", "d"] - >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], + >>> s = pd.DataFrame([[1, 2, 3, 4],[4, 2, 7, 5],[2, 2, 3, 5],[1, 2, 9, 8]], ... 
columns=col).astype("Sparse") - >>> hero.normalize(s, norm="max") # doctest: +SKIP + >>> hero.normalize(s, norm="max") # doctest: +SKIP a b c d 0 0.250000 0.500000 0.75 1.000000 1 0.571429 0.285714 1.00 0.714286 From 5850f379e0bd3ed2712b9e73df8cc884116ceb5c Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Fri, 4 Dec 2020 20:12:02 +0100 Subject: [PATCH 06/16] Updated visualization.py to match with representation.py --- texthero/visualization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 91bf7e7..61d48d3 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -64,9 +64,9 @@ def scatterplot( -------- >>> import texthero as hero >>> import pandas as pd - >>> df = pd.DataFrame(["spider, scorpion, bat", - ... "cape, costume, armor", "mask, eye-liner, earphones", - ... "cape, whip, ant"], columns=["texts"]) + >>> df = pd.DataFrame(["Saber, Weapon, trident", + ... "cape, costume, mask", "saber, power, weapon", + ... "cape, power, trident"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) >>> df["pca"] = ( ... hero.tfidf(df["texts"]) From 6db47e9b254cc69003877c704a077a8dad5919cc Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 16:44:39 +0100 Subject: [PATCH 07/16] Fixed issue in doctest output --- texthero/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index a5a871d..7717d7e 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -727,8 +727,8 @@ def phrases( -------- >>> import texthero as hero >>> s = pd.Series([['I', 'have', 'the', 'power', '!'], - ... ['I', 'am', 'Groot', '!'], - ['I', 'am', 'the', 'LAW', '.']]) + ... ['I', 'am', 'Groot', '!'], + ... ['I', 'am', 'the', 'LAW', '.']]) >>> hero.phrases(s, min_count=1, threshold=1) 0 [I, have, the, power, !] 1 [I_am, Groot, !] 
From 8eeac99800e39b23beaed69e7254be4552f8e33b Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 16:50:12 +0100 Subject: [PATCH 08/16] Fixed doctest issues in both preprocessing.py and representation.py --- texthero/preprocessing.py | 14 +++++++++++--- texthero/representation.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 7717d7e..4cf72dc 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -36,13 +36,13 @@ def fillna(s: TextSeries, replace_string="") -> TextSeries: 0 I'm 1 2 - 3 BATMAN! + 3 BATMAN! dtype: object >>> hero.fillna(s, "Missing") 0 I'm 1 Missing 2 Missing - 3 BATMAN! + 3 BATMAN! dtype: object """ @@ -165,7 +165,7 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: >>> import pandas as pd >>> s = pd.Series("I.am.Groot!") >>> hero.replace_punctuation(s, " ") - 0 I am Groot + 0 I am Groot dtype: object """ @@ -441,6 +441,7 @@ def clean(s: TextSeries, pipeline=None) -> TextSeries: >>> hero.clean(s) 0 uper 9dig aou dtype: object + """ if not pipeline: @@ -468,6 +469,7 @@ def has_content(s: TextSeries) -> TextSeries: 3 False 4 False dtype: bool + """ return (s.pipe(remove_whitespace) != "") & (~s.isna()) @@ -489,6 +491,7 @@ def drop_no_content(s: TextSeries) -> TextSeries: 0 Flame 2 on! 
dtype: object + """ return s[has_content(s)] @@ -508,6 +511,7 @@ def remove_round_brackets(s: TextSeries) -> TextSeries: 0 HULK dtype: object + See also -------- :meth:`remove_brackets` @@ -534,6 +538,7 @@ def remove_curly_brackets(s: TextSeries) -> TextSeries: 0 HULK dtype: object + See also -------- :meth:`remove_brackets` @@ -560,6 +565,7 @@ def remove_square_brackets(s: TextSeries) -> TextSeries: 0 HULK dtype: object + See also -------- :meth:`remove_brackets` @@ -587,6 +593,7 @@ def remove_angle_brackets(s: TextSeries) -> TextSeries: 0 HULK dtype: object + See also -------- :meth:`remove_brackets` @@ -614,6 +621,7 @@ def remove_brackets(s: TextSeries) -> TextSeries: 0 HULK dtype: object + See also -------- :meth:`remove_round_brackets` diff --git a/texthero/representation.py b/texthero/representation.py index 44c9b64..37d5fda 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -753,7 +753,7 @@ def dbscan( 2 0 3 2 dtype: category - Categories (3, int64): [0, 1] + Categories (3, int64): [0, 1, 2] >>> # As we can see, the documents are correctly >>> # separated into topics / clusters by the algorithm >>> # and we didn't even have to say how many topics there are! From 1a7de38408522408b578056e1ffd9747c5fc7ccd Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 19:26:51 +0100 Subject: [PATCH 09/16] Fixed some formating in doctest --- texthero/preprocessing.py | 6 ++++-- texthero/visualization.py | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 4cf72dc..bdc6bd8 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -33,10 +33,12 @@ def fillna(s: TextSeries, replace_string="") -> TextSeries: >>> import numpy as np >>> s = pd.Series(["I'm", np.NaN, pd.NA, "BATMAN!"]) >>> hero.fillna(s) - 0 I'm + 0 I'm 1 2 - 3 BATMAN! + 3 BATMAN! 
+ + dtype: object dtype: object >>> hero.fillna(s, "Missing") 0 I'm diff --git a/texthero/visualization.py b/texthero/visualization.py index 61d48d3..acf391e 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -278,16 +278,16 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: >>> import texthero as hero >>> s = pd.Series("I believe in second chances, I believe in redemption, but, mostly, I believe in my friends.") >>> hero.top_words(s) + in 3 believe 3 I 3 - in 3 - chances 1 - my 1 + but 1 friends 1 mostly 1 + my 1 redemption 1 - but 1 second 1 + chances 1 dtype: int64 """ From 04ecf6302e5293bcd5076b7c9e946e4bddf9be7c Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 19:54:26 +0100 Subject: [PATCH 10/16] Fixed trailing whitespaces in doctests --- texthero/preprocessing.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index bdc6bd8..158f43c 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -37,6 +37,7 @@ def fillna(s: TextSeries, replace_string="") -> TextSeries: 1 2 3 BATMAN! + dtype: object dtype: object dtype: object @@ -167,7 +168,7 @@ def replace_punctuation(s: TextSeries, symbol: str = " ") -> TextSeries: >>> import pandas as pd >>> s = pd.Series("I.am.Groot!") >>> hero.replace_punctuation(s, " ") - 0 I am Groot + 0 I am Groot dtype: object """ @@ -192,7 +193,7 @@ def remove_punctuation(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("I.am.Groot!") >>> hero.remove_punctuation(s) - 0 I am Groot + 0 I am Groot dtype: object """ return replace_punctuation(s, " ") @@ -359,7 +360,7 @@ def remove_stopwords( >>> import pandas as pd >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s) - 0 I power! 
+ 0 I power dtype: object Add custom words into the default list of stopwords: @@ -371,7 +372,7 @@ def remove_stopwords( >>> custom_stopwords = default_stopwords.union(set(["power"])) >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s, custom_stopwords) - 0 I ! + 0 I dtype: object @@ -510,7 +511,7 @@ def remove_round_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("HULK (SMASH!)") >>> hero.remove_round_brackets(s) - 0 HULK + 0 HULK dtype: object @@ -537,7 +538,7 @@ def remove_curly_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("HULK {SMASH!}") >>> hero.remove_curly_brackets(s) - 0 HULK + 0 HULK dtype: object @@ -564,7 +565,7 @@ def remove_square_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("HULK [SMASH!]") >>> hero.remove_square_brackets(s) - 0 HULK + 0 HULK dtype: object @@ -592,7 +593,7 @@ def remove_angle_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("HULK ") >>> hero.remove_angle_brackets(s) - 0 HULK + 0 HULK dtype: object @@ -620,7 +621,7 @@ def remove_brackets(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("HULK (S) [M] (A) [S] (H)") >>> hero.remove_brackets(s) - 0 HULK + 0 HULK dtype: object @@ -807,7 +808,7 @@ def remove_urls(s: TextSeries) -> TextSeries: >>> import pandas as pd >>> s = pd.Series("Find me on https://www.marvel.com/") >>> hero.remove_urls(s) - 0 Find me on + 0 Find me on dtype: object See also From e0d889f542df573d359c6123ec89bee72fbb5741 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 19:58:09 +0100 Subject: [PATCH 11/16] Fixed trailing whitespace in doctest --- texthero/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 158f43c..e0a8895 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -34,8 +34,8 @@ def fillna(s: TextSeries, 
replace_string="") -> TextSeries: >>> s = pd.Series(["I'm", np.NaN, pd.NA, "BATMAN!"]) >>> hero.fillna(s) 0 I'm - 1 - 2 + 1 + 2 3 BATMAN! dtype: object From 9250b3e2c3720c0bb03d4aad2db92f65dac2e733 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 19:58:37 +0100 Subject: [PATCH 12/16] Fixed an error in replace_stopwords example --- texthero/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index e0a8895..7e9ec0e 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -322,7 +322,7 @@ def replace_stopwords( >>> import pandas as pd >>> s = pd.Series("Oh my God, Batman!") >>> hero.replace_stopwords(s, "$") - 0 Oh $ God, Batman! + 0 Oh $ God Batman dtype: object """ From 60c57406af0e8d4a2628ea2e7de7effbd4bfcbd4 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 20:01:59 +0100 Subject: [PATCH 13/16] Fixed an output doctext issue in preprocessing.py --- texthero/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 7e9ec0e..f01c26b 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -283,7 +283,7 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: >>> symbol = "$" >>> stopwords = ["my"] >>> _replace_stopwords(s, stopwords, symbol) - 'Oh $ God, Batman!' 
+ 'Oh $ God Batman' """ From e6f1b155b6cd6a6cf054466fca2f5311779c0b30 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 20:11:05 +0100 Subject: [PATCH 14/16] Tried to solve the order error in visualization.py --- texthero/visualization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index acf391e..5c336ef 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -277,17 +277,17 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: >>> import pandas as pd >>> import texthero as hero >>> s = pd.Series("I believe in second chances, I believe in redemption, but, mostly, I believe in my friends.") - >>> hero.top_words(s) - in 3 - believe 3 + >>> hero.top_words(s).sort_index() # sorted by index as the Series object does not have the same order on different machines I 3 + believe 3 but 1 + chances 1 friends 1 + in 3 mostly 1 my 1 redemption 1 second 1 - chances 1 dtype: int64 """ From 0398f2319cdb14acd19ccca4570dbec4fd78775f Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 20:17:12 +0100 Subject: [PATCH 15/16] Performed tests with Python 3.8 to test the c-i results --- texthero/preprocessing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index f01c26b..e6877e7 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -283,7 +283,7 @@ def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str: >>> symbol = "$" >>> stopwords = ["my"] >>> _replace_stopwords(s, stopwords, symbol) - 'Oh $ God Batman' + 'Oh $ God, Batman!' """ @@ -322,7 +322,7 @@ def replace_stopwords( >>> import pandas as pd >>> s = pd.Series("Oh my God, Batman!") >>> hero.replace_stopwords(s, "$") - 0 Oh $ God Batman + 0 'Oh $ God, Batman!' 
dtype: object """ @@ -360,7 +360,7 @@ def remove_stopwords( >>> import pandas as pd >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s) - 0 I power + 0 I power! dtype: object Add custom words into the default list of stopwords: @@ -372,7 +372,7 @@ def remove_stopwords( >>> custom_stopwords = default_stopwords.union(set(["power"])) >>> s = pd.Series("I have the power!") >>> hero.remove_stopwords(s, custom_stopwords) - 0 I + 0 I ! dtype: object From 07b4afca417d00bf444ce15d790630b8c5eab9e1 Mon Sep 17 00:00:00 2001 From: k0pernicus <> Date: Sat, 5 Dec 2020 20:23:50 +0100 Subject: [PATCH 16/16] Fixed comma issue in doctest --- texthero/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index e6877e7..290f2ef 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -322,7 +322,7 @@ def replace_stopwords( >>> import pandas as pd >>> s = pd.Series("Oh my God, Batman!") >>> hero.replace_stopwords(s, "$") - 0 'Oh $ God, Batman!' + 0 Oh $ God, Batman! dtype: object """