Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
fa342a9
added MultiIndex DF support
mk2510 Aug 18, 2020
59a9f8c
beginning with tests
henrifroese Aug 19, 2020
19c52de
implemented correct sparse support
mk2510 Aug 19, 2020
66e566c
Merge branch 'master_upstream' into change_representation_to_multicolumn
mk2510 Aug 21, 2020
41f55a8
added back list() and rm .tolist()
mk2510 Aug 21, 2020
217611a
rm .tolist() and added list()
mk2510 Aug 21, 2020
6a3b56d
Adopted the test to the new dataframes
mk2510 Aug 21, 2020
b8ff561
wrong format
mk2510 Aug 21, 2020
e3af2f9
Address most review comments.
henrifroese Aug 21, 2020
77ad80e
Add more unittests for representation
henrifroese Aug 21, 2020
f7eb7c3
- Update _types.py with DocumentTermDF
henrifroese Aug 22, 2020
4937a4f
Fix DocumentTermDF example DataFrame column names
henrifroese Aug 22, 2020
5fc720c
Implement hero.describe
henrifroese Aug 26, 2020
55dcd7f
Change hero.describe to return DataFrame for pretty-printing in Noteb…
henrifroese Aug 26, 2020
f3bbc08
Auto stash before merge of "hero_describe_function" and "origin/hero_…
mk2510 Aug 26, 2020
9e72c85
Add tests for hero.describe
mk2510 Aug 26, 2020
bdaaa84
Merge branch 'master_upstream' into hero_describe_function
mk2510 Sep 22, 2020
9aef332
Improve docstring
henrifroese Apr 4, 2021
bc1bc4c
Fix gensim version
henrifroese Apr 4, 2021
9e2c6b7
Merge branch 'master' into hero_describe_function
henrifroese Apr 4, 2021
f19dd50
Fix spacy version
henrifroese Apr 4, 2021
f2e4194
Merge remote-tracking branch 'origin/hero_describe_function' into her…
henrifroese Apr 4, 2021
67a05e2
Fix categorical tests
henrifroese Apr 4, 2021
942fb9f
Set gensim and spacy versions
henrifroese Apr 4, 2021
296a1d8
fix formatting
henrifroese Apr 4, 2021
adf96e8
Merge remote-tracking branch 'upstream/master' into hero_describe_fun…
henrifroese Apr 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,60 @@ def test_remove_hashtags(self):
s_true = pd.Series("Hi , we will remove you")

self.assertEqual(preprocessing.remove_hashtags(s), s_true)

"""
Test describe DataFrame
"""

def test_describe(self):
    """
    hero.describe on a text Series plus a label Series should return a
    MultiIndex'ed one-column DataFrame with document statistics and the
    label distribution (NaN/NA labels are excluded by value_counts).
    """
    df = pd.DataFrame(
        [
            ["here here here here go", "sport"],
            ["There There There", "sport"],
            ["Test, Test, Test, Test, Test, Test, Test, Test", "sport"],
            [np.nan, "music"],
            ["super super", pd.NA],
            [pd.NA, pd.NA],
            ["great great great great great", "music"],
        ],
        columns=["text", "topics"],
    )
    df_description = preprocessing.describe(df["text"], df["topics"])
    df_true = pd.DataFrame(
        [
            7,
            7,
            2,
            ["Test", "great", "here", "There", "super", "go"],
            ["test", "great", "super", "go"],
            6.0,
            2.0,
            15.0,
            5.196152422706632,
            3.0,
            5.0,
            5.0,
            0.6,
            0.4,
        ],
        columns=["Value"],
        index=pd.MultiIndex.from_tuples(
            [
                ("number of documents", ""),
                ("number of unique documents", ""),
                ("number of missing documents", ""),
                ("most common words", ""),
                ("most common words excluding stopwords", ""),
                ("average document length", ""),
                ("length of shortest document", ""),
                ("length of longest document", ""),
                ("standard deviation of document lengths", ""),
                ("25th percentile document lengths", ""),
                ("50th percentile document lengths", ""),
                ("75th percentile document lengths", ""),
                ("label distribution", "sport"),
                ("label distribution", "music"),
            ]
        ),
    )
    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
    # rtol/atol give the equivalent "about 3 decimal places" tolerance.
    pd.testing.assert_frame_equal(df_description, df_true, rtol=1e-3, atol=1e-3)
1 change: 1 addition & 0 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ def test_dim_reduction_and_clustering_with_vector_series_input(
rtol=0.1,
atol=0.1,
check_category_order=False,
check_categorical=False,
)

@parameterized.expand(test_cases_dim_reduction_and_clustering)
Expand Down
110 changes: 110 additions & 0 deletions texthero/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd

from texthero._types import TokenSeries, TextSeries, InputSeries
from texthero import visualization

from typing import List, Callable, Union

Expand Down Expand Up @@ -916,3 +917,112 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
with a custom symbol.
"""
return replace_hashtags(s, " ")


@InputSeries(TextSeries)
def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.DataFrame:
    """
    Describe a given pandas TextSeries (consisting of strings
    in every cell). Additionally gather information
    about class labels if they are given in s_labels.

    In the output, all document lengths are in number of *tokens*
    (as produced by :meth:`texthero.preprocessing.tokenize`),
    not characters. Documents without content (missing / empty)
    are excluded from the length statistics.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`
        The series with texts to describe.

    s_labels : pd.Series
        Series with labels for s.

    Returns
    -------
    pd.DataFrame
        One-column ("Value") DataFrame with a (Multi)Index of statistic
        names; label-distribution rows are added when s_labels is given.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> df = pd.read_csv(
    ...    "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv"
    ... ) # doctest: +SKIP
    >>> df.head(2) # doctest: +SKIP
                                                    text      topic
    0  Claxton hunting first major medal\n\nBritish h...  athletics
    1  O'Sullivan could run in Worlds\n\nSonia O'Sull...  athletics
    >>> # Describe both the text and the labels
    >>> hero.describe(df["text"], df["topic"]) # doctest: +SKIP
                                                                                                Value
    number of documents                                                                           737
    number of unique documents                                                                    727
    number of missing documents                                                                     0
    most common words                                        [the, to, a, in, and, of, for, ", I, is]
    most common words excluding stopwords           [said, first, england, game, one, year, two, w...
    average document length                                                                   387.803
    length of shortest document                                                                   119
    length of longest document                                                                   1855
    standard deviation of document lengths                                                    210.728
    25th percentile document lengths                                                              241
    50th percentile document lengths                                                              340
    75th percentile document lengths                                                              494
    label distribution                     football                                          0.359566
                                           rugby                                             0.199457
                                           cricket                                            0.16825
                                           athletics                                         0.137042
                                           tennis                                            0.135685
    """
    # Get values we need for several calculations.
    description = {}
    s_tokenized = tokenize(s)
    has_content_mask = has_content(s)
    # Document length = number of tokens; skip missing/empty documents so
    # they do not distort the statistics.
    document_lengths = s_tokenized[has_content_mask].map(len)
    document_lengths_description = document_lengths.describe()

    # Collect statistics.
    description["number of documents"] = len(s.index)
    description["number of unique documents"] = len(s.unique())
    description["number of missing documents"] = (~has_content_mask).sum()
    description["most common words"] = visualization.top_words(s).index[:10].tolist()
    description["most common words excluding stopwords"] = (
        s.pipe(clean).pipe(visualization.top_words).index[:10].tolist()
    )

    description["average document length"] = document_lengths_description["mean"]
    description["length of shortest document"] = document_lengths_description["min"]
    description["length of longest document"] = document_lengths_description["max"]
    description[
        "standard deviation of document lengths"
    ] = document_lengths_description["std"]
    description["25th percentile document lengths"] = document_lengths_description[
        "25%"
    ]
    description["50th percentile document lengths"] = document_lengths_description[
        "50%"
    ]
    description["75th percentile document lengths"] = document_lengths_description[
        "75%"
    ]

    # Create output Series.
    s_description = pd.Series(description)

    # Potentially add information about label distribution.
    if s_labels is not None:
        # normalize=True yields relative frequencies directly; NaN labels
        # are dropped by value_counts, same as the manual division.
        s_labels_distribution = s_labels.value_counts(normalize=True)

        # Put the labels distribution into s_description with multiindex to look nice.
        s_labels_distribution.index = pd.MultiIndex.from_product(
            [["label distribution"], s_labels_distribution.index.values]
        )

        s_description.index = pd.MultiIndex.from_product(
            [s_description.index.values, [""]]
        )

        s_description = pd.concat([s_description, s_labels_distribution])

    # DataFrame will look much nicer for users when printing.
    df_description = pd.DataFrame(
        s_description.values, index=s_description.index, columns=["Value"]
    )
    df_description.index.name = "Statistic"

    return df_description