Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
fa342a9
added MultiIndex DF support
mk2510 Aug 18, 2020
59a9f8c
beginning with tests
henrifroese Aug 19, 2020
19c52de
implemented correct sparse support
mk2510 Aug 19, 2020
66e566c
Merge branch 'master_upstream' into change_representation_to_multicolumn
mk2510 Aug 21, 2020
41f55a8
added back list() and rm .tolist()
mk2510 Aug 21, 2020
217611a
rm .tolist() and added list()
mk2510 Aug 21, 2020
6a3b56d
Adopted the test to the new dataframes
mk2510 Aug 21, 2020
b8ff561
wrong format
mk2510 Aug 21, 2020
e3af2f9
Address most review comments.
henrifroese Aug 21, 2020
77ad80e
Add more unittests for representation
henrifroese Aug 21, 2020
f7eb7c3
- Update _types.py with DocumentTermDF
henrifroese Aug 22, 2020
4937a4f
Fix DocumentTermDF example DataFrame column names
henrifroese Aug 22, 2020
5fc720c
Implement hero.describe
henrifroese Aug 26, 2020
55dcd7f
Change hero.describe to return DataFrame for pretty-printing in Noteb…
henrifroese Aug 26, 2020
f3bbc08
Auto stash before merge of "hero_describe_function" and "origin/hero_…
mk2510 Aug 26, 2020
9e72c85
Add tests for hero.describe
mk2510 Aug 26, 2020
bdaaa84
Merge branch 'master_upstream' into hero_describe_function
mk2510 Sep 22, 2020
9aef332
Improve docstring
henrifroese Apr 4, 2021
bc1bc4c
Fix gensim version
henrifroese Apr 4, 2021
9e2c6b7
Merge branch 'master' into hero_describe_function
henrifroese Apr 4, 2021
f19dd50
Fix spacy version
henrifroese Apr 4, 2021
f2e4194
Merge remote-tracking branch 'origin/hero_describe_function' into her…
henrifroese Apr 4, 2021
67a05e2
Fix categorical tests
henrifroese Apr 4, 2021
942fb9f
Set gensim and spacy versions
henrifroese Apr 4, 2021
296a1d8
fix formatting
henrifroese Apr 4, 2021
adf96e8
Merge remote-tracking branch 'upstream/master' into hero_describe_fun…
henrifroese Apr 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,60 @@ def test_remove_hashtags(self):
s_true = pd.Series("Hi , we will remove you")

self.assertEqual(preprocessing.remove_hashtags(s), s_true)

"""
Test describe DataFrame
"""

def test_describe(self):
    """
    hero.describe on a text Series plus a label Series should return a
    MultiIndex'ed one-column DataFrame with document statistics and the
    label distribution (NaN/NA labels are excluded by value_counts).
    """
    df = pd.DataFrame(
        [
            ["here here here here go", "sport"],
            ["There There There", "sport"],
            ["Test, Test, Test, Test, Test, Test, Test, Test", "sport"],
            [np.nan, "music"],
            ["super super", pd.NA],
            [pd.NA, pd.NA],
            ["great great great great great", "music"],
        ],
        columns=["text", "topics"],
    )
    df_description = preprocessing.describe(df["text"], df["topics"])
    df_true = pd.DataFrame(
        [
            7,
            7,
            2,
            ["Test", "great", "here", "There", "super", "go"],
            ["test", "great", "super", "go"],
            6.0,
            2.0,
            15.0,
            5.196152422706632,
            3.0,
            5.0,
            5.0,
            0.6,
            0.4,
        ],
        columns=["Value"],
        index=pd.MultiIndex.from_tuples(
            [
                ("number of documents", ""),
                ("number of unique documents", ""),
                ("number of missing documents", ""),
                ("most common words", ""),
                ("most common words excluding stopwords", ""),
                ("average document length", ""),
                ("length of shortest document", ""),
                ("length of longest document", ""),
                ("standard deviation of document lengths", ""),
                ("25th percentile document lengths", ""),
                ("50th percentile document lengths", ""),
                ("75th percentile document lengths", ""),
                ("label distribution", "sport"),
                ("label distribution", "music"),
            ]
        ),
    )
    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
    # rtol/atol give the equivalent "about 3 decimal places" tolerance.
    pd.testing.assert_frame_equal(df_description, df_true, rtol=1e-3, atol=1e-3)
1 change: 1 addition & 0 deletions tests/test_representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ def test_dim_reduction_and_clustering_with_vector_series_input(
rtol=0.1,
atol=0.1,
check_category_order=False,
check_categorical=False,
)

@parameterized.expand(test_cases_dim_reduction_and_clustering)
Expand Down
110 changes: 110 additions & 0 deletions texthero/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd

from texthero._types import TokenSeries, TextSeries, InputSeries
from texthero import visualization

from typing import List, Callable, Union

Expand Down Expand Up @@ -916,3 +917,112 @@ def remove_hashtags(s: TextSeries) -> TextSeries:
with a custom symbol.
"""
return replace_hashtags(s, " ")


@InputSeries(TextSeries)
def describe(s: TextSeries, s_labels: pd.Series = None) -> pd.DataFrame:
    """
    Describe a given pandas TextSeries (consisting of strings
    in every cell). Additionally gather information
    about class labels if they are given in s_labels.

    In the output, all document lengths are in number of *tokens*
    (as produced by :meth:`texthero.preprocessing.tokenize`),
    not characters. Documents without content (missing / empty)
    are excluded from the length statistics.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`
        The series with texts to describe.

    s_labels : pd.Series
        Series with labels for s.

    Returns
    -------
    pd.DataFrame
        One-column ("Value") DataFrame with a (Multi)Index of statistic
        names; label-distribution rows are added when s_labels is given.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> df = pd.read_csv(
    ...    "https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv"
    ... ) # doctest: +SKIP
    >>> df.head(2) # doctest: +SKIP
                                                    text      topic
    0  Claxton hunting first major medal\n\nBritish h...  athletics
    1  O'Sullivan could run in Worlds\n\nSonia O'Sull...  athletics
    >>> # Describe both the text and the labels
    >>> hero.describe(df["text"], df["topic"]) # doctest: +SKIP
                                                                                                Value
    number of documents                                                                           737
    number of unique documents                                                                    727
    number of missing documents                                                                     0
    most common words                                        [the, to, a, in, and, of, for, ", I, is]
    most common words excluding stopwords           [said, first, england, game, one, year, two, w...
    average document length                                                                   387.803
    length of shortest document                                                                   119
    length of longest document                                                                   1855
    standard deviation of document lengths                                                    210.728
    25th percentile document lengths                                                              241
    50th percentile document lengths                                                              340
    75th percentile document lengths                                                              494
    label distribution                     football                                          0.359566
                                           rugby                                             0.199457
                                           cricket                                            0.16825
                                           athletics                                         0.137042
                                           tennis                                            0.135685
    """
    # Get values we need for several calculations.
    description = {}
    s_tokenized = tokenize(s)
    has_content_mask = has_content(s)
    # Document length = number of tokens; skip missing/empty documents so
    # they do not distort the statistics.
    document_lengths = s_tokenized[has_content_mask].map(len)
    document_lengths_description = document_lengths.describe()

    # Collect statistics.
    description["number of documents"] = len(s.index)
    description["number of unique documents"] = len(s.unique())
    description["number of missing documents"] = (~has_content_mask).sum()
    description["most common words"] = visualization.top_words(s).index[:10].tolist()
    description["most common words excluding stopwords"] = (
        s.pipe(clean).pipe(visualization.top_words).index[:10].tolist()
    )

    description["average document length"] = document_lengths_description["mean"]
    description["length of shortest document"] = document_lengths_description["min"]
    description["length of longest document"] = document_lengths_description["max"]
    description[
        "standard deviation of document lengths"
    ] = document_lengths_description["std"]
    description["25th percentile document lengths"] = document_lengths_description[
        "25%"
    ]
    description["50th percentile document lengths"] = document_lengths_description[
        "50%"
    ]
    description["75th percentile document lengths"] = document_lengths_description[
        "75%"
    ]

    # Create output Series.
    s_description = pd.Series(description)

    # Potentially add information about label distribution.
    if s_labels is not None:
        # normalize=True yields relative frequencies directly; NaN labels
        # are dropped by value_counts, same as the manual division.
        s_labels_distribution = s_labels.value_counts(normalize=True)

        # Put the labels distribution into s_description with multiindex to look nice.
        s_labels_distribution.index = pd.MultiIndex.from_product(
            [["label distribution"], s_labels_distribution.index.values]
        )

        s_description.index = pd.MultiIndex.from_product(
            [s_description.index.values, [""]]
        )

        s_description = pd.concat([s_description, s_labels_distribution])

    # DataFrame will look much nicer for users when printing.
    df_description = pd.DataFrame(
        s_description.values, index=s_description.index, columns=["Value"]
    )
    df_description.index.name = "Statistic"

    return df_description