Skip to content
Open
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
env: PATH=/c/Python38:/c/Python38/Scripts:$PATH
install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black
- pip3 install black==19.10b0
- pip3 install ".[dev]" .
# 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows
# 'python3' is a 'command not found' error on Windows but 'py' works on Windows only
Expand Down
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,12 @@ install_requires =
unidecode>=1.1.1
gensim>=3.6.0
matplotlib>=3.1.0
jinja2>=2.11.1

# TODO pick the correct version.
[options.extras_require]
dev =
black>=19.10b0
black==19.10b0
pytest>=4.0.0
Sphinx>=3.0.3
sphinx-markdown-builder>=0.5.4
Expand Down
8 changes: 8 additions & 0 deletions tests/test_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,11 @@ def test_top_words_digits_punctuation(self):
def test_wordcloud(self):
s = pd.Series("one two three")
self.assertEqual(visualization.wordcloud(s), None)

"""
Test show_dataframe.
"""

def test_show_dataframe(self):
    # With return_HTML=True the function returns before the notebook /
    # browser branches, so only the rendered value needs checking.
    frame = pd.DataFrame([["Test", 0.5], ["ja", 0.3]])
    rendered = visualization.show_dataframe(frame, return_HTML=True)
    self.assertIsNotNone(rendered)
3 changes: 3 additions & 0 deletions texthero/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@
from .nlp import *

from . import stopwords

from . import visualization_server
from .visualization_server import *
80 changes: 79 additions & 1 deletion texthero/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,24 @@
Visualize insights and statistics of a text-based Pandas DataFrame.
"""

import os
import pandas as pd
import numpy as np
import plotly.express as px
import warnings

from wordcloud import WordCloud

from texthero import preprocessing
from texthero._types import TextSeries, InputSeries
import string
from texthero.visualization_server import _display_df_browser
from texthero import visualization_server

from matplotlib.colors import LinearSegmentedColormap as lsg
import matplotlib.pyplot as plt

from collections import Counter
import string


def scatterplot(
Expand Down Expand Up @@ -306,3 +310,77 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series:
.explode() # one word for each line
.value_counts(normalize=normalize)
)


def show_dataframe(
    df: pd.DataFrame, notebook=True, ip="127.0.0.1", port=8888, return_HTML=False
):
    """
    Visualize a Pandas DataFrame.

    To embed the visualization inside
    a Jupyter Notebook (e.g. Google Colab, Kaggle),
    set `notebook=True` (default). To visualize
    in a separate browser window, set it to
    False.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to visualize.

    notebook : bool, default to True
        Whether to visualize inside the
        current Jupyter Notebook or in
        a separate browser window.

    ip : string, default = '127.0.0.1'
        The ip address used for the local server.
        Ignored when notebook is set to True.

    port : int, default = 8888
        The port number to use for the local server.
        If already in use,
        a nearby open port will be found.
        Ignored when notebook is set to True.

    return_HTML : bool, default to False
        Whether to return the generated HTML
        instead of visualizing it.

    Returns
    -------
    The generated HTML when `return_HTML` is True.
    With `notebook=True`, whatever IPython's `display`
    returns, or None when IPython is not installed.
    None when the DataFrame is served to a browser.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") # doctest: +SKIP
    >>> hero.show_dataframe(df) # doctest: +SKIP

    """

    if return_HTML:
        return visualization_server.data_to_html(df)

    if not notebook:
        _display_df_browser(
            df, ip=ip, port=port,
        )
        return None

    # Try to check whether the user is in a notebook.
    # (Not a safe check.) Only the missing-name case is
    # caught, so unrelated errors are not hidden.
    try:
        __IPYTHON__
    except NameError:
        warnings.warn(
            "You do not appear to be inside"
            " a Jupyter Notebook. Set"
            " notebook=False to show the visualization."
            " If you can already see the visualization, "
            " ignore this warning.",
            RuntimeWarning,
        )

    # Import separately from the check above: the check failing must
    # not leave the display helpers undefined (that used to crash with
    # an UnboundLocalError right after the warning).
    try:
        from IPython import display as ipython_display
    except ImportError:
        # Nothing can be rendered inline without IPython; the warning
        # above already tells the user to pass notebook=False.
        return None

    return ipython_display.display(
        ipython_display.HTML(visualization_server.data_to_html(df))
    )
9 changes: 9 additions & 0 deletions texthero/visualization_server/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Submodule for our more complex visualizations that
run interactively.
"""

from ._display import *
from ._display import _display_df_browser

from ._server import *
112 changes: 112 additions & 0 deletions texthero/visualization_server/_display.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
Module to display our visualizations interactively
inside a Notebook / Browser.

This file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_display.py
Copyright (c) 2013, Jake Vanderplas.
It was adapted for pyLDAvis by Ben Mabey.
It was then adapted for Texthero.
"""

import json
import jinja2
from ._server import serve


# Our HTML template. We use jinja2
# to programmatically insert the
# data we want to visualize
# in the function data_to_html
# below.
#
# The template expects one variable, `df_json`: a JavaScript string
# literal holding the table markup. It is injected into #tablediv and
# the element with id "tableID" is then turned into a DataTable.
HTML_TEMPLATE = jinja2.Template(
    r"""
<!DOCTYPE html>
<html lang="en">
<head>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
<link href="https://cdn.datatables.net/1.10.21/css/jquery.dataTables.min.css" rel="stylesheet">
</head>

<body>
<div class="container">
<div class="header">
<h5 class="text-muted"></h5>
</div>

<div>
<div id="tablediv"></div>
</div>
</div>
<script src="https://code.jquery.com/jquery-3.5.1.js" type="text/javascript"></script>
<script src="https://cdn.datatables.net/1.10.21/js/jquery.dataTables.min.js" type="text/javascript"></script>
<script src="https://cdn.datatables.net/plug-ins/1.10.21/dataRender/ellipsis.js" type="text/javascript"></script>
<script type="text/javascript">



$(document).ready(function () {
$("#tablediv").html({{ df_json }});
var table = $("#tableID").DataTable({
columnDefs: [ {
targets: 0,
render: $.fn.dataTable.render.ellipsis(260, true, true)
} ]
});
});

</script>
</body>

</html>
"""
)


def data_to_html(df):
    """
    Output HTML with embedded visualization
    of the DataFrame df.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to render. Its index is not included
        in the generated table.

    Returns
    -------
    The rendered HTML_TEMPLATE with the table markup
    embedded as a JavaScript string.

    """
    # Render the table with the Bootstrap classes and the element id that
    # the template's script looks up ("#tableID"). The id is passed through
    # the dedicated `table_id` parameter instead of being injected into the
    # `classes` string with a stray quote.
    table_html = df.to_html(
        classes="table table-hover",
        table_id="tableID",
        index=False,
        justify="left",
        border=0,
    )

    # json.dumps turns the markup into a quoted, escaped string literal,
    # which is what the template places inside the .html(...) call.
    return HTML_TEMPLATE.render(df_json=json.dumps(table_html))


def _display_df_browser(
    df, ip="127.0.0.1", port=8888,
):
    """
    Render the DataFrame `df` to HTML and serve
    it through the local visualization server.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to visualize.

    ip : string, default = '127.0.0.1'
        The ip address used for the local server

    port : int, default = 8888
        The port number to use for the local server.
        If already in use,
        a nearby open port will be found.

    """
    # Build the page and hand it straight to the server.
    serve(data_to_html(df), ip=ip, port=port)
93 changes: 93 additions & 0 deletions texthero/visualization_server/_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# this file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_server.py
# Copyright (c) 2013, Jake Vanderplas
"""
Simple server used to serve our visualizations in a web browser.
"""
from http import server
import sys
import threading
import webbrowser
import socket


def generate_handler(html):
    """
    Generate handler that only
    serves our generated html.

    Parameters
    ----------
    html : string
        The page returned for GET requests to "/".

    Returns
    -------
    A BaseHTTPRequestHandler subclass that answers "/" with
    the page and every other path with a 404 error.
    """

    # Encode once up front: the page is the same for every request.
    body = html.encode("utf-8")

    class MyHandler(server.BaseHTTPRequestHandler):
        def do_GET(self):
            """Respond to a GET request."""
            if self.path != "/":
                self.send_error(404)
                return

            self.send_response(200)
            # Declare the charset the body is actually encoded with, so
            # the browser does not have to guess for non-ASCII content.
            self.send_header("Content-type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

    return MyHandler


def find_open_port(ip, port, n=50):
    """
    Find an open port near the specified port.

    Parameters
    ----------
    ip : string
        Address on which the candidate ports are probed.

    port : int
        First port to try.

    n : int, default = 50
        Number of consecutive ports to try, starting at `port`.

    Returns
    -------
    The first probed port on which a connection attempt did not succeed.

    Raises
    ------
    ValueError
        If none of the probed ports is open.
    """

    # TCP ports end at 65535; probing beyond that raises OverflowError
    # instead of reaching the ValueError below.
    stop = min(port + n, 65536)

    for candidate in range(port, stop):
        # The context manager closes the probe socket even if
        # connect_ex itself raises (e.g. for an unresolvable address).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            # connect_ex returns 0 only when the connection succeeded,
            # i.e. something is already listening on that port.
            in_use = probe.connect_ex((ip, candidate)) == 0
        if not in_use:
            return candidate

    raise ValueError("no open ports found")


def serve(
    html, ip="127.0.0.1", port=8888, open_browser=True,
):
    """
    Start a server serving the given HTML, and (optionally) open a
    browser.

    The call blocks until the server is interrupted
    (KeyboardInterrupt or SystemExit).

    Parameters
    ----------
    html : string
        HTML to serve

    ip : string (default = '127.0.0.1')
        ip address at which the HTML will be served.

    port : int (default = 8888)
        the port at which to serve the HTML. The port actually used is
        the one returned by find_open_port, which tries up to 50 ports
        starting here.

    open_browser : bool (optional)
        if True (default), then open a web browser to the given HTML
    """

    port = find_open_port(ip, port, n=50)
    Handler = generate_handler(html)

    srvr = server.HTTPServer((ip, port), Handler)

    try:
        # Start the server
        print("Serving to http://{0}:{1}/ [Ctrl-C to exit]".format(ip, port))
        sys.stdout.flush()

        if open_browser:
            # Use a thread to open a web browser pointing to the server,
            # so the browser launch does not delay serve_forever().
            url = "http://{0}:{1}".format(ip, port)
            threading.Thread(target=webbrowser.open, args=(url,)).start()

        srvr.serve_forever()
    except (KeyboardInterrupt, SystemExit):
        print("\nStopping Server...")
    finally:
        # Release the listening socket however the server stops,
        # including on unexpected exceptions.
        srvr.server_close()