#!/usr/bin/env python3

"""
This script tests end-to-end functionality using the Civis Python client.
It uses the live Civis API and Redshift, so a valid CIVIS_API_KEY is needed.

This is based on a similar script for the R client:
https://github.com/civisanalytics/civis-r/blob/master/tools/integration_tests/smoke_test.R
"""

import io
import logging
import time

import civis
import pandas as pd
from sklearn.datasets import load_iris


def main():
    logging.basicConfig(format="", level=logging.INFO)

    logger = logging.getLogger("civis")

    t0 = time.time()

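    # Connect to the live Civis API. resources="all" asks the client to expose
    # every endpoint this API key can access, not just the base set.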
    database = "redshift-general"
    client = civis.APIClient(resources="all")

    # Test read_civis and read_civis_sql produce the same results.
    # The table used here has an explicit index column to sort by in case the
    # rows come back in a different order.
    logger.info('Testing reading from redshift...')
    sql = "SELECT * FROM datascience.iris"
    df1 = civis.io.read_civis_sql(
        sql=sql, database=database, use_pandas=True, client=client
    ).sort_values(by='index')
    df2 = civis.io.read_civis(
        table="datascience.iris", database=database, use_pandas=True,
        client=client
    ).sort_values(by='index')
    assert df1.shape == (150, 6)
    pd.testing.assert_frame_equal(df1, df2)

    # Test uploading data.
    logger.info('Testing uploading to redshift...')
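    # Write to a uniquely named (timestamped) scratch table so concurrent runs
    # don't collide; the table is dropped again in the finally block below.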
    table = "scratch.smoke_test_{}".format(int(time.time()))
    iris = load_iris()
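    # Build a DataFrame with feature columns c0-c3, the target as 'label', and
    # an explicit 'index' column (from reset_index) to use as the primary key.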
    df_iris1 = (
        pd.DataFrame(iris.data)
        .rename(columns={0: 'c0', 1: 'c1', 2: 'c2', 3: 'c3'})
        .join(pd.DataFrame(iris.target).rename(columns={0: 'label'}))
        .reset_index()
    )
    try:
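        # dataframe_to_civis returns a CivisFuture; .result() blocks until the
        # Redshift import job finishes (and raises if it fails).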
        civis.io.dataframe_to_civis(
            df_iris1, database, table, client=client).result()
        df_iris2 = civis.io.read_civis(
            table=table, database=database, use_pandas=True, client=client)
        pd.testing.assert_frame_equal(
            df_iris1.sort_values(by='index').set_index('index'),
            df_iris2.sort_values(by='index').set_index('index')
        )
    finally:
        civis.io.query_civis("DROP TABLE IF EXISTS %s" % table,
                             database=database, client=client)

    # Test uploading and downloading a file.
    logger.info('Testing File uploading and downloading...')
    buf = io.BytesIO()
    csv_bytes1 = df_iris1.to_csv(index=False).encode('utf-8')
    buf.write(csv_bytes1)
    buf.seek(0)
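    # file_to_civis uploads the buffer to Civis Files and returns the new
    # file's ID.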
    file_id = civis.io.file_to_civis(
        buf, name="civis-python test file", client=client)
    buf.seek(0)
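    # Re-use the same buffer for the download: civis_to_file writes the file's
    # bytes back into it, which should reproduce the uploaded CSV exactly.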
    civis.io.civis_to_file(file_id, buf, client=client)
    buf.seek(0)
    csv_bytes2 = buf.read()
    assert csv_bytes1 == csv_bytes2, "File upload/download did not match."

    # Test modeling.
    logger.info('Testing Civis-ML...')
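    # CivisML: fit the built-in "sparse_logistic" model to the iris table.
    # train() returns a ModelFuture; .result() blocks until the job finishes.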
    mp = civis.ml.ModelPipeline(
        model="sparse_logistic",
        dependent_variable="type",
        primary_key="index",
        client=client
    )
    result = mp.train(
        table_name="datascience.iris", database_name=database).result()
    assert result['state'] == 'succeeded'

    logger.info("%.1f seconds elapsed in total.", time.time() - t0)


if __name__ == '__main__':
    main()