
Commit 6d28eb0

Add filterable, browsable Records table to Run
Why these changes are being introduced:

One affordance of the webapp is viewing individual records from a run: a summary of A/B differences, the full A/B records, and a side-by-side comparison. But when runs contain many records, some interface is needed to identify records for viewing (once the timdex_record_id is known, the Record page requires only that). Previously, a "Record Samples" section on the Run page linked to standalone tables of records that met some criteria (e.g. source = X, or field Y has diffs). This was functional, but had drawbacks:

- the static HTML could not handle large numbers of records, so a representative sample was used, which prevented access to all records
- the combinations of dimensions to drill down into were limited by the static sample pages of records

How this addresses that need:

* Removes all "Record Samples" approaches
* Replaces them with a single table in the Run page, filterable by source, by which fields were modified, and even by full-text search of the records themselves
* This single table provides a mechanism to browse and filter records from the run from a single interface, with arguably simpler logic under the hood to power it

Side effects of this change:

* None

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TIMX-385
1 parent e436d53 commit 6d28eb0

16 files changed: +1154 −169 lines


abdiff/core/create_final_records.py

Lines changed: 41 additions & 6 deletions
@@ -7,7 +7,12 @@
 import pyarrow.dataset as ds
 
 from abdiff.config import Config
-from abdiff.core.utils import load_dataset, write_to_dataset
+from abdiff.core.utils import (
+    load_dataset,
+    read_run_json,
+    update_or_create_run_json,
+    write_to_dataset,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -27,18 +32,19 @@ def create_final_records(
     This dataset should be sufficient for supporting any webapp data needs.
 
     This dataset is partitioned by source and 'has_diff' boolean.
+
+    Lastly, a DuckDB database file is created with some views and small convenience tables
+    for the webapp to use.
     """
     logger.info("Creating final records dataset from 'diffs' and 'metrics' datasets.")
+    run_data = read_run_json(run_directory)
 
     diffs_dataset = load_dataset(diffs_dataset_path)
     metrics_dataset = load_dataset(metrics_dataset_path)
 
+    metrics_timdex_field_columns = run_data["metrics"]["summary"]["fields_with_diffs"]
+
     # get list of unique columns from metrics dataset, and create final dataset schema
-    metrics_timdex_field_columns = [
-        name
-        for name in metrics_dataset.schema.names
-        if name not in diffs_dataset.schema.names
-    ]
     metrics_columns = (
         pa.field(name, pa.int64())
         for name in metrics_dataset.schema.names
@@ -57,6 +63,7 @@ def create_final_records(
         )
     )
 
+    # write records to records dataset
     records_dataset_path = str(Path(run_directory) / "records")
     write_to_dataset(
         get_final_records_iter(
@@ -67,6 +74,11 @@ def create_final_records(
         partition_columns=["source", "has_diff"],
     )
 
+    # initialize duckdb database file for future use
+    duckdb_filepath = Path(run_directory) / "run.duckdb"
+    create_duckdb_database_file(duckdb_filepath, records_dataset_path)
+    update_or_create_run_json(run_directory, {"duckdb_filepath": str(duckdb_filepath)})
+
     return records_dataset_path
 
 
@@ -112,3 +124,26 @@ def get_final_records_iter(
             yield results.read_next_batch()
         except StopIteration:
             break
+
+
+def create_duckdb_database_file(
+    duckdb_filepath: str | Path, records_dataset_path: str
+) -> None:
+    """Create a DuckDB database file with views associated with records dataset.
+
+    This DuckDB database file will contain only views or very small tables, taking up
+    little space on disk. These views will be provided as a convenience for the webapp
+    and other contexts to query the records dataset.
+    """
+    logger.info("creating duckdb database file")
+    with duckdb.connect(duckdb_filepath) as conn:
+
+        # create records dataset view
+        parquet_glob_pattern = f"{records_dataset_path}/**/*.parquet"
+        conn.execute(
+            f"""
+            create view records as
+            select *
+            from read_parquet('{parquet_glob_pattern}', hive_partitioning=true)
+            """
+        )
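
For context, the new records view can be queried directly from the generated database file. A minimal sketch, assuming duckdb and pandas are installed; the run directory path below is hypothetical (the commit writes the real one to run.json under "duckdb_filepath"):

import duckdb

# hypothetical path; the actual location is recorded in run.json as "duckdb_filepath"
duckdb_filepath = "runs/2024-01-01-120000/run.duckdb"

with duckdb.connect(duckdb_filepath, read_only=True) as conn:
    # the 'records' view reads the hive-partitioned parquet dataset, so the
    # partition columns 'source' and 'has_diff' are queryable like any other column
    df = conn.execute(
        """
        select source, has_diff, count(*) as record_count
        from records
        group by source, has_diff
        order by source, has_diff
        """
    ).df()

print(df)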

abdiff/webapp/app.py

Lines changed: 36 additions & 50 deletions
@@ -5,17 +5,17 @@
 import signal
 from datetime import datetime
 from pathlib import Path
+from time import perf_counter
 
-from flask import Flask, g, render_template, request
+from flask import Flask, Response, g, jsonify, render_template, request
 
 from abdiff.core.utils import read_run_json
 from abdiff.webapp.utils import (
-    get_field_sample_records,
     get_record_a_b_versions,
     get_record_field_diff_summary,
     get_record_unified_diff_string,
     get_run_directory,
-    get_source_sample_records,
+    query_duckdb_for_records_datatable,
 )
 
 logger = logging.getLogger(__name__)
@@ -61,7 +61,7 @@ def job() -> str:
         with open(run_json_filepath) as f:
             run_data = json.load(f)
         runs[run_data["run_timestamp"]] = run_data
-    dict(
+    runs = dict(
         sorted(
             runs.items(),
             key=lambda x: datetime.strptime(  # noqa: DTZ007
@@ -89,63 +89,49 @@ def run(run_timestamp: str) -> str:
     except FileNotFoundError:
         transform_logs = "'logs.txt' not found for transform logs"
 
-    # parse run metrics
-    metrics = run_data.get(
-        "metrics", {"warning": "'metrics' section not found in run data"}
-    )
-
-    # generate links for field and source samples
-    field_samples = {
-        field: f"http://{request.host}/run/{run_timestamp}/sample/field/{field}"
-        for field in metrics["summary"]["fields_with_diffs"]
-    }
-    source_samples = {
-        source: f"http://{request.host}/run/{run_timestamp}/sample/source/{source}"
-        for source in metrics["summary"]["sources"]
-    }
-    sample_links = {
-        "field_samples": field_samples,
-        "source_samples": source_samples,
-    }
-
     return render_template(
         "run.html",
         run_data=run_data,
         run_json=json.dumps(run_data),
         transform_logs=transform_logs,
-        metrics_json=json.dumps(metrics),
-        sample_links=sample_links,
+        metrics_json=json.dumps(run_data["metrics"]),
+        sources=sorted(run_data["metrics"]["summary"]["sources"]),
+        modified_fields=sorted(run_data["metrics"]["summary"]["fields_with_diffs"]),
     )
 
-@app.route(
-    "/run/<run_timestamp>/sample/<sample_type>/<sample_value>", methods=["GET"]
-)
-def run_sample(run_timestamp: str, sample_type: str, sample_value: str) -> str:
-    """Route to provide links to record views based on a subset of detected diffs."""
+@app.route("/run/<run_timestamp>/records/data", methods=["POST"])
+def records_data(run_timestamp: str) -> Response:
+    """Endpoint to provide data for Records table in Run view.
+
+    The Javascript library DataTables (https://datatables.net/) is used to create the
+    Records table in the Run view. This table is configured to make HTTP POST
+    requests to an endpoint for filtered, paginated data that supplies the table. This
+    endpoint provides that data.
+
+    The POST request payload conforms to the request signature here:
+    https://datatables.net/manual/server-side. This endpoint receives the parameters
+    from the table (e.g. page, ordering, filtering, etc.), parses the query parameters
+    from the request payload, and passes them to a utility function which performs the
+    DuckDB query, returning a dataframe of results suitable for the table.
+    """
+    start_time = perf_counter()
     run_directory = get_run_directory(run_timestamp)
+    run_data = read_run_json(run_directory)
 
-    # get sample records
-    if sample_type == "field":
-        sample_df = get_field_sample_records(run_directory, sample_value)
-    elif sample_type == "source":
-        sample_df = get_source_sample_records(run_directory, sample_value)
-    else:
-        raise ValueError(  # noqa: TRY003
-            f"Sample type: '{sample_type}' not recognized"
-        )
-    sample_df["record_link"] = sample_df.timdex_record_id.apply(
-        lambda timdex_record_id: (
-            f"http://{request.host}/run/{run_timestamp}/record/{timdex_record_id}"
-        )
+    datatables_data = query_duckdb_for_records_datatable(
+        run_data["duckdb_filepath"],
+        draw=int(request.form.get("draw", "1")),
+        start=int(request.form.get("start", "0")),
+        length=int(request.form.get("length", "10")),
+        search_value=request.form.get("search[value]", ""),
+        order_column_index=int(request.form.get("order[0][column]", "0")),
+        order_direction=request.form.get("order[0][dir]", "asc"),
+        source_filter=request.form.getlist("sourceFilter[]"),
+        modified_fields_filter=request.form.getlist("modifiedFieldsFilter[]"),
     )
-    sample_df = sample_df.sort_values(by=["source", "timdex_record_id"])
 
-    return render_template(
-        "sample.html",
-        sample_type=sample_type,
-        sample_value=sample_value,
-        sample_df=sample_df,
-    )
+    logger.info(f"records data elapsed: {perf_counter()-start_time}")
+    return jsonify(datatables_data)
 
 @app.route("/run/<run_timestamp>/record/<timdex_record_id>", methods=["GET"])
 def record(run_timestamp: str, timdex_record_id: str) -> str:
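
The utility behind this endpoint, query_duckdb_for_records_datatable, lives in abdiff/webapp/utils.py and is not shown in this excerpt. Below is a hedged sketch of its likely shape, based only on the call signature above and the DataTables server-side protocol (responses must carry draw, recordsTotal, recordsFiltered, and data). The column list and filter SQL are assumptions, not the commit's actual implementation:

import duckdb


def query_duckdb_for_records_datatable(
    duckdb_filepath: str,
    *,
    draw: int,
    start: int,
    length: int,
    search_value: str,
    order_column_index: int,
    order_direction: str,
    source_filter: list[str],
    modified_fields_filter: list[str],
) -> dict:
    # assumed display columns; the real list is defined in utils.py / the template
    columns = ["timdex_record_id", "source", "has_diff"]

    where, params = ["true"], []
    if source_filter:
        placeholders = ", ".join("?" for _ in source_filter)
        where.append(f"source in ({placeholders})")
        params.extend(source_filter)
    for field in modified_fields_filter:
        # create_final_records.py adds an int64 metrics column per TIMDEX field;
        # assumes a value > 0 marks the field as modified for a record
        where.append(f'"{field}" > 0')
    if search_value:
        # a simple stand-in for the commit's full-text search
        where.append("timdex_record_id ilike ?")
        params.append(f"%{search_value}%")
    where_sql = " and ".join(where)

    order_column = (
        columns[order_column_index] if order_column_index < len(columns) else columns[0]
    )
    direction = "desc" if order_direction == "desc" else "asc"

    with duckdb.connect(duckdb_filepath, read_only=True) as conn:
        records_total = conn.execute("select count(*) from records").fetchone()[0]
        records_filtered = conn.execute(
            f"select count(*) from records where {where_sql}", params
        ).fetchone()[0]
        rows = conn.execute(
            f"""
            select {", ".join(columns)}
            from records
            where {where_sql}
            order by {order_column} {direction}
            limit ? offset ?
            """,
            [*params, length, start],
        ).fetchall()

    # response keys required by https://datatables.net/manual/server-side
    return {
        "draw": draw,
        "recordsTotal": records_total,
        "recordsFiltered": records_filtered,
        "data": [dict(zip(columns, row)) for row in rows],
    }

The endpoint's docstring describes the utility as returning a dataframe of results; this sketch returns the plain dict that jsonify() in records_data can serialize directly.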

abdiff/webapp/static/jquery-3.6.0.min.js

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.
