
Commit 6d28eb0

Add filterable, browsable Records table to Run
Why these changes are being introduced:

One affordance of the webapp is viewing individual records from a run: a summary of A/B differences, the full A/B records, and a side-by-side comparison. But when runs contain many records, some interface is needed to identify records for viewing (once the timdex_record_id is known, the Record page requires only that). Previously, a "Record Samples" section on the Run page linked to standalone tables of records that met some criteria (e.g. source = X, or field Y has diffs). This was functional, but had drawbacks:

- the static HTML could not handle large numbers of records, so a representative sample was used, which prevented access to all records
- the combinations of dimensions to drill down into were limited by the static sample pages of records

How this addresses that need:

* Removes all "Record Samples" approaches
* Replaces them with a single table in the Run page, filterable by source, by which fields were modified, and even by full-text search of the records themselves
* This single table provides a mechanism to browse and filter records from the run from a single interface, with arguably simpler logic under the hood to power it

Side effects of this change:

* None

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TIMX-385
1 parent e436d53 commit 6d28eb0

16 files changed: +1154 −169 lines


abdiff/core/create_final_records.py

Lines changed: 41 additions & 6 deletions
@@ -7,7 +7,12 @@
 import pyarrow.dataset as ds
 
 from abdiff.config import Config
-from abdiff.core.utils import load_dataset, write_to_dataset
+from abdiff.core.utils import (
+    load_dataset,
+    read_run_json,
+    update_or_create_run_json,
+    write_to_dataset,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -27,18 +32,19 @@ def create_final_records(
     This dataset should be sufficient for supporting any webapp data needs.
 
     This dataset is partitioned by source and 'has_diff' boolean.
+
+    Lastly, a DuckDB database file is created with some views and small convenience tables
+    for the webapp to use.
     """
     logger.info("Creating final records dataset from 'diffs' and 'metrics' datasets.")
+    run_data = read_run_json(run_directory)
 
     diffs_dataset = load_dataset(diffs_dataset_path)
     metrics_dataset = load_dataset(metrics_dataset_path)
 
+    metrics_timdex_field_columns = run_data["metrics"]["summary"]["fields_with_diffs"]
+
     # get list of unique columns from metrics dataset, and create final dataset schema
-    metrics_timdex_field_columns = [
-        name
-        for name in metrics_dataset.schema.names
-        if name not in diffs_dataset.schema.names
-    ]
     metrics_columns = (
         pa.field(name, pa.int64())
         for name in metrics_dataset.schema.names
@@ -57,6 +63,7 @@ def create_final_records(
         )
     )
 
+    # write records to records dataset
     records_dataset_path = str(Path(run_directory) / "records")
     write_to_dataset(
         get_final_records_iter(
@@ -67,6 +74,11 @@ def create_final_records(
         partition_columns=["source", "has_diff"],
     )
 
+    # initialize duckdb database file for future use
+    duckdb_filepath = Path(run_directory) / "run.duckdb"
+    create_duckdb_database_file(duckdb_filepath, records_dataset_path)
+    update_or_create_run_json(run_directory, {"duckdb_filepath": str(duckdb_filepath)})
+
     return records_dataset_path
 
 
@@ -112,3 +124,26 @@ def get_final_records_iter(
             yield results.read_next_batch()
         except StopIteration:
             break
+
+
+def create_duckdb_database_file(
+    duckdb_filepath: str | Path, records_dataset_path: str
+) -> None:
+    """Create a DuckDB database file with views associated with records dataset.
+
+    This DuckDB database file will contain only views or very small tables, taking up
+    little space on disk. These views will be provided as a convenience for the webapp
+    and other contexts to query the records dataset.
+    """
+    logger.info("creating duckdb database file")
+    with duckdb.connect(duckdb_filepath) as conn:
+
+        # create records dataset view
+        parquet_glob_pattern = f"{records_dataset_path}/**/*.parquet"
+        conn.execute(
+            f"""
+            create view records as
+            select *
+            from read_parquet('{parquet_glob_pattern}', hive_partitioning=true)
+            """
+        )
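
For context, the new records view can be queried directly from the generated database file. A minimal sketch, assuming duckdb and pandas are installed; the run directory path below is hypothetical (the commit writes the real one to run.json under "duckdb_filepath"):

import duckdb

# hypothetical path; the actual location is recorded in run.json as "duckdb_filepath"
duckdb_filepath = "runs/2024-01-01-120000/run.duckdb"

with duckdb.connect(duckdb_filepath, read_only=True) as conn:
    # the 'records' view reads the hive-partitioned parquet dataset, so the
    # partition columns 'source' and 'has_diff' are queryable like any other column
    df = conn.execute(
        """
        select source, has_diff, count(*) as record_count
        from records
        group by source, has_diff
        order by source, has_diff
        """
    ).df()

print(df)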

abdiff/webapp/app.py

Lines changed: 36 additions & 50 deletions
@@ -5,17 +5,17 @@
 import signal
 from datetime import datetime
 from pathlib import Path
+from time import perf_counter
 
-from flask import Flask, g, render_template, request
+from flask import Flask, Response, g, jsonify, render_template, request
 
 from abdiff.core.utils import read_run_json
 from abdiff.webapp.utils import (
-    get_field_sample_records,
     get_record_a_b_versions,
     get_record_field_diff_summary,
     get_record_unified_diff_string,
     get_run_directory,
-    get_source_sample_records,
+    query_duckdb_for_records_datatable,
 )
 
 logger = logging.getLogger(__name__)
@@ -61,7 +61,7 @@ def job() -> str:
         with open(run_json_filepath) as f:
             run_data = json.load(f)
         runs[run_data["run_timestamp"]] = run_data
-    dict(
+    runs = dict(
         sorted(
             runs.items(),
             key=lambda x: datetime.strptime(  # noqa: DTZ007
@@ -89,63 +89,49 @@ def run(run_timestamp: str) -> str:
     except FileNotFoundError:
         transform_logs = "'logs.txt' not found for transform logs"
 
-    # parse run metrics
-    metrics = run_data.get(
-        "metrics", {"warning": "'metrics' section not found in run data"}
-    )
-
-    # generate links for field and source samples
-    field_samples = {
-        field: f"http://{request.host}/run/{run_timestamp}/sample/field/{field}"
-        for field in metrics["summary"]["fields_with_diffs"]
-    }
-    source_samples = {
-        source: f"http://{request.host}/run/{run_timestamp}/sample/source/{source}"
-        for source in metrics["summary"]["sources"]
-    }
-    sample_links = {
-        "field_samples": field_samples,
-        "source_samples": source_samples,
-    }
-
     return render_template(
         "run.html",
         run_data=run_data,
         run_json=json.dumps(run_data),
         transform_logs=transform_logs,
-        metrics_json=json.dumps(metrics),
-        sample_links=sample_links,
+        metrics_json=json.dumps(run_data["metrics"]),
+        sources=sorted(run_data["metrics"]["summary"]["sources"]),
+        modified_fields=sorted(run_data["metrics"]["summary"]["fields_with_diffs"]),
     )
 
-@app.route(
-    "/run/<run_timestamp>/sample/<sample_type>/<sample_value>", methods=["GET"]
-)
-def run_sample(run_timestamp: str, sample_type: str, sample_value: str) -> str:
-    """Route to provide links to record views based on a subset of detected diffs."""
+@app.route("/run/<run_timestamp>/records/data", methods=["POST"])
+def records_data(run_timestamp: str) -> Response:
+    """Endpoint to provide data for Records table in Run view.
+
+    The Javascript library DataTables (https://datatables.net/) is used to create the
+    Records table in the Run view. This table is configured to make HTTP POST
+    requests to an endpoint for filtered, paginated data that supplies the table. This
+    endpoint provides that data.
+
+    The POST request payload conforms to the request signature here:
+    https://datatables.net/manual/server-side. This endpoint receives the parameters
+    from the table (e.g. page, ordering, filtering, etc.), parses the query parameters
+    from the request payload, and passes them to a utility function which performs the
+    DuckDB query, returning a dataframe of results suitable for the table.
+    """
+    start_time = perf_counter()
     run_directory = get_run_directory(run_timestamp)
+    run_data = read_run_json(run_directory)
 
-    # get sample records
-    if sample_type == "field":
-        sample_df = get_field_sample_records(run_directory, sample_value)
-    elif sample_type == "source":
-        sample_df = get_source_sample_records(run_directory, sample_value)
-    else:
-        raise ValueError(  # noqa: TRY003
-            f"Sample type: '{sample_type}' not recognized"
-        )
-    sample_df["record_link"] = sample_df.timdex_record_id.apply(
-        lambda timdex_record_id: (
-            f"http://{request.host}/run/{run_timestamp}/record/{timdex_record_id}"
-        )
+    datatables_data = query_duckdb_for_records_datatable(
+        run_data["duckdb_filepath"],
+        draw=int(request.form.get("draw", "1")),
+        start=int(request.form.get("start", "0")),
+        length=int(request.form.get("length", "10")),
+        search_value=request.form.get("search[value]", ""),
+        order_column_index=int(request.form.get("order[0][column]", "0")),
+        order_direction=request.form.get("order[0][dir]", "asc"),
+        source_filter=request.form.getlist("sourceFilter[]"),
+        modified_fields_filter=request.form.getlist("modifiedFieldsFilter[]"),
     )
-    sample_df = sample_df.sort_values(by=["source", "timdex_record_id"])
 
-    return render_template(
-        "sample.html",
-        sample_type=sample_type,
-        sample_value=sample_value,
-        sample_df=sample_df,
-    )
+    logger.info(f"records data elapsed: {perf_counter()-start_time}")
+    return jsonify(datatables_data)
 
 @app.route("/run/<run_timestamp>/record/<timdex_record_id>", methods=["GET"])
 def record(run_timestamp: str, timdex_record_id: str) -> str:
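
The utility behind this endpoint, query_duckdb_for_records_datatable, lives in abdiff/webapp/utils.py and is not shown in this excerpt. Below is a hedged sketch of its likely shape, based only on the call signature above and the DataTables server-side protocol (responses must carry draw, recordsTotal, recordsFiltered, and data). The column list and filter SQL are assumptions, not the commit's actual implementation:

import duckdb


def query_duckdb_for_records_datatable(
    duckdb_filepath: str,
    *,
    draw: int,
    start: int,
    length: int,
    search_value: str,
    order_column_index: int,
    order_direction: str,
    source_filter: list[str],
    modified_fields_filter: list[str],
) -> dict:
    # assumed display columns; the real list is defined in utils.py / the template
    columns = ["timdex_record_id", "source", "has_diff"]

    where, params = ["true"], []
    if source_filter:
        placeholders = ", ".join("?" for _ in source_filter)
        where.append(f"source in ({placeholders})")
        params.extend(source_filter)
    for field in modified_fields_filter:
        # create_final_records.py adds an int64 metrics column per TIMDEX field;
        # assumes a value > 0 marks the field as modified for a record
        where.append(f'"{field}" > 0')
    if search_value:
        # a simple stand-in for the commit's full-text search
        where.append("timdex_record_id ilike ?")
        params.append(f"%{search_value}%")
    where_sql = " and ".join(where)

    order_column = (
        columns[order_column_index] if order_column_index < len(columns) else columns[0]
    )
    direction = "desc" if order_direction == "desc" else "asc"

    with duckdb.connect(duckdb_filepath, read_only=True) as conn:
        records_total = conn.execute("select count(*) from records").fetchone()[0]
        records_filtered = conn.execute(
            f"select count(*) from records where {where_sql}", params
        ).fetchone()[0]
        rows = conn.execute(
            f"""
            select {", ".join(columns)}
            from records
            where {where_sql}
            order by {order_column} {direction}
            limit ? offset ?
            """,
            [*params, length, start],
        ).fetchall()

    # response keys required by https://datatables.net/manual/server-side
    return {
        "draw": draw,
        "recordsTotal": records_total,
        "recordsFiltered": records_filtered,
        "data": [dict(zip(columns, row)) for row in rows],
    }

The endpoint's docstring describes the utility as returning a dataframe of results; this sketch returns the plain dict that jsonify() in records_data can serialize directly.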

abdiff/webapp/static/jquery-3.6.0.min.js

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.
