Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions lavender_data/server/dataset/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
get_outlier_aware_hist,
)
from lavender_data.server.db import db_manual_session
from lavender_data.server.db.models import Shard, Shardset
from lavender_data.server.db.models import Shardset, ShardStatistics
from lavender_data.logging import get_logger


Expand Down Expand Up @@ -153,7 +153,7 @@ def get_shardset_statistics(shardset_id: str) -> dict[str, ColumnStatistics]:
.where(Shardset.id == shardset_id)
.options(
selectinload(Shardset.columns),
selectinload(Shardset.shards).options(selectinload(Shard.statistics)),
selectinload(Shardset.shards),
)
).one()

Expand All @@ -163,9 +163,15 @@ def get_shardset_statistics(shardset_id: str) -> dict[str, ColumnStatistics]:

for shard in shardset.shards:
for column in shardset.columns:
if shard.statistics is not None:
# query multiple times to prevent db lock
with db_manual_session() as session:
shard_statistics: ShardStatistics = session.exec(
select(ShardStatistics).where(ShardStatistics.shard_id == shard.id)
).one_or_none()

if shard_statistics is not None and shard_statistics.data.get(column.name):
column_statistics[column.name].append(
shard.statistics.data[column.name]
shard_statistics.data.get(column.name)
)

aggregated_statistics = {}
Expand Down
30 changes: 25 additions & 5 deletions lavender_data/shard/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import numpy as np
from typing import Any, Literal, Optional, TypedDict, Union

from lavender_data.logging import get_logger


class Histogram(TypedDict):
hist: list[float]
Expand Down Expand Up @@ -52,8 +54,7 @@ def _is_categorical_column(values: list[Any]) -> bool:
unique_values = set(values)

if _is_numeric_column(values):
# boolean
return len(set(values)) <= 2
return len(set(values)) <= 10

return len(unique_values) <= max(min(len(values) * 0.1, 99), 2)

Expand Down Expand Up @@ -141,6 +142,20 @@ def _to_numeric(value: Any):
return None
return len(value)

elif isinstance(values[0], (list, tuple)):

def _to_numeric(value: Any):
if value is None:
return None
return len(value)

elif isinstance(values[0], dict):

def _to_numeric(value: Any):
if value is None:
return None
return len(value.keys())

else:
raise ValueError(f"Invalid column type: {type(values[0])}")

Expand Down Expand Up @@ -193,6 +208,7 @@ def get_shard_statistics(
columns: dict[str, str],
statistics_types: Optional[dict[str, Literal["numeric", "categorical"]]] = None,
) -> ShardStatistics:
logger = get_logger(__name__)
samples_by_column = {
column_name: [sample[column_name] for sample in samples]
for column_name in columns.keys()
Expand All @@ -203,8 +219,12 @@ def get_shard_statistics(
statistics_type = (
statistics_types.get(column_name) if statistics_types else None
)
column_statistics[column_name] = get_shard_column_statistics(
values, statistics_type=statistics_type
)
try:
column_statistics[column_name] = get_shard_column_statistics(
values, statistics_type=statistics_type
)
except Exception as e:
logger.warning(f"Error getting statistics for column {column_name}: {e}")
continue

return column_statistics
2 changes: 1 addition & 1 deletion lavender_data/ui/.next/BUILD_ID
Original file line number Diff line number Diff line change
@@ -1 +1 @@
EDoKufa-96x-x5Ou9wFm9
rCvx5Tk0640Lq2U_jAoIc
72 changes: 36 additions & 36 deletions lavender_data/ui/.next/app-build-manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"static/chunks/1701-e422f49a0b315e2b.js",
"static/chunks/4006-bf8e3208049100fd.js",
"static/chunks/4360-befdc69f33b1f1b2.js",
"static/chunks/app/background-tasks/page-59b2e67e364d6bce.js"
"static/chunks/app/background-tasks/page-615c2f25afb3390a.js"
],
"/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
Expand All @@ -61,7 +61,7 @@
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/4452-716eac31d0770351.js",
"static/chunks/8527-dfb83c2a9d17df0e.js",
"static/chunks/app/set-api-key/page-5932baf9781e2055.js"
"static/chunks/app/set-api-key/page-706fb3916ce504d8.js"
],
"/datasets/layout": [
"static/chunks/webpack-422087ee6240a8cc.js",
Expand All @@ -88,7 +88,7 @@
"static/chunks/4006-bf8e3208049100fd.js",
"static/chunks/4290-a2992ee8c1e43435.js",
"static/chunks/8527-dfb83c2a9d17df0e.js",
"static/chunks/app/datasets/page-1af66a8cf2bcdfbf.js"
"static/chunks/app/datasets/page-021f7ee2b7a72659.js"
],
"/iterations/layout": [
"static/chunks/webpack-422087ee6240a8cc.js",
Expand Down Expand Up @@ -144,72 +144,71 @@
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/app/datasets/[dataset_id]/loading-ac81efe44b0c800c.js"
],
"/datasets/[dataset_id]/iterations/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
"static/chunks/639b4859-01df906fe7acfc96.js",
"static/chunks/5236-23290c027ddef11c.js",
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/4452-716eac31d0770351.js",
"static/chunks/4290-a2992ee8c1e43435.js",
"static/chunks/app/datasets/[dataset_id]/iterations/page-2bb024c0ecb206f0.js"
],
"/datasets/[dataset_id]/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
"static/chunks/639b4859-01df906fe7acfc96.js",
"static/chunks/5236-23290c027ddef11c.js",
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/app/datasets/[dataset_id]/page-f0f9a739a3d1c3a1.js"
],
"/datasets/[dataset_id]/dataloader/page": [
"/datasets/[dataset_id]/settings/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
"static/chunks/639b4859-01df906fe7acfc96.js",
"static/chunks/5236-23290c027ddef11c.js",
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/ba47f2f6-f331313138068e46.js",
"static/chunks/4452-716eac31d0770351.js",
"static/chunks/1701-e422f49a0b315e2b.js",
"static/chunks/3228-264f0ea33ca639c5.js",
"static/chunks/4006-bf8e3208049100fd.js",
"static/chunks/8527-dfb83c2a9d17df0e.js",
"static/chunks/543-720bb5792cddffd0.js",
"static/chunks/5492-04ed51d043f9434c.js",
"static/chunks/3031-615dce7e6800bfff.js",
"static/chunks/5674-ee0225a62b423e37.js",
"static/chunks/4618-7859a0bd7b4a533f.js",
"static/chunks/app/datasets/[dataset_id]/dataloader/page-c920725dd186c4aa.js"
"static/chunks/4249-0fc799d49b0f211f.js",
"static/chunks/7756-2718584aea599f34.js",
"static/chunks/2056-9b60f7538ab42ab5.js",
"static/chunks/app/datasets/[dataset_id]/settings/page-c22ae5cb2e1138bb.js"
],
"/datasets/[dataset_id]/preview/page": [
"/datasets/[dataset_id]/iterations/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
"static/chunks/639b4859-01df906fe7acfc96.js",
"static/chunks/5236-23290c027ddef11c.js",
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/4452-716eac31d0770351.js",
"static/chunks/4290-a2992ee8c1e43435.js",
"static/chunks/app/datasets/[dataset_id]/iterations/page-2bb024c0ecb206f0.js"
],
"/datasets/[dataset_id]/shardsets/[shardset_id]/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
"static/chunks/639b4859-01df906fe7acfc96.js",
"static/chunks/5236-23290c027ddef11c.js",
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/ba47f2f6-f331313138068e46.js",
"static/chunks/4452-716eac31d0770351.js",
"static/chunks/1701-e422f49a0b315e2b.js",
"static/chunks/3228-264f0ea33ca639c5.js",
"static/chunks/4006-bf8e3208049100fd.js",
"static/chunks/4290-a2992ee8c1e43435.js",
"static/chunks/8527-dfb83c2a9d17df0e.js",
"static/chunks/543-720bb5792cddffd0.js",
"static/chunks/5492-04ed51d043f9434c.js",
"static/chunks/5674-ee0225a62b423e37.js",
"static/chunks/4618-7859a0bd7b4a533f.js",
"static/chunks/app/datasets/[dataset_id]/preview/page-7ad6147ee8117583.js"
"static/chunks/5262-5c731ce5fa127fec.js",
"static/chunks/5674-e9a96fd70413d0a0.js",
"static/chunks/app/datasets/[dataset_id]/shardsets/[shardset_id]/page-f8a90d7867298ab7.js"
],
"/datasets/[dataset_id]/settings/page": [
"/datasets/[dataset_id]/dataloader/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
"static/chunks/639b4859-01df906fe7acfc96.js",
"static/chunks/5236-23290c027ddef11c.js",
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/ba47f2f6-f331313138068e46.js",
"static/chunks/4452-716eac31d0770351.js",
"static/chunks/1701-e422f49a0b315e2b.js",
"static/chunks/3228-264f0ea33ca639c5.js",
"static/chunks/4006-bf8e3208049100fd.js",
"static/chunks/8527-dfb83c2a9d17df0e.js",
"static/chunks/543-720bb5792cddffd0.js",
"static/chunks/4249-0fc799d49b0f211f.js",
"static/chunks/7756-2718584aea599f34.js",
"static/chunks/2056-9b60f7538ab42ab5.js",
"static/chunks/app/datasets/[dataset_id]/settings/page-c17062259e8c98c8.js"
"static/chunks/5492-04ed51d043f9434c.js",
"static/chunks/3031-615dce7e6800bfff.js",
"static/chunks/5674-e9a96fd70413d0a0.js",
"static/chunks/4618-7859a0bd7b4a533f.js",
"static/chunks/app/datasets/[dataset_id]/dataloader/page-c920725dd186c4aa.js"
],
"/datasets/[dataset_id]/shardsets/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
Expand All @@ -226,23 +225,24 @@
"static/chunks/4249-0fc799d49b0f211f.js",
"static/chunks/7756-2718584aea599f34.js",
"static/chunks/2056-9b60f7538ab42ab5.js",
"static/chunks/app/datasets/[dataset_id]/shardsets/page-0c9e4d42f7f67561.js"
"static/chunks/app/datasets/[dataset_id]/shardsets/page-7ea83fff64f322e0.js"
],
"/datasets/[dataset_id]/shardsets/[shardset_id]/page": [
"/datasets/[dataset_id]/preview/page": [
"static/chunks/webpack-422087ee6240a8cc.js",
"static/chunks/639b4859-01df906fe7acfc96.js",
"static/chunks/5236-23290c027ddef11c.js",
"static/chunks/main-app-c987258603e8d21b.js",
"static/chunks/ba47f2f6-f331313138068e46.js",
"static/chunks/4452-716eac31d0770351.js",
"static/chunks/1701-e422f49a0b315e2b.js",
"static/chunks/3228-264f0ea33ca639c5.js",
"static/chunks/4006-bf8e3208049100fd.js",
"static/chunks/4290-a2992ee8c1e43435.js",
"static/chunks/8527-dfb83c2a9d17df0e.js",
"static/chunks/543-720bb5792cddffd0.js",
"static/chunks/5262-5c731ce5fa127fec.js",
"static/chunks/5674-ee0225a62b423e37.js",
"static/chunks/app/datasets/[dataset_id]/shardsets/[shardset_id]/page-33e293d7e086324a.js"
"static/chunks/5492-04ed51d043f9434c.js",
"static/chunks/5674-e9a96fd70413d0a0.js",
"static/chunks/4618-7859a0bd7b4a533f.js",
"static/chunks/app/datasets/[dataset_id]/preview/page-7ad6147ee8117583.js"
]
}
}
8 changes: 4 additions & 4 deletions lavender_data/ui/.next/app-path-routes-manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
"/datasets/page": "/datasets",
"/iterations/page": "/iterations",
"/iterations/[iteration_id]/page": "/iterations/[iteration_id]",
"/datasets/[dataset_id]/iterations/page": "/datasets/[dataset_id]/iterations",
"/datasets/[dataset_id]/page": "/datasets/[dataset_id]",
"/datasets/[dataset_id]/dataloader/page": "/datasets/[dataset_id]/dataloader",
"/datasets/[dataset_id]/preview/page": "/datasets/[dataset_id]/preview",
"/datasets/[dataset_id]/settings/page": "/datasets/[dataset_id]/settings",
"/datasets/[dataset_id]/iterations/page": "/datasets/[dataset_id]/iterations",
"/datasets/[dataset_id]/shardsets/[shardset_id]/page": "/datasets/[dataset_id]/shardsets/[shardset_id]",
"/datasets/[dataset_id]/dataloader/page": "/datasets/[dataset_id]/dataloader",
"/datasets/[dataset_id]/shardsets/page": "/datasets/[dataset_id]/shardsets",
"/datasets/[dataset_id]/shardsets/[shardset_id]/page": "/datasets/[dataset_id]/shardsets/[shardset_id]"
"/datasets/[dataset_id]/preview/page": "/datasets/[dataset_id]/preview"
}
4 changes: 2 additions & 2 deletions lavender_data/ui/.next/build-manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"devFiles": [],
"ampDevFiles": [],
"lowPriorityFiles": [
"static/EDoKufa-96x-x5Ou9wFm9/_buildManifest.js",
"static/EDoKufa-96x-x5Ou9wFm9/_ssgManifest.js"
"static/rCvx5Tk0640Lq2U_jAoIc/_buildManifest.js",
"static/rCvx5Tk0640Lq2U_jAoIc/_ssgManifest.js"
],
"rootMainFiles": [
"static/chunks/webpack-422087ee6240a8cc.js",
Expand Down
6 changes: 3 additions & 3 deletions lavender_data/ui/.next/prerender-manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@
"dynamicRoutes": {},
"notFoundRoutes": [],
"preview": {
"previewModeId": "3fab7c6232cb48563032161afae83f69",
"previewModeSigningKey": "cbcda09c6c305aaaf3b3138a6f9af0773811d0a1e78693f122b747a6ee3daec9",
"previewModeEncryptionKey": "3bb04e969a8d55072516785dfa086c78cb3ee4409b6ace198c906a368374b99c"
"previewModeId": "9f5c9ab90c414c1bf0b924d82f01550b",
"previewModeSigningKey": "65761acc2152c32b9e38b86a2cf919d7cb562a69b97110a19bc0b8d07bb3eff0",
"previewModeEncryptionKey": "02066a1fcfe52150fdf8581567b473dc92ee0cb424686a32a9f2f826abd63443"
}
}
8 changes: 4 additions & 4 deletions lavender_data/ui/.next/server/app-paths-manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
"/datasets/page": "app/datasets/page.js",
"/iterations/page": "app/iterations/page.js",
"/iterations/[iteration_id]/page": "app/iterations/[iteration_id]/page.js",
"/datasets/[dataset_id]/iterations/page": "app/datasets/[dataset_id]/iterations/page.js",
"/datasets/[dataset_id]/page": "app/datasets/[dataset_id]/page.js",
"/datasets/[dataset_id]/dataloader/page": "app/datasets/[dataset_id]/dataloader/page.js",
"/datasets/[dataset_id]/preview/page": "app/datasets/[dataset_id]/preview/page.js",
"/datasets/[dataset_id]/settings/page": "app/datasets/[dataset_id]/settings/page.js",
"/datasets/[dataset_id]/iterations/page": "app/datasets/[dataset_id]/iterations/page.js",
"/datasets/[dataset_id]/shardsets/[shardset_id]/page": "app/datasets/[dataset_id]/shardsets/[shardset_id]/page.js",
"/datasets/[dataset_id]/dataloader/page": "app/datasets/[dataset_id]/dataloader/page.js",
"/datasets/[dataset_id]/shardsets/page": "app/datasets/[dataset_id]/shardsets/page.js",
"/datasets/[dataset_id]/shardsets/[shardset_id]/page": "app/datasets/[dataset_id]/shardsets/[shardset_id]/page.js"
"/datasets/[dataset_id]/preview/page": "app/datasets/[dataset_id]/preview/page.js"
}
Loading