
Commit 5d93b8f

Optimize storage_locations_usage_over_time
Optimize storage_locations_usage_over_time by batching per-day metrics into a single grouped query instead of thousands of individual lookups. Previously, the test exercising this report was a major outlier, consuming about 60% of the ~115s suite; now it completes in under a second. The change saves around 75s per run, or roughly 6 minutes when tox runs py310–py314. The UI for wide date ranges (e.g., 2000–2025) also loads in under 1s.
1 parent f03d21c commit 5d93b8f
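
To make the shape of the change easier to see before reading the diff, here is a small, self-contained sketch of the grouped query it introduces, written as raw SQL against an in-memory SQLite database. The table and column names (aip, file, create_date, file_type, size) are simplified assumptions drawn from the diff; the production code builds the equivalent statement with SQLAlchemy against the AIPscan models and typically runs on MySQL.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE aip (
        id INTEGER PRIMARY KEY,
        storage_service_id INTEGER,
        storage_location_id INTEGER,
        create_date TEXT
    );
    CREATE TABLE file (
        id INTEGER PRIMARY KEY,
        aip_id INTEGER,
        file_type TEXT,
        size INTEGER
    );
    INSERT INTO aip VALUES
        (1, 1, 10, '2024-01-01 09:00:00'),
        (2, 1, 10, '2024-01-01 17:30:00'),
        (3, 1, 11, '2024-01-02 08:15:00');
    INSERT INTO file VALUES
        (1, 1, 'original', 100),
        (2, 1, 'preservation', 40),
        (3, 2, 'original', 250),
        (4, 3, 'original', 75);
    """
)

# One grouped query replaces the old per-day, per-location lookups:
# daily AIP counts, total size, and original-file counts, keyed by
# calendar day and storage location.
rows = conn.execute(
    """
    SELECT date(aip.create_date) AS day,
           aip.storage_location_id,
           COUNT(DISTINCT aip.id) AS aips,
           COALESCE(SUM(file.size), 0) AS size_sum,
           COALESCE(SUM(CASE WHEN file.file_type = 'original' THEN 1 ELSE 0 END), 0)
               AS orig_files
      FROM aip
      LEFT JOIN file ON file.aip_id = aip.id
     WHERE aip.storage_service_id = 1
       AND aip.create_date >= '2024-01-01 00:00:00'
       AND aip.create_date <  '2024-02-01 00:00:00'
     GROUP BY day, aip.storage_location_id
     ORDER BY day, aip.storage_location_id
    """
).fetchall()

for row in rows:
    print(row)
# ('2024-01-01', 10, 2, 390, 2)
# ('2024-01-02', 11, 1, 75, 1)

Where the old code issued three helper calls per (day, location) pair (aip_count, aip_total_size, file_count), each hitting the database, the grouped version returns the same per-day, per-location metrics in one round trip, and the Python side only does dictionary lookups.
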


AIPscan/Data/report_data.py

Lines changed: 84 additions & 24 deletions
@@ -1,11 +1,11 @@
 """Data endpoints optimized for reports in the Reporter blueprint."""
 
 from datetime import datetime
-from datetime import timedelta
 from operator import itemgetter
 
 from dateutil.rrule import DAILY
 from dateutil.rrule import rrule
+from sqlalchemy import case
 
 from AIPscan import db
 from AIPscan.Data import fields
@@ -702,8 +702,12 @@ def storage_locations_usage_over_time(
     :param storage_service_id: Storage Service ID (int)
     :param start_date: Inclusive AIP creation start date
         (datetime.datetime object)
-    :param end_date: Inclusive AIP creation end date
-        (datetime.datetime object)
+    :param end_date: Exclusive upper bound for AIP creation timestamps
+        (datetime.datetime). Callers that work with inclusive calendar dates
+        should use ``parse_datetime_bound(..., upper=True)``.
+        For example, the range “2024-01-01 to 2024-01-31” becomes:
+        - ``start_date = 2024-01-01 00:00:00``, and
+        - ``end_date = 2024-02-01 00:00:00``
     :param cumulative: Flag indicating whether to calculate cumulatively, where
         each month adds to previous totals (bool)
 
@@ -714,35 +718,91 @@ def storage_locations_usage_over_time(
     report = {}
     report[fields.FIELD_STORAGE_NAME] = get_storage_service_name(storage_service_id)
 
-    locations = _get_storage_locations(storage_service_id)
+    locations = sorted(
+        _get_storage_locations(storage_service_id), key=lambda loc: loc.id
+    )
+    location_ids = [loc.id for loc in locations]
     days = _get_days_covered_by_date_range(storage_service_id, start_date, end_date)
 
-    results = {}
+    # Build a single aggregated query that returns daily metrics per location.
+    # MySQL: DATE(AIP.create_date) groups by calendar day.
+    day_col = db.func.date(AIP.create_date).label("day")
 
-    for day in days:
-        daily_locations_data = []
+    # CASE expression to count only original files.
+    orig_file_case = case((File.file_type == FileType.original, 1), else_=0)
+
+    aggregated = (
+        db.session.query(
+            day_col,
+            AIP.storage_location_id.label("storage_location_id"),
+            db.func.count(db.func.distinct(AIP.id)).label("aips"),
+            db.func.coalesce(db.func.sum(File.size), 0).label("size_sum"),
+            db.func.coalesce(db.func.sum(orig_file_case), 0).label("orig_files"),
+        )
+        .outerjoin(File, File.aip_id == AIP.id)
+        .filter(AIP.storage_service_id == storage_service_id)
+        .filter(AIP.create_date >= start_date)
+        .filter(AIP.create_date < end_date)
+        .group_by(day_col, AIP.storage_location_id)
+        .all()
+    )
 
-        for location in locations:
-            loc_info = {}
+    # Map aggregated rows for quick lookup.
+    daily_loc_metrics = {}  # {day_str: {loc_id: {aips, size, files}}}
+    for row in aggregated:
+        # row.day can be date/datetime; normalize to YYYY-MM-DD string.
+        day_str = str(row.day)
+        loc_id = row.storage_location_id
+        day_bucket = daily_loc_metrics.setdefault(day_str, {})
+        day_bucket[loc_id] = {
+            fields.FIELD_AIPS: int(row.aips or 0),
+            fields.FIELD_SIZE: int(row.size_sum or 0),
+            fields.FIELD_FILE_COUNT: int(row.orig_files or 0),
+        }
+
+    # Prepare cumulative running totals per location if needed.
+    running_totals = {
+        loc_id: {fields.FIELD_AIPS: 0, fields.FIELD_SIZE: 0, fields.FIELD_FILE_COUNT: 0}
+        for loc_id in location_ids
+    }
 
-            subquery_start_date = datetime.strptime(day, "%Y-%m-%d")
-            subquery_end_date = subquery_start_date + timedelta(days=1)
-            if cumulative:
-                subquery_start_date = datetime.strptime(days[0], "%Y-%m-%d")
+    results = {}
+    for day in days:
+        daily_locations_data = []
 
-            loc_info[fields.FIELD_ID] = location.id
-            loc_info[fields.FIELD_UUID] = location.uuid
-            loc_info[fields.FIELD_STORAGE_LOCATION] = location.description
-            loc_info[fields.FIELD_AIPS] = location.aip_count(
-                subquery_start_date, subquery_end_date
-            )
-            loc_info[fields.FIELD_SIZE] = location.aip_total_size(
-                subquery_start_date, subquery_end_date
-            )
-            loc_info[fields.FIELD_FILE_COUNT] = location.file_count(
-                subquery_start_date, subquery_end_date
+        # Use aggregated metrics for the given day; default to zeros.
+        day_metrics = daily_loc_metrics.get(day, {})
+        for loc in locations:
+            base = day_metrics.get(
+                loc.id,
+                {
+                    fields.FIELD_AIPS: 0,
+                    fields.FIELD_SIZE: 0,
+                    fields.FIELD_FILE_COUNT: 0,
+                },
             )
 
+            if cumulative:
+                run = running_totals[loc.id]
+                run[fields.FIELD_AIPS] += base[fields.FIELD_AIPS]
+                run[fields.FIELD_SIZE] += base[fields.FIELD_SIZE]
+                run[fields.FIELD_FILE_COUNT] += base[fields.FIELD_FILE_COUNT]
+                aips_val = run[fields.FIELD_AIPS]
+                size_val = run[fields.FIELD_SIZE]
+                files_val = run[fields.FIELD_FILE_COUNT]
+            else:
+                aips_val = base[fields.FIELD_AIPS]
+                size_val = base[fields.FIELD_SIZE]
+                files_val = base[fields.FIELD_FILE_COUNT]
+
+            loc_info = {
+                fields.FIELD_ID: loc.id,
+                fields.FIELD_UUID: loc.uuid,
+                fields.FIELD_STORAGE_LOCATION: loc.description,
+                fields.FIELD_AIPS: aips_val,
+                fields.FIELD_SIZE: size_val,
+                fields.FIELD_FILE_COUNT: files_val,
+            }
             daily_locations_data.append(loc_info)
 
         results[day] = daily_locations_data
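
The docstring change above switches end_date to an exclusive upper bound. As a caller-side illustration of that convention, here is a small standalone sketch; the real helper is parse_datetime_bound(..., upper=True) as named in the docstring, but its module and full signature are not shown in this commit, so the conversion is reimplemented inline as an assumption.

from datetime import datetime, timedelta

def to_query_bounds(start_day: str, end_day: str):
    """Turn inclusive 'YYYY-MM-DD' strings into a half-open [start_date, end_date) pair."""
    start_date = datetime.strptime(start_day, "%Y-%m-%d")
    # Exclusive upper bound: midnight at the start of the day *after* end_day.
    end_date = datetime.strptime(end_day, "%Y-%m-%d") + timedelta(days=1)
    return start_date, end_date

start_date, end_date = to_query_bounds("2024-01-01", "2024-01-31")
print(start_date)  # 2024-01-01 00:00:00
print(end_date)    # 2024-02-01 00:00:00
# These bounds feed storage_locations_usage_over_time(...), which filters
# AIP.create_date >= start_date and AIP.create_date < end_date.

The half-open interval pairs with the query's `>= start_date` / `< end_date` filters, so AIPs created at any time on the last calendar day are still counted and nothing from the following day leaks in.
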
