Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bundle Analysis: Add asset comparison and fetching PR files #480

Merged
merged 4 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 82 additions & 32 deletions shared/bundle_analysis/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
BundleAnalysisReport,
BundleReport,
BundleRouteReport,
ModuleReport,
)
from shared.bundle_analysis.storage import BundleAnalysisReportLoader
from shared.django_apps.core.models import Repository
Expand Down Expand Up @@ -77,11 +78,66 @@ class AssetChange(BaseChange):
"""

asset_name: str
percentage_delta: float
size_base: int
size_head: int


AssetMatch = Tuple[Optional[AssetReport], Optional[AssetReport]]


class AssetComparison:
def __init__(
self,
base_asset_report: Optional[AssetReport] = None,
head_asset_report: Optional[AssetReport] = None,
):
self.base_asset_report = base_asset_report
self.head_asset_report = head_asset_report

@sentry_sdk.trace
def asset_change(self) -> AssetChange:
if self.base_asset_report is None:
return AssetChange(
asset_name=self.head_asset_report.name,
change_type=AssetChange.ChangeType.ADDED,
size_delta=self.head_asset_report.size,
percentage_delta=100,
size_base=0,
size_head=self.head_asset_report.size,
)
elif self.head_asset_report is None:
return AssetChange(
asset_name=self.base_asset_report.name,
change_type=AssetChange.ChangeType.REMOVED,
size_delta=-self.base_asset_report.size,
percentage_delta=-100.0,
size_base=self.base_asset_report.size,
size_head=0,
)
else:
size_delta = self.head_asset_report.size - self.base_asset_report.size
percentage_delta = round(
(size_delta / self.base_asset_report.size) * 100, 2
)
return AssetChange(
asset_name=self.head_asset_report.name,
change_type=AssetChange.ChangeType.CHANGED,
size_delta=size_delta,
percentage_delta=percentage_delta,
size_base=self.base_asset_report.size,
size_head=self.head_asset_report.size,
)

def contributing_modules(
self, pr_changed_files: Optional[List[str]] = None
) -> List[ModuleReport]:
asset_report = self.head_asset_report
if asset_report is None:
return []
return asset_report.modules(pr_changed_files)


class BundleComparison:
def __init__(
self, base_bundle_report: BundleReport, head_bundle_report: BundleReport
Expand All @@ -95,7 +151,7 @@ def total_size_delta(self) -> int:
return head_size - base_size

@sentry_sdk.trace
def asset_changes(self) -> List[AssetChange]:
def asset_comparisons(self) -> List[AssetComparison]:
# this groups assets by name
# there can be multiple assets with the same name and we
# need to try and match them across base and head reports
Expand All @@ -119,29 +175,10 @@ def asset_changes(self) -> List[AssetChange]:
if asset_name not in asset_names:
matches += self._match_assets(asset_reports, [])

changes = []
for base_asset_report, head_asset_report in matches:
if base_asset_report is None:
change = AssetChange(
asset_name=head_asset_report.name,
change_type=AssetChange.ChangeType.ADDED,
size_delta=head_asset_report.size,
)
elif head_asset_report is None:
change = AssetChange(
asset_name=base_asset_report.name,
change_type=AssetChange.ChangeType.REMOVED,
size_delta=-base_asset_report.size,
)
else:
change = AssetChange(
asset_name=head_asset_report.name,
change_type=AssetChange.ChangeType.CHANGED,
size_delta=head_asset_report.size - base_asset_report.size,
)
changes.append(change)

return changes
return [
AssetComparison(base_asset_report, head_asset_report)
for base_asset_report, head_asset_report in matches
]

def _match_assets(
self,
Expand All @@ -153,9 +190,11 @@ def _match_assets(
This method attempts to pick the most likely matching of assets between
base and head (so as to track their changes through time).

The current approach is fairly naive and just picks the asset with the
closest size. There are probably better ways of doing this that we can
improve upon in the future.
Current approach:
1. Pick asset with the same UUID. This means the base and head assets have either of:
- same hashed name
- same modules by name
2. Pick asset with the closest size
"""
n = max([len(base_asset_reports), len(head_asset_reports)])
matches: List[AssetMatch] = []
Expand All @@ -169,13 +208,24 @@ def _match_assets(
# no more base assets to match against
matches.append((None, head_asset_report))
else:
# try and find the most "similar" base asset
size_deltas = {
abs(head_asset_report.size - base_bundle.size): base_bundle
# 1. Pick asset with the same UUID
base_asset_report_uuids = {
base_bundle.uuid: base_bundle
for base_bundle in base_asset_reports
}
min_delta = min(size_deltas.keys())
base_asset_report = size_deltas[min_delta]
if head_asset_report.uuid in base_asset_report_uuids:
base_asset_report = base_asset_report_uuids[
head_asset_report.uuid
]

# 2. Pick asset with the closest size
else:
size_deltas = {
abs(head_asset_report.size - base_bundle.size): base_bundle
for base_bundle in base_asset_reports
}
min_delta = min(size_deltas.keys())
base_asset_report = size_deltas[min_delta]

matches.append((base_asset_report, head_asset_report))
base_asset_reports.remove(base_asset_report)
Expand Down
11 changes: 11 additions & 0 deletions shared/bundle_analysis/parsers/v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import ijson
import sentry_sdk
from sqlalchemy import tuple_
from sqlalchemy.orm import Session as DbSession
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

Expand Down Expand Up @@ -160,6 +161,16 @@ def parse(self, path: str) -> Tuple[int, str]:
# but first we need to find the Asset by the hashed file name
dynamic_imports_list = self._parse_dynamic_imports()
if dynamic_imports_list:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Delete the old entries in the table before inserting the new data.

self.db_session.execute(
DynamicImport.__table__.delete().where(
tuple_(DynamicImport.chunk_id, DynamicImport.asset_id).in_(
[
(item["chunk_id"], item["asset_id"])
for item in dynamic_imports_list
]
)
)
)
insert_dynamic_imports = DynamicImport.__table__.insert().values(
dynamic_imports_list
)
Expand Down
20 changes: 16 additions & 4 deletions shared/bundle_analysis/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,28 @@ def uuid(self) -> str:
def asset_type(self) -> AssetType:
return self.asset.asset_type

def modules(self) -> List[ModuleReport]:
def modules(
self, pr_changed_files: Optional[List[str]] = None
) -> List[ModuleReport]:
with get_db_session(self.db_path) as session:
modules = (
query = (
session.query(Module)
.join(Module.chunks)
.join(Chunk.assets)
.filter(Asset.id == self.asset.id)
.all()
)
return [ModuleReport(self.db_path, module) for module in modules]

# Apply additional filter if pr_changed_files is provided
if pr_changed_files:
# Normalize pr_changed_files to have './' prefix for relative paths
normalized_files = {
f"./{file}" if not file.startswith("./") else file
for file in pr_changed_files
}
query = query.filter(Module.name.in_(normalized_files))

modules = query.all()
return [ModuleReport(self.db_path, module) for module in modules]

def routes(self) -> Optional[List[str]]:
plugin_name = self.bundle_info.get("plugin_name")
Expand Down
Loading
Loading