Skip to content

Commit 1570011

Browse files
ikreymert and tw4l authored
compute top page origins for each collection (#2483)
A quick PR to fix #2482: - compute topPageHosts as part of existing collection stats compute - store top 10 results in collection for now. - display in collection About sidebar - fixes #2482 Co-authored-by: Tessa Walsh <[email protected]>
1 parent 0691f43 commit 1570011

File tree

9 files changed

+117
-1
lines changed

9 files changed

+117
-1
lines changed

backend/btrixcloud/colls.py

+3
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,8 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
705705

706706
unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
707707

708+
top_page_hosts = await self.page_ops.get_top_page_hosts(crawl_ids)
709+
708710
await self.collections.find_one_and_update(
709711
{"_id": collection_id},
710712
{
@@ -715,6 +717,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
715717
"totalSize": total_size,
716718
"tags": sorted_tags,
717719
"preloadResources": preload_resources,
720+
"topPageHosts": top_page_hosts,
718721
}
719722
},
720723
)

backend/btrixcloud/db.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
) = PageOps = BackgroundJobOps = object
3333

3434

35-
CURR_DB_VERSION = "0043"
35+
CURR_DB_VERSION = "0044"
3636

3737

3838
# ============================================================================
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
"""
Migration 0044 - Recalculate collection stats
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0044"


# pylint: disable=duplicate-code
class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        # coll_ops is injected by the migration runner; may be absent,
        # in which case migrate_up logs and exits without changes.
        self.coll_ops = kwargs.get("coll_ops")

    async def migrate_up(self):
        """Perform migration up.

        Recalculate collection stats to get top host names
        """
        colls_mdb = self.mdb["collections"]

        # Without coll_ops we cannot recompute stats; log and bail out
        # rather than failing the whole migration run.
        if self.coll_ops is None:
            print(
                "Unable to set collection stats, missing coll_ops",
                flush=True,
            )
            return

        # Recompute counts/tags for every collection; a failure on one
        # collection is logged and skipped so the rest are still updated.
        async for coll in colls_mdb.find({}):
            coll_id = coll["_id"]
            try:
                await self.coll_ops.update_collection_counts_and_tags(coll_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to update page stats for collection {coll_id}: {err}",
                    flush=True,
                )

backend/btrixcloud/models.py

+12
Original file line numberDiff line numberDiff line change
@@ -1417,6 +1417,14 @@ class PreloadResource(BaseModel):
14171417
crawlId: str
14181418

14191419

1420+
# ============================================================================
class HostCount(BaseModel):
    """Page count for a single hostname (entry in topPageHosts)."""

    # hostname extracted from page URLs, e.g. "webrecorder.net"
    host: str
    # number of pages in the collection with that host
    count: int
1426+
1427+
14201428
# ============================================================================
14211429
class Collection(BaseMongoModel):
14221430
"""Org collection structure"""
@@ -1515,6 +1523,8 @@ class CollOut(BaseMongoModel):
15151523
pagesQueryUrl: str = ""
15161524
downloadUrl: Optional[str] = None
15171525

1526+
topPageHosts: List[HostCount] = []
1527+
15181528

15191529
# ============================================================================
15201530
class PublicCollOut(BaseMongoModel):
@@ -1550,6 +1560,8 @@ class PublicCollOut(BaseMongoModel):
15501560

15511561
allowPublicDownload: bool = True
15521562

1563+
topPageHosts: List[HostCount] = []
1564+
15531565

15541566
# ============================================================================
15551567
class UpdateColl(BaseModel):

backend/btrixcloud/pages.py

+29
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,35 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int:
923923
res = await cursor.to_list(1)
924924
return res[0].get("urls") if res else 0
925925

926+
async def get_top_page_hosts(
    self, crawl_ids: List[str]
) -> List[dict[str, str | int]]:
    """Get count of top page hosts across all archived items.

    Extracts the hostname from each page URL matching the given crawls
    via ``$regexFind`` on ``^https?://([^/]+)``, groups by hostname,
    and returns up to 10 ``{"host": ..., "count": ...}`` dicts sorted
    by descending count.
    """
    cursor = self.pages.aggregate(
        [
            {"$match": {"crawl_id": {"$in": crawl_ids}}},
            {
                "$addFields": {
                    "host": {
                        "$regexFind": {
                            "input": "$url",
                            "regex": "^https?://([^/]+)",
                        }
                    }
                }
            },
            # Drop pages whose URL did not match the regex (non-http(s)
            # URLs); otherwise they all group under a null host and the
            # result would violate HostCount.host: str.
            {"$match": {"host": {"$ne": None}}},
            {
                "$group": {
                    # $regexFind puts capture groups in "captures";
                    # the first (only) capture is the hostname.
                    "_id": {"$first": "$host.captures"},
                    "count": {"$count": {}},
                }
            },
            {"$sort": {"count": -1}},
            # Limit inside the pipeline so the server can use a top-k
            # sort instead of fully sorting every distinct host.
            {"$limit": 10},
        ]
    )
    res = await cursor.to_list(10)
    return [{"host": x.get("_id"), "count": x.get("count")} for x in res]
954+
926955
async def set_archived_item_page_counts(self, crawl_id: str):
927956
"""Store archived item page and unique page counts in crawl document"""
928957
page_count = await self.pages.count_documents({"crawl_id": crawl_id})

backend/test/test_collections.py

+7
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ def test_create_collection(
9494
assert data["defaultThumbnailName"] == default_thumbnail_name
9595
assert data["allowPublicDownload"]
9696

97+
assert data["topPageHosts"] == [{'count': 3, 'host': 'webrecorder.net'}]
98+
9799

98100
def test_create_public_collection(
99101
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
@@ -223,6 +225,7 @@ def test_update_collection(
223225
assert data["dateEarliest"]
224226
assert data["dateLatest"]
225227
assert data["defaultThumbnailName"]
228+
assert data["topPageHosts"]
226229

227230

228231
def test_rename_collection(
@@ -310,6 +313,7 @@ def test_add_remove_crawl_from_collection(
310313
assert data["tags"] == ["wr-test-2", "wr-test-1"]
311314
assert data["dateEarliest"]
312315
assert data["dateLatest"]
316+
assert data["topPageHosts"] == [{'count': 7, 'host': 'webrecorder.net'}]
313317

314318
# Verify it was added
315319
r = requests.get(
@@ -335,6 +339,7 @@ def test_add_remove_crawl_from_collection(
335339
assert data.get("tags", []) == []
336340
assert data.get("dateEarliest") is None
337341
assert data.get("dateLatest") is None
342+
assert data["topPageHosts"] == []
338343

339344
# Verify they were removed
340345
r = requests.get(
@@ -366,6 +371,7 @@ def test_add_remove_crawl_from_collection(
366371
assert data["tags"] == ["wr-test-2", "wr-test-1"]
367372
assert data["dateEarliest"]
368373
assert data["dateLatest"]
374+
assert data["topPageHosts"]
369375

370376

371377
def test_get_collection(crawler_auth_headers, default_org_id):
@@ -1137,6 +1143,7 @@ def test_list_public_collections(
11371143
assert collection["pageCount"] > 0
11381144
assert collection["uniquePageCount"] > 0
11391145
assert collection["totalSize"] > 0
1146+
assert collection["topPageHosts"]
11401147

11411148
# Test non-existing slug - it should return a 404 but not reveal
11421149
# whether or not an org exists with that slug

frontend/src/layouts/collections/metadataColumn.ts

+14
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
5656
label: metadata.totalSize,
5757
render: (col) => `${localize.bytes(col.totalSize)}`,
5858
})}
59+
${metadataItem({
60+
label: metadata.topPageHosts,
61+
render: (col) =>
62+
html` <table>
63+
${col.topPageHosts.map(
64+
(x) => html`
65+
<tr>
66+
<td>${x.host}</td>
67+
<td class="pl-4">${x.count}</td>
68+
</tr>
69+
`,
70+
)}
71+
</table>`,
72+
})}
5973
</btrix-desc-list>
6074
`;
6175
}

frontend/src/strings/collections/metadata.ts

+1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ export const metadata = {
55
uniquePageCount: msg("Unique Pages in Collection"),
66
pageCount: msg("Total Pages Crawled"),
77
totalSize: msg("Collection Size"),
8+
topPageHosts: msg("Top Page Hostnames"),
89
};

frontend/src/types/collection.ts

+6
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({
4141
crawlCount: z.number(),
4242
uniquePageCount: z.number(),
4343
pageCount: z.number(),
44+
topPageHosts: z.array(
45+
z.object({
46+
host: z.string(),
47+
count: z.number(),
48+
}),
49+
),
4450
totalSize: z.number(),
4551
allowPublicDownload: z.boolean(),
4652
homeUrl: z.string().url().nullable(),

0 commit comments

Comments
 (0)