diff --git a/app/api_schemas.py b/app/api_schemas.py index a3117a0..96c8278 100644 --- a/app/api_schemas.py +++ b/app/api_schemas.py @@ -88,6 +88,7 @@ class SearchQuery(Schema): org_slug = String() org_type = Enum(ORGANIZATION_TYPE_ENUM) keyword = List(String()) + publisher = String() after = String() spatial_filter = Enum(SPATIAL_FILTER_ENUM) spatial_feature = GeoJson() @@ -125,6 +126,16 @@ class OrganizationsResults(Schema): total = Integer() +class PublisherResponse(Schema): + name = String() + count = Integer() + + +class PublishersResults(Schema): + publishers = List(Nested(PublisherResponse)) + total = Integer() + + class OpensearchHealth(Schema): status = String() diff --git a/app/database/interface.py b/app/database/interface.py index 8dd4060..e61abbc 100644 --- a/app/database/interface.py +++ b/app/database/interface.py @@ -70,6 +70,7 @@ def search_datasets( per_page=DEFAULT_PER_PAGE, org_id=None, org_types=None, + publisher: str | None = None, spatial_filter=None, spatial_geometry=None, spatial_within=True, @@ -78,6 +79,7 @@ def search_datasets( include_aggregations: bool = False, keyword_size: int = 100, org_size: int = 100, + publisher_size: int = 100, *args, **kwargs, ): @@ -108,6 +110,7 @@ def search_datasets( per_page=per_page, org_id=org_id, org_types=org_types, + publisher=publisher, search_after=search_after, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, @@ -116,6 +119,7 @@ def search_datasets( include_aggregations=include_aggregations, keyword_size=keyword_size, org_size=org_size, + publisher_size=publisher_size, ) def get_unique_keywords(self, size=100, min_doc_count=1) -> list[dict]: @@ -135,11 +139,13 @@ def get_contextual_aggregations( org_id=None, org_types=None, keywords: list[str] = None, + publisher: str | None = None, spatial_filter=None, spatial_geometry=None, spatial_within=True, keyword_size=100, org_size=100, + publisher_size=100, ) -> dict: """ Get keyword and organization aggregations based on current search context. @@ -153,14 +159,20 @@ def get_contextual_aggregations( per_page=0, org_id=org_id, org_types=org_types, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, spatial_within=spatial_within, include_aggregations=True, keyword_size=keyword_size, org_size=org_size, + publisher_size=publisher_size, ) - return result.aggregations or {"keywords": [], "organizations": []} + return result.aggregations or { + "keywords": [], + "organizations": [], + "publishers": [], + } def search_locations(self, query, size=100): """ @@ -296,9 +308,14 @@ def list_datasets_for_organization( dataset_search_query: str = "", num_results=DEFAULT_PER_PAGE, keywords: list[str] | None = None, + publisher: str | None = None, spatial_filter: str | None = None, spatial_geometry: dict | None = None, spatial_within: bool = True, + include_aggregations: bool = False, + keyword_size: int = 100, + org_size: int = 100, + publisher_size: int = 100, ) -> SearchResult: if not organization_id: return SearchResult.empty() @@ -309,9 +326,14 @@ def list_datasets_for_organization( org_id=organization_id, sort_by=sort_by, per_page=num_results, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, spatial_within=spatial_within, + include_aggregations=include_aggregations, + keyword_size=keyword_size, + org_size=org_size, + publisher_size=publisher_size, ) def get_opensearch_org_dataset_counts(self, as_dict=False): @@ -402,6 +424,18 @@ def _get_organizations_from_db(self) -> list[dict]: for row in rows ] + def get_top_publishers(self) -> list[dict]: + """Return the top 100 publishers ordered by dataset count.""" + publishers = self.opensearch.get_publisher_counts(size=100) + + return sorted( + [item for item in publishers if item.get("name")], + key=lambda item: ( + -int(item.get("count", 0)), + (item.get("name") or "").lower(), + ), + ) + @staticmethod def to_dict(obj: Any) -> dict[str, Any] | None: if obj is None: diff --git a/app/database/opensearch.py b/app/database/opensearch.py index 5d6c0a3..3ab93ce 100644 --- a/app/database/opensearch.py +++ b/app/database/opensearch.py @@ -57,7 +57,7 @@ def from_opensearch_result(cls, result_dict: dict, per_page_hint=0): When the search body included aggregation "clauses", the parsed `aggregations` dict will be populated on the returned instance with - `keywords` and `organizations` lists. + `keywords`, `organizations`, and `publishers` lists. """ total = result_dict["hits"]["total"]["value"] @@ -97,6 +97,7 @@ def from_opensearch_result(cls, result_dict: dict, per_page_hint=0): org_buckets = ( raw_aggs.get("organizations", {}).get("by_slug", {}).get("buckets", []) ) + publisher_buckets = raw_aggs.get("unique_publishers", {}).get("buckets", []) aggregations = { "keywords": [ {"keyword": b["key"], "count": b["doc_count"]} @@ -105,6 +106,10 @@ def from_opensearch_result(cls, result_dict: dict, per_page_hint=0): "organizations": [ {"slug": b["key"], "count": b["doc_count"]} for b in org_buckets ], + "publishers": [ + {"name": b["key"], "count": b["doc_count"]} + for b in publisher_buckets + ], } return cls( @@ -196,6 +201,13 @@ class OpenSearchInterface: "type": "text", "analyzer": TEXT_ANALYZER, "search_analyzer": TEXT_ANALYZER, + "fields": { + "raw": {"type": "keyword"}, + "normalized": { + "type": "keyword", + "normalizer": KEYWORD_NORMALIZER, + }, + }, }, "keyword": { "type": "text", @@ -895,6 +907,7 @@ def search( org_id=None, search_after: list = None, org_types=None, + publisher: str | None = None, spatial_filter=None, spatial_geometry=None, spatial_within=True, @@ -903,6 +916,7 @@ def search( include_aggregations: bool = False, keyword_size: int = 100, org_size: int = 100, + publisher_size: int = 100, ) -> SearchResult: """Search our index for a query string. @@ -1029,6 +1043,9 @@ def search( } ) + if publisher: + filters.append({"term": {"publisher.normalized": publisher.lower()}}) + # Add spatial filter if spatial_filter == "geospatial": filters.append({"term": {"has_spatial": True}}) @@ -1062,7 +1079,7 @@ def search( if search_after is not None: search_body["search_after"] = search_after - # `keyword` and `organization` aggregations for the chips + # `keyword`, `organization`, and `publisher` aggregations for the chips if include_aggregations: search_body["aggs"] = { "unique_keywords": { @@ -1086,6 +1103,14 @@ def search( } }, }, + "unique_publishers": { + "terms": { + "field": "publisher.raw", + "size": publisher_size, + "min_doc_count": 1, + "order": {"_count": "desc"}, + } + }, } # print("QUERY:", search_body) @@ -1175,6 +1200,41 @@ def get_organization_counts( {"slug": bucket["key"], "count": bucket["doc_count"]} for bucket in buckets ] + def get_publisher_counts( + self, size=100, min_doc_count=1, as_dict=False + ) -> list[dict] | dict[str, int]: + """Aggregate datasets by publisher name to get counts.""" + agg_body = { + "size": 0, + "aggs": { + "unique_publishers": { + "terms": { + "field": "publisher.raw", + "size": size, + "min_doc_count": min_doc_count, + "order": {"_count": "desc"}, + } + } + }, + } + + result = self.client.search(index=self.INDEX_NAME, body=agg_body) + buckets = ( + result.get("aggregations", {}) + .get("unique_publishers", {}) + .get("buckets", []) + ) + + if as_dict: + output = {} + for bucket in buckets: + output[bucket["key"]] = bucket["doc_count"] + return output + + return [ + {"name": bucket["key"], "count": bucket["doc_count"]} for bucket in buckets + ] + def get_last_harvested_stats(self) -> dict[str, Any]: """Get dataset age-bin counts.""" @@ -1241,11 +1301,13 @@ def get_contextual_aggregations( org_id=None, org_types=None, keywords: list[str] = None, + publisher: str | None = None, spatial_filter=None, spatial_geometry=None, spatial_within=True, keyword_size=100, org_size=100, + publisher_size=100, ) -> dict: """ Get keyword and organization aggregations based on current search context. @@ -1313,6 +1375,9 @@ def get_contextual_aggregations( } ) + if publisher: + filters.append({"term": {"publisher.normalized": publisher.lower()}}) + if spatial_filter == "geospatial": filters.append({"term": {"has_spatial": True}}) elif spatial_filter == "non-geospatial": @@ -1406,6 +1471,40 @@ def get_contextual_aggregations( .get("buckets", []) ) + if filters: + publisher_query = { + "bool": { + "filter": filters, + "must": [base_query], + } + } + else: + publisher_query = base_query + + publisher_agg_body = { + "size": 0, + "query": publisher_query, + "aggs": { + "unique_publishers": { + "terms": { + "field": "publisher.raw", + "size": publisher_size, + "min_doc_count": 1, + "order": {"_count": "desc"}, + } + } + }, + } + + publisher_result = self.client.search( + index=self.INDEX_NAME, body=publisher_agg_body + ) + publisher_buckets = ( + publisher_result.get("aggregations", {}) + .get("unique_publishers", {}) + .get("buckets", []) + ) + return { "keywords": [ {"keyword": bucket["key"], "count": bucket["doc_count"]} @@ -1415,4 +1514,8 @@ def get_contextual_aggregations( {"slug": bucket["key"], "count": bucket["doc_count"]} for bucket in org_buckets ], + "publishers": [ + {"name": bucket["key"], "count": bucket["doc_count"]} + for bucket in publisher_buckets + ], } diff --git a/app/routes.py b/app/routes.py index f77a34c..a553629 100644 --- a/app/routes.py +++ b/app/routes.py @@ -29,6 +29,7 @@ LocationsResults, OpensearchHealth, OrganizationsResults, + PublishersResults, SearchQuery, SearchResults, StatsResult, @@ -188,6 +189,7 @@ def index(): org_slug_param = (request.args.get("org_slug", None, type=str) or "").strip() org_types = request.args.getlist("org_type") keywords = request.args.getlist("keyword") + publisher = (request.args.get("publisher", None, type=str) or "").strip() or None spatial_filter = request.args.get("spatial_filter", None, type=str) spatial_geometry = request.args.get("spatial_geometry", type=str) spatial_within = _parse_bool_param(request.args.get("spatial_within"), True) @@ -237,6 +239,7 @@ def index(): query or org_types or keywords + or publisher or org_filter_id or spatial_filter or spatial_geometry @@ -249,6 +252,7 @@ def index(): per_page=num_results, org_id=org_filter_id, org_types=org_types, + publisher=publisher, sort_by=sort_by, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, @@ -256,6 +260,7 @@ def index(): include_aggregations=True, keyword_size=100, org_size=100, + publisher_size=100, ) # For homepage without filters, get accurate total count @@ -280,7 +285,7 @@ def index(): contextual_aggs = ( result.aggregations if result is not None and result.aggregations is not None - else {"keywords": [], "organizations": []} + else {"keywords": [], "organizations": [], "publishers": []} ) contextual_keyword_counts = { item["keyword"]: item["count"] for item in contextual_aggs.get("keywords", []) @@ -288,6 +293,9 @@ def index(): contextual_org_counts = { item["slug"]: item["count"] for item in contextual_aggs.get("organizations", []) } + contextual_publisher_counts = { + item["name"]: item["count"] for item in contextual_aggs.get("publishers", []) + } # Always compute suggested keywords from contextual aggregations, # excluding any already-selected keywords so users can keep refining. @@ -338,6 +346,16 @@ def index(): except Exception: logger.exception("Failed to fetch suggested organizations") + suggested_publishers = [ + item["name"] + for item in sorted( + contextual_aggs.get("publishers", []), + key=lambda x: x["count"], + reverse=True, + ) + if item["name"] != publisher + ][:10] + # Only emit a return-to-search hint when there is actual search or filter state. from_hint = hint_from_dict(request.args) if request.args else None search_result_geometries = ( @@ -357,9 +375,11 @@ def index(): ), org_types=org_types, keywords=keywords, + publisher=publisher, sort_by=sort_by, suggested_keywords=suggested_keywords, suggested_organizations=suggested_organizations, + suggested_publishers=suggested_publishers, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, search_result_geometries=search_result_geometries, @@ -368,6 +388,7 @@ def index(): selected_organization=selected_organization, contextual_keyword_counts=contextual_keyword_counts, contextual_org_counts=contextual_org_counts, + contextual_publisher_counts=contextual_publisher_counts, ) @@ -395,6 +416,7 @@ def search(**kwargs): org_slug_param = (request.args.get("org_slug", None, type=str) or "").strip() org_types = request.args.getlist("org_type") keywords = request.args.getlist("keyword") + publisher = (request.args.get("publisher", None, type=str) or "").strip() or None after = request.args.get("after") spatial_filter = request.args.get("spatial_filter", None, type=str) spatial_geometry = request.args.get("spatial_geometry", type=str) @@ -439,6 +461,7 @@ def search(**kwargs): per_page=per_page, org_id=org_filter_id, org_types=org_types, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, spatial_within=spatial_within, @@ -466,6 +489,7 @@ def search(**kwargs): organization_slug_or_id=selected_organization.slug, spatial_geometry=spatial_geometry, spatial_within=spatial_within, + publisher=publisher, ) return render_template( "components/dataset_results.html", @@ -485,6 +509,7 @@ def search(**kwargs): spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, spatial_within=spatial_within, + publisher=publisher, ) response_dict = { @@ -632,6 +657,7 @@ def organization_detail(slug: str): dataset_search_query = request.args.get("q", default="", type=str).strip() num_results = request.args.get("results", default=DEFAULT_PER_PAGE, type=int) keywords = request.args.getlist("keyword") + publisher = (request.args.get("publisher", None, type=str) or "").strip() or None spatial_filter = request.args.get("spatial_filter", None, type=str) spatial_geometry = request.args.get("spatial_geometry", type=str) spatial_within = _parse_bool_param(request.args.get("spatial_within"), True) @@ -652,26 +678,19 @@ def organization_detail(slug: str): ) sort_by = _normalize_sort(sort_by, spatial_geometry) - suggested_keywords: list[str] = [] - if not keywords: - try: - suggested_keywords = interface.get_unique_keywords(size=10, min_doc_count=1) - if suggested_keywords: - suggested_keywords = [ - keyword["keyword"] for keyword in suggested_keywords - ] - except Exception: - logger.exception("Failed to fetch suggested keywords") - dataset_result = interface.list_datasets_for_organization( organization.id, dataset_search_query=dataset_search_query, sort_by=sort_by, num_results=num_results, keywords=keywords, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, spatial_within=spatial_within, + include_aggregations=True, + keyword_size=100, + publisher_size=100, ) after = dataset_result.search_after_obscured() search_result_geometries = ( @@ -679,6 +698,35 @@ def organization_detail(slug: str): if spatial_geometry is not None else [] ) + contextual_aggs = dataset_result.aggregations or { + "keywords": [], + "organizations": [], + "publishers": [], + } + contextual_keyword_counts = { + item["keyword"]: item["count"] for item in contextual_aggs.get("keywords", []) + } + contextual_publisher_counts = { + item["name"]: item["count"] for item in contextual_aggs.get("publishers", []) + } + suggested_keywords = [ + item["keyword"] + for item in sorted( + contextual_aggs.get("keywords", []), + key=lambda x: x["count"], + reverse=True, + ) + if item["keyword"] not in set(keywords) + ][:10] + suggested_publishers = [ + item["name"] + for item in sorted( + contextual_aggs.get("publishers", []), + key=lambda x: x["count"], + reverse=True, + ) + if item["name"] != publisher + ][:10] slug_or_id = organization.slug or slug @@ -701,11 +749,15 @@ def organization_detail(slug: str): selected_sort=sort_by, dataset_search_query=dataset_search_query, keywords=keywords, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry, spatial_within=spatial_within, search_result_geometries=search_result_geometries, suggested_keywords=suggested_keywords, + suggested_publishers=suggested_publishers, + contextual_keyword_counts=contextual_keyword_counts, + contextual_publisher_counts=contextual_publisher_counts, ) @@ -801,6 +853,26 @@ def get_organizations_api(**kwargs): return response +@api.route("/api/publishers", methods=["GET"]) +@api.output(PublishersResults) +@api.doc(description="Get the top 100 publishers") +def get_publishers_api(**kwargs): + """Fetch the top 100 publishers.""" + + try: + publishers = interface.get_top_publishers() + return jsonify( + { + "publishers": publishers, + "total": len(publishers), + } + ) + except Exception as e: + response = jsonify({"error": "Failed to fetch publishers", "message": str(e)}) + response.status_code = 500 + return response + + @api.get("/api/opensearch/health") @api.output(OpensearchHealth) def get_opensearch_health_api(): diff --git a/app/static/js/filters_autocomplete.js b/app/static/js/filters_autocomplete.js index 08e4822..c1d01f9 100644 --- a/app/static/js/filters_autocomplete.js +++ b/app/static/js/filters_autocomplete.js @@ -762,6 +762,337 @@ class OrganizationAutocomplete { } } +class PublisherAutocomplete { + constructor(options) { + this.inputId = options.inputId; + this.chipsContainerId = options.chipsContainerId; + this.suggestionsId = options.suggestionsId; + this.apiEndpoint = options.apiEndpoint || '/api/publishers'; + this.formId = options.formId; + this.mainSearchFormId = options.mainSearchFormId; + this.debounceDelay = options.debounceDelay || 300; + this.suggestedContainerId = options.suggestedContainerId || 'suggested-publishers'; + + this.input = document.getElementById(this.inputId); + this.chipsContainer = document.getElementById(this.chipsContainerId); + this.suggestionsContainer = document.getElementById(this.suggestionsId); + this.form = document.getElementById(this.formId); + this.mainSearchForm = document.getElementById(this.mainSearchFormId); + this.suggestedContainer = null; + + this.publishers = []; + this.selectedPublisher = null; + this.debounceTimer = null; + this.currentFocusIndex = -1; + this.numberFormatter = new Intl.NumberFormat(); + this.contextualCounts = {}; + this.initialSelection = options.initialSelection || this.getInitialSelection(); + + if (!this.input || !this.chipsContainer || !this.suggestionsContainer) { + console.error('PublisherAutocomplete: Required elements not found'); + return; + } + + const countsData = this.chipsContainer.dataset.contextualCounts; + if (countsData) { + try { + this.contextualCounts = JSON.parse(countsData); + } catch (e) { + console.error('Failed to parse publisher contextual counts:', e); + } + } + + this.init(); + } + + init() { + this.loadPublishers(); + this.initSuggestedPublishers(); + + if (this.initialSelection) { + this.setPublisher(this.initialSelection, { silent: true }); + this.hideSuggestedPublishers(); + } + + this.input.addEventListener('input', (e) => this.handleInput(e)); + this.input.addEventListener('keydown', (e) => this.handleKeyDown(e)); + this.input.addEventListener('focus', () => this.showSuggestions()); + + document.addEventListener('click', (e) => { + if (!this.input.contains(e.target) && !this.suggestionsContainer.contains(e.target)) { + this.hideSuggestions(); + } + }); + + if (this.form) { + this.form.addEventListener('submit', () => this.syncHiddenInputs()); + } + + if (this.mainSearchForm) { + this.mainSearchForm.addEventListener('submit', () => this.syncHiddenInputs()); + } + } + + getInitialSelection() { + if (!this.chipsContainer || !this.chipsContainer.dataset) { + return null; + } + + return this.chipsContainer.dataset.initialPublisherName || null; + } + + async loadPublishers() { + try { + const response = await fetch(this.apiEndpoint); + const data = await response.json(); + this.publishers = (data.publishers || []).filter((item) => item.name); + } catch (error) { + console.error('Error loading publishers:', error); + this.publishers = []; + } + } + + initSuggestedPublishers() { + this.suggestedContainer = document.getElementById(this.suggestedContainerId); + if (!this.suggestedContainer) { + return; + } + + const buttons = this.suggestedContainer.querySelectorAll('.tag-link--publisher'); + buttons.forEach((button) => { + button.addEventListener('click', () => { + const publisherName = button.dataset.publisherName; + if (publisherName) { + this.setPublisher(publisherName); + this.input.value = ''; + this.hideSuggestedPublishers(); + } + }); + }); + } + + handleInput(e) { + clearTimeout(this.debounceTimer); + this.debounceTimer = setTimeout(() => { + const query = e.target.value.trim().toLowerCase(); + if (query.length === 0) { + this.hideSuggestions(); + } else { + this.filterAndShowSuggestions(query); + } + }, this.debounceDelay); + } + + handleKeyDown(e) { + const suggestions = this.suggestionsContainer.querySelectorAll('.keyword-suggestion'); + + if (e.key === 'ArrowDown') { + e.preventDefault(); + this.currentFocusIndex = Math.min(this.currentFocusIndex + 1, suggestions.length - 1); + this.updateSuggestionFocus(suggestions); + } else if (e.key === 'ArrowUp') { + e.preventDefault(); + this.currentFocusIndex = Math.max(this.currentFocusIndex - 1, 0); + this.updateSuggestionFocus(suggestions); + } else if (e.key === 'Enter' || e.key === 'Tab') { + if (this.currentFocusIndex >= 0 && suggestions[this.currentFocusIndex]) { + e.preventDefault(); + const publisherName = suggestions[this.currentFocusIndex].dataset.publisherName; + if (publisherName) { + this.setPublisher(publisherName); + } + this.input.value = ''; + this.hideSuggestions(); + } + } else if (e.key === 'Escape') { + this.hideSuggestions(); + } + } + + updateSuggestionFocus(suggestions) { + suggestions.forEach((item, index) => { + if (index === this.currentFocusIndex) { + item.classList.add('keyword-suggestion--focused'); + item.scrollIntoView({ block: 'nearest' }); + } else { + item.classList.remove('keyword-suggestion--focused'); + } + }); + } + + filterAndShowSuggestions(query) { + const filtered = this.publishers.filter((item) => { + const name = (item.name || '').toLowerCase(); + const alreadySelected = this.selectedPublisher + && this.selectedPublisher.toLowerCase() === name; + const hasContextualCount = Object.keys(this.contextualCounts).length > 0; + const count = this.contextualCounts[item.name]; + const hasCount = !hasContextualCount || count > 0; + + return name.includes(query) && !alreadySelected && hasCount; + }); + + const topResults = filtered.slice(0, 10); + + if (topResults.length > 0) { + this.renderSuggestions(topResults); + this.showSuggestions(); + } else { + this.hideSuggestions(); + } + } + + renderSuggestions(items) { + this.suggestionsContainer.innerHTML = ''; + this.currentFocusIndex = -1; + + items.forEach((item) => { + const div = document.createElement('div'); + div.className = 'keyword-suggestion'; + div.dataset.publisherName = item.name; + + const displayCount = this.contextualCounts[item.name] !== undefined + ? this.contextualCounts[item.name] + : item.count; + + div.innerHTML = ` + ${this.highlightMatch(item.name, this.input.value)} + ${this.formatCount(displayCount || 0)} + `; + + div.addEventListener('click', () => { + this.setPublisher(item.name); + this.input.value = ''; + this.hideSuggestions(); + }); + + this.suggestionsContainer.appendChild(div); + }); + } + + setPublisher(name, options = {}) { + const silent = Boolean(options.silent); + if (!name) { + return; + } + + this.selectedPublisher = name; + this.renderChip(name); + this.syncHiddenInputs(); + this.hideSuggestedPublishers(); + + if (!silent) { + requestFilterFormSubmit(this.form); + } + } + + renderChip(name) { + this.chipsContainer.innerHTML = ''; + + const chip = document.createElement('div'); + chip.className = 'tag-link'; + chip.dataset.publisherName = name; + const count = this.contextualCounts[name]; + const countHtml = count ? ` (${this.formatCount(count)})` : ''; + chip.innerHTML = ` + ${this.escapeHtml(name)}${countHtml} + + `; + + const removeBtn = chip.querySelector('.keyword-chip__remove'); + removeBtn.addEventListener('click', () => { + this.clearSelection(); + }); + + this.chipsContainer.appendChild(chip); + } + + clearSelection() { + this.selectedPublisher = null; + this.chipsContainer.innerHTML = ''; + this.syncHiddenInputs(); + this.showSuggestedPublishers(); + requestFilterFormSubmit(this.form); + } + + syncHiddenInputs() { + this.syncFormHiddenInputs(this.form); + this.syncFormHiddenInputs(this.mainSearchForm); + } + + syncFormHiddenInputs(form) { + if (!form) { + return; + } + + const existing = form.querySelectorAll('input[name="publisher"][type="hidden"]'); + existing.forEach((input) => input.remove()); + + if (this.selectedPublisher) { + const input = document.createElement('input'); + input.type = 'hidden'; + input.name = 'publisher'; + input.value = this.selectedPublisher; + form.appendChild(input); + } + } + + highlightMatch(text, query) { + if (!text) { + return ''; + } + + const normalizedText = text.toLowerCase(); + const normalizedQuery = query.toLowerCase(); + const index = normalizedText.indexOf(normalizedQuery); + if (index === -1 || !query) { + return this.escapeHtml(text); + } + + const before = this.escapeHtml(text.substring(0, index)); + const match = this.escapeHtml(text.substring(index, index + query.length)); + const after = this.escapeHtml(text.substring(index + query.length)); + + return `${before}${match}${after}`; + } + + showSuggestions() { + this.suggestionsContainer.classList.add('keyword-suggestions--visible'); + this.input.setAttribute('aria-expanded', 'true'); + } + + hideSuggestions() { + this.suggestionsContainer.classList.remove('keyword-suggestions--visible'); + this.currentFocusIndex = -1; + this.input.setAttribute('aria-expanded', 'false'); + } + + formatCount(value) { + const count = Number.isFinite(value) ? value : 0; + return this.numberFormatter.format(count); + } + + escapeHtml(text) { + const div = document.createElement('div'); + div.textContent = text || ''; + return div.innerHTML; + } + + hideSuggestedPublishers() { + if (this.suggestedContainer) { + this.suggestedContainer.style.display = 'none'; + } + } + + showSuggestedPublishers() { + if (this.suggestedContainer && !this.selectedPublisher) { + this.suggestedContainer.style.display = 'block'; + } + } +} + // Initialize when DOM is ready document.addEventListener('DOMContentLoaded', () => { const keywordInput = document.getElementById('keyword-input'); @@ -794,4 +1125,20 @@ document.addEventListener('DOMContentLoaded', () => { suggestedContainerId: 'suggested-organizations' }); } -}); \ No newline at end of file + + const publisherInput = document.getElementById('publisher-input'); + const publisherChips = document.getElementById('publisher-chips'); + const publisherSuggestions = document.getElementById('publisher-suggestions'); + if (publisherInput && publisherChips && publisherSuggestions) { + new PublisherAutocomplete({ + inputId: 'publisher-input', + chipsContainerId: 'publisher-chips', + suggestionsId: 'publisher-suggestions', + formId: 'filter-form', + mainSearchFormId: 'main-search-form', + apiEndpoint: '/api/publishers', + debounceDelay: 300, + suggestedContainerId: 'suggested-publishers' + }); + } +}); diff --git a/app/templates/components/dataset_results.html b/app/templates/components/dataset_results.html index 3ae055a..77c7acb 100644 --- a/app/templates/components/dataset_results.html +++ b/app/templates/components/dataset_results.html @@ -26,6 +26,7 @@ org_slug=org_slug, org_type=org_types, keyword=keywords, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry|tojson if spatial_geometry else none, spatial_within=spatial_within if spatial_geometry else none, @@ -41,6 +42,7 @@ org_slug=org_slug, org_type=org_types, keyword=keywords, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry|tojson if spatial_geometry else none, spatial_within=spatial_within if spatial_geometry else none, @@ -55,6 +57,7 @@ org_slug=org_slug, org_type=org_types, keyword=keywords, + publisher=publisher, spatial_filter=spatial_filter, spatial_geometry=spatial_geometry|tojson if spatial_geometry else none, spatial_within=spatial_within if spatial_geometry else none, diff --git a/app/templates/components/dataset_results_organization.html b/app/templates/components/dataset_results_organization.html index 502a13e..1e80a78 100644 --- a/app/templates/components/dataset_results_organization.html +++ b/app/templates/components/dataset_results_organization.html @@ -10,10 +10,10 @@
{% set new_results = results_hint + per_page %} diff --git a/app/templates/components/filter_sidebar.html b/app/templates/components/filter_sidebar.html index 08f23ac..b914ae3 100644 --- a/app/templates/components/filter_sidebar.html +++ b/app/templates/components/filter_sidebar.html @@ -1,5 +1,6 @@ {% from "components/filters/sort_filter.html" import render_sort_filter %} {% from "components/filters/keyword_filter.html" import render_keyword_filter %} +{% from "components/filters/publisher_filter.html" import render_publisher_filter %} {% from "components/filters/organization_autocomplete_filter.html" import render_organization_autocomplete_filter %} {% from "components/filters/organization_type_filter.html" import render_organization_type_filter %} {% from "components/filters/geography_filter.html" import render_geography_filter %} @@ -12,6 +13,8 @@ keywords, suggested_keywords, spatial_filter, + publisher=None, + suggested_publishers=None, search_result_geometries=None, spatial_geometry=None, form_id='filter-form', @@ -22,7 +25,8 @@ selected_organization=None, suggested_organizations=None, contextual_keyword_counts=None, - contextual_org_counts=None + contextual_org_counts=None, + contextual_publisher_counts=None ) %}

Filter selections are applied on each click

@@ -44,6 +48,8 @@ {{ render_organization_type_filter(org_types) }} {% endif %} + {{ render_publisher_filter(publisher, suggested_publishers, contextual_publisher_counts) }} + {{ render_spatial_filter(spatial_filter) }}
{% endmacro %} diff --git a/app/templates/components/filters/publisher_filter.html b/app/templates/components/filters/publisher_filter.html new file mode 100644 index 0000000..d1266a2 --- /dev/null +++ b/app/templates/components/filters/publisher_filter.html @@ -0,0 +1,61 @@ +{% macro render_publisher_filter(selected_publisher=None, suggested_publishers=None, contextual_publisher_counts=None) %} +
+

+ +

+
+
+ + +
+ +
+ + {% if suggested_publishers %} +
+

Popular publishers:

+
+ {% for publisher in suggested_publishers %} + + {% endfor %} +
+
+ {% endif %} +
+
+
+{% endmacro %} diff --git a/app/templates/index.html b/app/templates/index.html index c637001..897dc5b 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -42,6 +42,9 @@ {% for keyword in keywords %} {% endfor %} + {% if publisher %} + + {% endif %} {% if org_slug %} {% endif %} @@ -63,7 +66,7 @@
{% if datasets %}
- {% if query or org_types or keywords or org_slug or spatial_filter or spatial_geometry %} + {% if query or org_types or keywords or publisher or org_slug or spatial_filter or spatial_geometry %}

{%- if total < 2 -%} Found {{ total }} dataset matching @@ -72,7 +75,7 @@ {%- else -%} Found over {{ total }} datasets matching {%- endif %} - {%- if query and (org_types or keywords or org_slug or spatial_filter or spatial_geometry) %} "{{ query }}" and filters. + {%- if query and (org_types or keywords or publisher or org_slug or spatial_filter or spatial_geometry) %} "{{ query }}" and filters. {%- elif query %} "{{ query }}". {%- else %} filters. {%- endif -%} @@ -124,6 +127,8 @@ sort_value=sort_by, keywords=keywords, suggested_keywords=suggested_keywords, + publisher=publisher, + suggested_publishers=suggested_publishers, spatial_filter=spatial_filter, search_result_geometries=search_result_geometries, spatial_geometry=spatial_geometry, @@ -134,7 +139,8 @@ selected_organization=selected_organization, suggested_organizations=suggested_organizations, contextual_keyword_counts=contextual_keyword_counts, - contextual_org_counts=contextual_org_counts + contextual_org_counts=contextual_org_counts, + contextual_publisher_counts=contextual_publisher_counts ) }}

diff --git a/app/templates/organization_detail.html b/app/templates/organization_detail.html index 2bf770c..c0dd960 100644 --- a/app/templates/organization_detail.html +++ b/app/templates/organization_detail.html @@ -73,6 +73,9 @@

Description

{% for keyword in keywords %} {% endfor %} + {% if publisher %} + + {% endif %} {% if spatial_filter %} {% endif %} @@ -90,7 +93,7 @@

Description

{% if datasets %}
- {% if dataset_search_query or keywords or spatial_filter or spatial_geometry %} + {% if dataset_search_query or keywords or publisher or spatial_filter or spatial_geometry %}

{% if num_matches < 2 %} Found {{ num_matches }} dataset matching @@ -99,7 +102,7 @@

Description

{% else %} Found over {{ num_matches }} datasets matching {% endif %} - {% if dataset_search_query and (keywords or spatial_filter or spatial_geometry) %} "{{ dataset_search_query }}" and filters. + {% if dataset_search_query and (keywords or publisher or spatial_filter or spatial_geometry) %} "{{ dataset_search_query }}" and filters. {% elif dataset_search_query %} "{{ dataset_search_query }}". {% else %} filters. {% endif %} @@ -131,9 +134,13 @@

Description

sort_value=selected_sort, keywords=keywords, suggested_keywords=suggested_keywords, + publisher=publisher, + suggested_publishers=suggested_publishers, spatial_filter=spatial_filter, search_result_geometries=search_result_geometries, - spatial_geometry=spatial_geometry + spatial_geometry=spatial_geometry, + contextual_keyword_counts=contextual_keyword_counts, + contextual_publisher_counts=contextual_publisher_counts ) }}
diff --git a/pyproject.toml b/pyproject.toml index 6f6d161..6975e81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,3 +50,6 @@ poetry-plugin-export = ">=1.10" [tool.isort] profile = "black" skip = [".poetry", "app/static/node_modules"] + +[tool.black] +target-version = ["py312"] diff --git a/tests/test_interface.py b/tests/test_interface.py index 5645083..6b7468c 100644 --- a/tests/test_interface.py +++ b/tests/test_interface.py @@ -33,3 +33,29 @@ def _raise(_size): assert len(organizations) == 2 assert by_slug["test-org"]["dataset_count"] > 0 assert by_slug["test-org-filtered"]["dataset_count"] == 0 + + +def test_get_top_publishers_returns_top_100(interface_with_dataset, monkeypatch): + captured_size = None + + def _get_publisher_counts(size): + nonlocal captured_size + captured_size = size + return [ + {"name": "Agency Beta", "count": 1}, + {"name": "Agency Alpha", "count": 1}, + ] + + monkeypatch.setattr( + interface_with_dataset.opensearch, + "get_publisher_counts", + _get_publisher_counts, + ) + + publishers = interface_with_dataset.get_top_publishers() + + assert captured_size == 100 + assert publishers == [ + {"name": "Agency Alpha", "count": 1}, + {"name": "Agency Beta", "count": 1}, + ] diff --git a/tests/test_routes.py b/tests/test_routes.py index 95ec026..fffc7f0 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -814,6 +814,9 @@ def test_organization_detail_filters_sidebar(db_client, interface_with_dataset): keyword_section = soup.find("div", {"id": "filter-keywords"}) assert keyword_section is not None + publisher_section = soup.find("div", {"id": "filter-publishers"}) + assert publisher_section is not None + geography_section = soup.find("div", {"id": "filter-geography"}) assert geography_section is not None assert soup.find("div", {"id": "geography-map-expanded-panel"}) is not None @@ -1261,6 +1264,9 @@ def test_index_page_has_filters_sidebar(db_client): filter_form = soup.find("form", {"id": "filter-form"}) assert filter_form is not None + publisher_input = soup.find("input", {"id": "publisher-input"}) + assert publisher_input is not None + # Check for specific organization type checkboxes federal_checkbox = soup.find( "input", {"id": "filter-federal", "value": "Federal Government"} @@ -1688,6 +1694,54 @@ def test_nonexistent_keyword_returns_no_results( assert geography_panel is not None +class TestPublisherSearch: + """Test publisher filter functionality on index page.""" + + def test_publisher_filter_shows_matching_datasets( + self, interface_with_dataset, db_client + ): + dataset_dict = interface_with_dataset.db.query(Dataset).first().to_dict() + + dataset_dict["id"] = "publisher-alpha" + dataset_dict["slug"] = "publisher-alpha" + dataset_dict["dcat"] = { + "title": "Alpha Publisher Dataset", + "description": "Dataset from Alpha publisher", + "publisher": {"name": "Agency Alpha"}, + "distribution": [], + } + interface_with_dataset.db.add(Dataset(**dataset_dict)) + + dataset_dict["id"] = "publisher-beta" + dataset_dict["slug"] = "publisher-beta" + dataset_dict["dcat"] = { + "title": "Beta Publisher Dataset", + "description": "Dataset from Beta publisher", + "publisher": {"name": "Agency Beta"}, + "distribution": [], + } + interface_with_dataset.db.add(Dataset(**dataset_dict)) + interface_with_dataset.db.commit() + + interface_with_dataset.opensearch.index_datasets( + interface_with_dataset.db.query(Dataset) + ) + + with patch("app.routes.interface", interface_with_dataset): + response = db_client.get("/?publisher=Agency Alpha") + + assert response.status_code == 200 + soup = BeautifulSoup(response.text, "html.parser") + + titles = [ + item.get_text(" ", strip=True) + for item in soup.select(".usa-collection__heading") + ] + + assert any("Alpha Publisher Dataset" in title for title in titles) + assert not any("Beta Publisher Dataset" in title for title in titles) + + class TestGeospatialSearch: """Test geospatial search functionality on index page.""" @@ -1785,6 +1839,7 @@ def test_htmx_load_more_preserves_filters(interface_with_dataset, db_client): dataset_dict["slug"] = f"test-{i}" dataset_dict["dcat"]["title"] = f"test-{i}" dataset_dict["dcat"]["keyword"] = ["health", "education"] + dataset_dict["dcat"]["publisher"] = {"name": "Test Publisher"} dataset_dict["dcat"]["spatial"] = "-90.155,27.155,-90.26,27.255" interface_with_dataset.db.add(Dataset(**dataset_dict)) interface_with_dataset.db.commit() @@ -1803,6 +1858,7 @@ def test_htmx_load_more_preserves_filters(interface_with_dataset, db_client): "per_page": "10", "org_type": "Federal Government", "keyword": "health", + "publisher": "Test Publisher", "spatial_filter": "geospatial", "sort": "popularity", }, @@ -1829,6 +1885,7 @@ def test_htmx_load_more_preserves_filters(interface_with_dataset, db_client): assert params.get("q") == ["test"] assert params.get("org_type") == ["Federal Government"] assert params.get("keyword") == ["health"] + assert params.get("publisher") == ["Test Publisher"] assert params.get("spatial_filter") == ["geospatial"] assert params.get("sort") == ["popularity"] assert "after" in params @@ -1844,6 +1901,7 @@ def test_htmx_load_more_preserves_filters(interface_with_dataset, db_client): assert push_params.get("q") == ["test"] assert push_params.get("org_type") == ["Federal Government"] assert push_params.get("keyword") == ["health"] + assert push_params.get("publisher") == ["Test Publisher"] def test_htmx_load_more_with_multiple_keywords(interface_with_dataset, db_client):