Skip to content

Commit 5bffff9

Browse files
authored
Merge pull request #204 from canonical/category-consumer
Create new class to process generic categories
2 parents ff8b487 + 0152fc1 commit 5bffff9

File tree

7 files changed

+225
-1
lines changed

7 files changed

+225
-1
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
### 5.8.0 [28-01-2025]
2+
**Added** Category class
3+
A generic class for processing Discourse categories and the topics they contain

canonicalwebteam/discourse/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
Docs, # noqa
33
EngagePages, # noqa
44
Tutorials, # noqa
5+
Category, # noqa
56
)
67
from canonicalwebteam.discourse.models import DiscourseAPI # noqa
78
from canonicalwebteam.discourse.parsers import ( # noqa
89
DocParser, # noqa
910
TutorialParser, # noqa
11+
CategoryParser, # noqa
1012
)

canonicalwebteam/discourse/app.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,3 +695,109 @@ def takeovers_healthcheck(self, metadata, topic_id, title=None):
695695
raise MarkdownError((", ").join(errors))
696696

697697
pass
698+
699+
700+
class Category(Discourse):
    """
    Serve the topics of one Discourse category through a Flask blueprint.

    On construction the parser's index topic is parsed (which collects any
    data tables found in it into ``parser.category_index_metadata``) and two
    routes are registered on the blueprint: the root path renders the index
    topic, any other path is resolved to a topic of the category by its
    slug.

    ``self.blueprint`` and ``self.url_prefix`` are expected to be provided
    by the ``Discourse`` base class.

    :param parser: A HTML parser instance (e.g. a CategoryParser)
    :param category_id: ID of a Discourse category
    :param url_prefix: URL prefix on project
    :param document_template: Path to a template to render page
    :param blueprint_name: Name of the Flask blueprint
    :param exclude_topics: Skip given posts from throwing errors; stored on
                           the instance, defaults to a new empty list
    """

    def __init__(
        self,
        parser,
        category_id,
        url_prefix,
        document_template,
        blueprint_name,
        exclude_topics=None,
    ):
        super().__init__(parser, document_template, url_prefix, blueprint_name)
        self.parser = parser
        self.category_id = category_id
        # Build a fresh list per instance: a mutable default argument would
        # be shared by every Category created without this argument.
        self.exclude_topics = [] if exclude_topics is None else exclude_topics
        # Cache filled lazily by _query_category_topics()
        self.category_topics = []
        self.parser.parse_index_topic()

        @self.blueprint.route("/")
        @self.blueprint.route("/<path:path>")
        def document_view(path=""):
            """
            A Flask view function to serve topics from a Discourse category
            """
            path = "/" + path
            if path == "/":
                document = self.parser.parse_topic(self.parser.index_topic)
            else:
                try:
                    topic_id = self._get_topic_id_from_path(path)
                except PathNotFoundError:
                    return flask.abort(404)

                # The lookup signals "no such slug" by returning None, so
                # answer 404 here instead of asking the API for topic None.
                if topic_id is None:
                    return flask.abort(404)

                # The index topic is only served from the root path
                if topic_id == self.parser.index_topic_id:
                    return flask.redirect(self.url_prefix)

                try:
                    topic = self.parser.api.get_topic(topic_id)
                except HTTPError as http_error:
                    return flask.abort(http_error.response.status_code)

                document = self.parser.parse_topic(topic)

            template = flask.render_template(
                document_template,
                category_index_metadata=self.parser.category_index_metadata,
                document=document,
            )
            return flask.make_response(template)

    def _get_topic_id_from_path(self, path):
        """
        Find the topic whose slug equals the given path.

        :param path: Request path; leading slashes are ignored
        :return: The first element of the matching topic row, or None when
                 no row matches
        """
        slug = path.lstrip("/")
        # Rows are indexed positionally; presumably (id, title, slug) as
        # returned by the API's category topic list query.
        for topic in self._query_category_topics():
            if topic[2] == slug:
                return topic[0]
        return None

    def get_category_index_metadata(self, data_name=None):
        """
        Exposes an API to query category metadata

        :param data_name: Name of the data table; when omitted or empty the
                          whole metadata dictionary is returned
        :raises KeyError: If no data table with the given name exists
        """
        metadata = self.parser.category_index_metadata
        if data_name:
            return metadata[data_name]
        return metadata

    def get_topics_in_category(self):
        """
        Exposes an API to query all topics in a category

        :return: Dictionary mapping the stringified first element of each
                 topic row to the row's third element
        """
        topics_list = self._query_category_topics()
        return {str(topic[0]): topic[2] for topic in topics_list}

    def _query_category_topics(self):
        """
        Retrieve the category topics list from the api and store it.
        On subsequent calls, return the stored list.

        An empty stored list counts as "not fetched yet", so a category
        without topics is queried again on every call.
        """
        if not self.category_topics:
            self.category_topics = self.parser.api.get_topic_list_by_category(
                self.category_id
            )
        return self.category_topics

canonicalwebteam/discourse/models.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,52 @@ def get_topics(self, topic_ids):
7575
return pages
7676

7777
def get_topics_category(self, category_id, page=0):
    """
    Retrieves the full category object including metadata, groups, topics

    Requests ``/c/<category_id>.json`` for the given page and returns the
    decoded JSON body. ``raise_for_status()`` is called on the response
    before the body is decoded.
    """
    category_url = f"{self.base_url}/c/{category_id}.json?page={page}"
    response = self.session.get(category_url)
    response.raise_for_status()

    category = response.json()
    return category
8487

88+
def get_topic_list_by_category(self, category_id, limit=100, offset=0):
    """
    Uses data-explorer to query topics within a given category
    Returns a list of topics 'id', 'title', 'slug'

    Args:
    - category_id [int]: The category ID
    - limit [int]: 100 by default, also set in data explorer
    - offset [int]: 0 by default (first page)

    Returns the "rows" entry of the JSON response. ``raise_for_status()``
    is called on the response before the body is decoded.
    """
    # Function-scope import so the method does not depend on the module's
    # import block.
    import json

    # See https://discourse.ubuntu.com/admin/plugins/explorer?id=89
    data_explorer_id = 89
    headers = {
        "Accept": "application/json",
        "Content-Type": "multipart/form-data;",
    }
    # The query arguments travel as one JSON document in the "params" form
    # field, every value as a string. json.dumps escapes the values, which
    # hand-built string formatting does not.
    query_arguments = json.dumps(
        {
            "category_id": str(category_id),
            "limit": str(limit),
            "offset": str(offset),
        }
    )
    response = self.session.post(
        f"{self.base_url}/admin/plugins/explorer/"
        f"queries/{data_explorer_id}/run",
        headers=headers,
        data={"params": query_arguments},
    )

    response.raise_for_status()
    result = response.json()

    return result["rows"]
123+
85124
def get_engage_pages_by_param(
86125
self,
87126
category_id,

canonicalwebteam/discourse/parsers/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@
44
from canonicalwebteam.discourse.parsers.tutorials import ( # noqa
55
TutorialParser, # noqa
66
)
7+
from canonicalwebteam.discourse.parsers.category import ( # noqa
8+
CategoryParser, # noqa
9+
)
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import re
2+
3+
# Packages
4+
from slugify import slugify
5+
from bs4 import BeautifulSoup
6+
7+
# Local
8+
from canonicalwebteam.discourse.parsers.base_parser import BaseParser
9+
10+
11+
class CategoryParser(BaseParser):
    """
    Parses the data tables of a Discourse index topic and stores them in a
    dictionary
    """

    # Matches a "[details=NAME]" marker and captures NAME
    _DETAILS_PATTERN = re.compile(r"\[details=(.*)\]")

    def __init__(self, api, index_topic_id, url_prefix):
        self.category_metadata = None
        # Filled by parse_index_topic(); defined up front so that reading it
        # before the index topic was parsed yields None instead of raising
        # AttributeError.
        self.category_index_metadata = None
        super().__init__(api, index_topic_id, url_prefix)

    def parse_index_topic(self):
        """
        Retrieve the index topic raw html content.
        Find any data tables (distinguished by [details="NAME"]), store them
        in a dictionary and return it.

        The dictionary is also kept on ``self.category_index_metadata``. It
        maps the text captured after ``details=`` to the parsed rows of the
        first table following that marker.
        """
        self.index_topic = self.api.get_topic(self.index_topic_id)
        # Only the rendered ("cooked") HTML of the first post is inspected
        raw_index_soup = BeautifulSoup(
            self.index_topic["post_stream"]["posts"][0]["cooked"],
            features="html.parser",
        )

        # `string` is the current name of bs4's deprecated `text` filter
        details_sections = raw_index_soup.find_all(
            "p", string=self._DETAILS_PATTERN
        )
        data_tables = {}

        for section in details_sections:
            marker = self._DETAILS_PATTERN.search(section.text)
            if marker is None:
                continue
            next_table = section.find_next("table")
            if next_table:
                data_tables[marker.group(1)] = self._parse_table(next_table)

        self.category_index_metadata = data_tables
        return data_tables

    def _parse_table(self, table):
        """
        Parse a HTML table into a list of row dictionaries.

        Keys are the slugified header cell texts, values the stripped cell
        texts. The first table row is skipped. Cells without a matching
        header are ignored.

        :param table: HTML table element
        :return: List with one dictionary per table row after the first
        """
        headers = [slugify(th.text.strip()) for th in table.find_all("th")]
        rows = []
        for tr in table.find_all("tr")[1:]:
            cells = tr.find_all("td")
            # zip stops at the shorter sequence, so a row with more cells
            # than headers no longer raises IndexError.
            rows.append(
                {
                    header: cell.text.strip()
                    for header, cell in zip(headers, cells)
                }
            )
        return rows

    def parse_topic(self, topic):
        """
        Parse a topic and return the parsed content.

        Delegates unchanged to the base parser.

        :param topic: The topic object containing HTML soup and metadata
        """
        return super().parse_topic(topic)

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
setup(
66
name="canonicalwebteam.discourse",
7-
version="5.7.4",
7+
version="5.8.0",
88
author="Canonical webteam",
99
author_email="webteam@canonical.com",
1010
url="https://github.com/canonical/canonicalwebteam.discourse",
@@ -23,5 +23,6 @@
2323
"requests",
2424
"python-dateutil",
2525
"validators",
26+
"python-slugify",
2627
],
2728
)

0 commit comments

Comments
 (0)