Merge pull request #204 from canonical/category-consumer

petesfrench · web-flow · commit 5bffff914d07 · 2025-01-31T14:26:58.000+01:00
Create new class to process generic categories
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,3 @@
+### 5.8.0 [28-01-2025]
+**Added** Category class 
+A generic class for processing discourse categories and the topics they contain
diff --git a/canonicalwebteam/discourse/__init__.py b/canonicalwebteam/discourse/__init__.py
@@ -2,9 +2,11 @@
     Docs,  # noqa
     EngagePages,  # noqa
     Tutorials,  # noqa
+    Category,  # noqa
 )
 from canonicalwebteam.discourse.models import DiscourseAPI  # noqa
 from canonicalwebteam.discourse.parsers import (  # noqa
     DocParser,  # noqa
     TutorialParser,  # noqa
+    CategoryParser,  # noqa
 )
diff --git a/canonicalwebteam/discourse/app.py b/canonicalwebteam/discourse/app.py
@@ -695,3 +695,109 @@ def takeovers_healthcheck(self, metadata, topic_id, title=None):
             raise MarkdownError((", ").join(errors))
 
         pass
+
+
+class Category(Discourse):
+    """
+    Given a category ID and a CategoryParser, takes any data tables found in
+    the index topic and stores the data in a dictionary.
+    Builds a URL map of all topics in the category.
+    Registers a Flask view function that serves topics from a Discourse
+    category depending on the path.
+
+    :param parser: An HTML parser class
+    :param category_id: ID of a Discourse category
+    :param url_prefix: URL prefix on project
+    :param document_template: Path to a template to render page
+    :param blueprint_name: Name of the Flask blueprint
+    :param exclude_topics: Skip given posts from throwing errors
+    """
+
+    def __init__(
+        self,
+        parser,
+        category_id,
+        url_prefix,
+        document_template,
+        blueprint_name,
+        exclude_topics=[],
+    ):
+        super().__init__(parser, document_template, url_prefix, blueprint_name)
+        self.parser = parser
+        self.category_id = category_id
+        self.exclude_topics = exclude_topics
+        self.category_topics = []
+        self.parser.parse_index_topic()
+        pass
+
+        @self.blueprint.route("/")
+        @self.blueprint.route("/<path:path>")
+        def document_view(path=""):
+            """
+            A Flask view function to serve topics from a Discourse category
+            """
+            path = "/" + path
+            if path == "/":
+                document = self.parser.parse_topic(self.parser.index_topic)
+            else:
+                try:
+                    topic_id = self._get_topic_id_from_path(path)
+                except PathNotFoundError:
+                    return flask.abort(404)
+
+                if topic_id == self.parser.index_topic_id:
+                    return flask.redirect(self.url_prefix)
+
+                try:
+                    topic = self.parser.api.get_topic(topic_id)
+                except HTTPError as http_error:
+                    return flask.abort(http_error.response.status_code)
+
+                document = self.parser.parse_topic(topic)
+
+            template = flask.render_template(
+                document_template,
+                category_index_metadata=self.parser.category_index_metadata,
+                document=document,
+            )
+            return flask.make_response(template)
+
+    def _get_topic_id_from_path(self, path):
+        path = path.lstrip("/")
+        category_topics = self._query_category_topics()
+        for topic in category_topics:
+            if topic[2] == path:
+                return topic[0]
+        return None
+
+    def get_category_index_metadata(self, data_name):
+        """
+        Exposes an API to query category metadata
+
+        :param data_name: Name of the data table
+        """
+        if data_name:
+            return self.parser.category_index_metadata[data_name]
+        else:
+            return self.parser.category_index_metadata
+
+    def get_topics_in_category(self):
+        """
+        Exposes an API to query all topics in a category
+        """
+        topics_list = self._query_category_topics()
+        topics_map = {str(topic[0]): topic[2] for topic in topics_list}
+        return topics_map
+
+    def _query_category_topics(self):
+        """
+        Retrieve the category topics list from the api and store it.
+        On subsequent calls, return the stored list.
+        """
+        if self.category_topics:
+            return self.category_topics
+        else:
+            self.category_topics = self.parser.api.get_topic_list_by_category(
+                self.category_id
+            )
+            return self.category_topics
diff --git a/canonicalwebteam/discourse/models.py b/canonicalwebteam/discourse/models.py
@@ -75,13 +75,52 @@ def get_topics(self, topic_ids):
         return pages
 
     def get_topics_category(self, category_id, page=0):
+        """
+        Retrieves the full category object including metadata, groups, topics
+        """
         response = self.session.get(
             f"{self.base_url}/c/{category_id}.json?page={page}"
         )
         response.raise_for_status()
 
         return response.json()
 
+    def get_topic_list_by_category(self, category_id, limit=100, offset=0):
+        """
+        Uses data-explorer to query topics within a given category
+        Returns a list of topics 'id', 'title', 'slug'
+
+        Args:
+        - category_id [int]: The category ID
+        - limit [int]: 100 by default, also set in data explorer
+        - offset [int]: 0 by default (first page)
+        """
+        # See https://discourse.ubuntu.com/admin/plugins/explorer?id=89
+        data_explorer_id = 89
+        headers = {
+            "Accept": "application/json",
+            "Content-Type": "multipart/form-data;",
+        }
+        params = (
+            {
+                "params": (
+                    f'{{"category_id":"{category_id}", '
+                    f'"limit":"{limit}", "offset":"{offset}"}}'
+                )
+            },
+        )
+        response = self.session.post(
+            f"{self.base_url}/admin/plugins/explorer/"
+            f"queries/{data_explorer_id}/run",
+            headers=headers,
+            data=params[0],
+        )
+
+        response.raise_for_status()
+        result = response.json()
+
+        return result["rows"]
+
     def get_engage_pages_by_param(
         self,
         category_id,
diff --git a/canonicalwebteam/discourse/parsers/__init__.py b/canonicalwebteam/discourse/parsers/__init__.py
@@ -4,3 +4,6 @@
 from canonicalwebteam.discourse.parsers.tutorials import (  # noqa
     TutorialParser,  # noqa
 )
+from canonicalwebteam.discourse.parsers.category import (  # noqa
+    CategoryParser,  # noqa
+)
diff --git a/canonicalwebteam/discourse/parsers/category.py b/canonicalwebteam/discourse/parsers/category.py
@@ -0,0 +1,70 @@
+import re
+
+# Packages
+from slugify import slugify
+from bs4 import BeautifulSoup
+
+# Local
+from canonicalwebteam.discourse.parsers.base_parser import BaseParser
+
+
+class CategoryParser(BaseParser):
+    """
+    Parses tables from a Discourse topic and stores them in a dictionary
+    """
+
+    def __init__(self, api, index_topic_id, url_prefix):
+        self.category_metadata = None
+        return super().__init__(api, index_topic_id, url_prefix)
+
+    def parse_index_topic(self):
+        """
+        Retrieve the index topic raw HTML content.
+        Find any data tables (distinguished by [details="NAME"]) and store
+        them in a dictionary on self.category_index_metadata.
+        """
+        self.index_topic = self.api.get_topic(self.index_topic_id)
+        raw_index_soup = BeautifulSoup(
+            self.index_topic["post_stream"]["posts"][0]["cooked"],
+            features="html.parser",
+        )
+
+        details_sections = raw_index_soup.find_all(
+            "p", text=re.compile(r"\[details=.*\]")
+        )
+        data_tables = {}
+
+        for section in details_sections:
+            details_text = section.text
+            section_name = re.search(r"\[details=(.*)\]", details_text).group(
+                1
+            )
+            next_table = section.find_next("table")
+            if next_table:
+                data_tables[section_name] = self._parse_table(next_table)
+
+        self.category_index_metadata = data_tables
+
+    def _parse_table(self, table):
+        """
+        Parse an HTML table into a list of row dictionaries keyed by header.
+
+        :param table: HTML table element
+        """
+        headers = [slugify(th.text.strip()) for th in table.find_all("th")]
+        rows = []
+        for tr in table.find_all("tr")[1:]:
+            cells = tr.find_all("td")
+            row = {
+                headers[i]: cells[i].text.strip() for i in range(len(cells))
+            }
+            rows.append(row)
+        return rows
+
+    def parse_topic(self, topic):
+        """
+        Parse a topic and return the parsed content.
+
+        :param topic: The topic object containing HTML soup and metadata
+        """
+        return super().parse_topic(topic)
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name="canonicalwebteam.discourse",
-    version="5.7.4",
+    version="5.8.0",
     author="Canonical webteam",
     author_email="webteam@canonical.com",
     url="https://github.com/canonical/canonicalwebteam.discourse",
@@ -23,5 +23,6 @@
         "requests",
         "python-dateutil",
         "validators",
+        "python-slugify",
     ],
 )

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+### 5.8.0 [28-01-2025]`
	`2`	`+Added Category class`
	`3`	`+A generic class for processing discourse categories and the topics they contain`
Original file line number	Diff line number	Diff line change
`@@ -2,9 +2,11 @@`
`2`	`2`	`Docs, # noqa`
`3`	`3`	`EngagePages, # noqa`
`4`	`4`	`Tutorials, # noqa`
	`5`	`+ Category, # noqa`
`5`	`6`	`)`
`6`	`7`	`from canonicalwebteam.discourse.models import DiscourseAPI # noqa`
`7`	`8`	`from canonicalwebteam.discourse.parsers import ( # noqa`
`8`	`9`	`DocParser, # noqa`
`9`	`10`	`TutorialParser, # noqa`
	`11`	`+ CategoryParser, # noqa`
`10`	`12`	`)`
Original file line number	Diff line number	Diff line change
`@@ -4,3 +4,6 @@`
`4`	`4`	`from canonicalwebteam.discourse.parsers.tutorials import ( # noqa`
`5`	`5`	`TutorialParser, # noqa`
`6`	`6`	`)`
	`7`	`+from canonicalwebteam.discourse.parsers.category import ( # noqa`
	`8`	`+ CategoryParser, # noqa`
	`9`	`+)`