Merge pull request #57 from filips123/change-menu-date-parser

filips123 · web-flow · commit 33046cd1de05 · 2023-04-10T15:04:20.000+02:00
diff --git a/API/gimvicurnik/updaters/base.py b/API/gimvicurnik/updaters/base.py
@@ -45,7 +45,7 @@ class DocumentInfo:
     May be `None` if cannot be determined.
     """
 
-    file_extension: Optional[str] = None
+    extension: Optional[str] = None
     """
     The document file extension.
     May be `None` if cannot be determined.
diff --git a/API/gimvicurnik/updaters/eclassroom.py b/API/gimvicurnik/updaters/eclassroom.py
@@ -86,13 +86,15 @@ def _get_internal_urls(self) -> Iterator[DocumentInfo]:
                 if "contents" not in module or len(module["contents"]) == 0:
                     continue
 
+                url = self.normalize_url(module["contents"][0]["fileurl"])
+
                 yield DocumentInfo(
-                    url=self.normalize_url(module["contents"][0]["fileurl"]),
-                    type=self._get_document_type(module["contents"][0]["fileurl"]),
+                    url=url,
+                    type=self._get_document_type(url),
                     title=module["name"],
                     created=datetime.fromtimestamp(module["contents"][0]["timecreated"], tz=timezone.utc),
                     modified=datetime.fromtimestamp(module["contents"][0]["timemodified"], tz=timezone.utc),
-                    file_extension=self.normalize_url(module["contents"][0]["fileurl"]).split(".")[-1],
+                    extension=url.rsplit(".", 1)[1],
                 )
 
     def _get_external_urls(self) -> Iterator[DocumentInfo]:
@@ -124,12 +126,15 @@ def _get_external_urls(self) -> Iterator[DocumentInfo]:
             if content["course"] != self.config.course:
                 continue
 
+            url = self.normalize_url(content["externalurl"])
+
             yield DocumentInfo(
-                url=self.normalize_url(content["externalurl"]),
-                type=self._get_document_type(content["externalurl"]),
+                url=url,
+                type=self._get_document_type(url),
                 title=content["name"],
                 created=datetime.fromtimestamp(content["timemodified"], tz=timezone.utc),
                 modified=datetime.fromtimestamp(content["timemodified"], tz=timezone.utc),
+                extension=url.rsplit(".", 1)[1],
             )
 
     @staticmethod
@@ -206,20 +211,11 @@ def document_needs_parsing(self, document: DocumentInfo) -> bool:
     def document_has_content(self, document: DocumentInfo) -> bool:
         """Return whether the document has content."""
 
-        if document.file_extension == "docx":
+        if document.extension == "docx":
             return True
 
         return False
 
-    def get_content(self, document: DocumentInfo, content: bytes) -> Optional[str]:
-        """Get file content of docx circulars."""
-
-        def ignore_images(_image: Image) -> Dict:
-            return {}
-
-        result = convert_to_html(io.BytesIO(content), convert_image=ignore_images)
-        return typing.cast(str, result.value)  # The generated HTML
-
     @with_span(op="parse", pass_span=True)
     def parse_document(self, document: DocumentInfo, content: bytes, effective: date, span: Span) -> None:  # type: ignore[override]
         """Parse the document and store extracted data."""
@@ -246,6 +242,21 @@ def parse_document(self, document: DocumentInfo, content: bytes, effective: date
             # This cannot happen because only menus are provided by the API
             raise KeyError("Unknown parsable document type from the e-classroom")
 
+    @with_span(op="parse", pass_span=True)
+    def get_content(self, document: DocumentInfo, content: bytes, span: Span) -> Optional[str]:  # type: ignore[override]
+        """Convert the raw content of a DOCX circular to HTML and return the HTML string."""
+
+        def ignore_images(_image: Image) -> Dict:
+            return {}
+
+        # Tag the Sentry span with the document format and type
+        span.set_tag("document.format", "docx")
+        span.set_tag("document.type", document.type.value)
+
+        # Images get no attributes from `ignore_images`; `result.value` is the generated HTML
+        result = convert_to_html(io.BytesIO(content), convert_image=ignore_images)
+        return typing.cast(str, result.value)
+
     def _normalize_subject_name(self, name: str) -> Optional[str]:
         """Normalize the subject name."""
 
diff --git a/API/gimvicurnik/updaters/menu.py b/API/gimvicurnik/updaters/menu.py
@@ -75,86 +75,20 @@ def get_document_title(self, document: DocumentInfo) -> str:
             raise KeyError("Unknown document type for menu")
 
     def get_document_effective(self, document: DocumentInfo) -> datetime.date:
-        """Parse and return custom date formats from the document URL."""
-
-        short_month_to_number = {
-            "jan": 1,
-            "feb": 2,
-            "mar": 3,
-            "apr": 4,
-            "maj": 5,
-            "jun": 6,
-            "jul": 7,
-            "avg": 8,
-            "sep": 9,
-            "okt": 10,
-            "nov": 11,
-            "dec": 12,
-        }
-
-        long_month_to_number = {
-            "januar": 1,
-            "februar": 2,
-            "marec": 3,
-            "april": 4,
-            "maj": 5,
-            "junij": 6,
-            "julij": 7,
-            "avgust": 8,
-            "september": 9,
-            "oktober": 10,
-            "november": 11,
-            "december": 12,
-        }
-
-        url = document.url
-
-        # == FORMAT TYPE 1
-        # Example: KOSILO-4jan-8jan-2021.pdf
-        # Example: KOSILO-25jan-29jan-2021-PDF.pdf
-        date = re.search(r"(?:KOSILO|MALICA)-(\d+)([a-z]+)-\d+[a-z]+-(\d+)(?i:-PDF)?\.[a-z]+", url)  # fmt: skip
+        """Return the document effective date in a local timezone."""
+
+        # Expected: jedilnik-(kosilo|malica)-YYYY-MM-DD(-popravek).pdf
+        # The dot is escaped so that only a literal ".pdf" extension matches
+        date = re.search(r"jedilnik-(?:kosilo|malica)-(\d+)-(\d+)-(\d+)(?:-[\w-]*)?\.pdf", document.url)
 
         if date:
             return datetime.date(
-                year=int(date.group(3)),
-                month=short_month_to_number[date.group(2)],
-                day=int(date.group(1)),
+                year=int(date.group(1)),
+                month=int(date.group(2)),
+                day=int(date.group(3)),
             )
 
-        # == FORMAT TYPE 2
-        # Example: 09-splet-oktober-1-teden-09-M.pdf
-        # Example: 05-splet-februar-3-teden-M-PDF.pdf
-        # Example: 04-splet-marec-2-teden-04-M-PDF-0.pdf
-        # Example: 01-splet-september-4-teden-02-M-popravek.pdf
-        # Example: 01-splet-januar1-teden-02-K.pdf
-        # Example: 01-splet-september-2-teden-02.pdf
-        # Example: 01-splet-september-2-teden-M-02.pdf
-        date = re.search(r"\d+-splet-([a-z]+)-?(\d)-teden(?:-[MK])?-?\d*(?:-[MK])?-?\d?(?i:-PDF)?(?:-[a-z]+)?(?:-\d)?\.[a-z]+", url)  # fmt: skip
-
-        if date:
-            today = datetime.date.today()
-            year = today.year
-
-            # Get week and month from URL
-            week = int(date.group(2))
-            month = long_month_to_number[date.group(1)]
-
-            # In case the menu is provided for the next year
-            if today.month == 12 and month == 1:
-                year += 1
-
-            # In case the menu is provided for the last year
-            if today.month == 1 and month == 12:
-                year -= 1
-
-            # Get start of nth week of the month
-            first = datetime.date(year, month, 1)
-            new = first + datetime.timedelta(weeks=week - 1, days=-first.weekday())
-
-            return new
-
-        # == UNKNOWN FORMAT
-        raise MenuDateError("Unknown menu date URL format: " + url.rsplit("/", 1)[-1])
+        raise MenuDateError("Unknown menu date URL format: " + document.url.rsplit("/", 1)[-1])
 
     def document_needs_parsing(self, document: DocumentInfo) -> bool:
         """Return whether the document needs parsing."""