Skip to content

Commit 33046cd

Browse files
authored
Merge pull request #57 from filips123/change-menu-date-parser
2 parents 17c56cf + bb1a4e0 commit 33046cd

File tree

3 files changed

+36
-91
lines changed

3 files changed

+36
-91
lines changed

API/gimvicurnik/updaters/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class DocumentInfo:
4545
May be `None` if cannot be determined.
4646
"""
4747

48-
file_extension: Optional[str] = None
48+
extension: Optional[str] = None
4949
"""
5050
The document file extension.
5151
May be `None` if cannot be determined.

API/gimvicurnik/updaters/eclassroom.py

+26-15
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,15 @@ def _get_internal_urls(self) -> Iterator[DocumentInfo]:
8686
if "contents" not in module or len(module["contents"]) == 0:
8787
continue
8888

89+
url = self.normalize_url(module["contents"][0]["fileurl"])
90+
8991
yield DocumentInfo(
90-
url=self.normalize_url(module["contents"][0]["fileurl"]),
91-
type=self._get_document_type(module["contents"][0]["fileurl"]),
92+
url=url,
93+
type=self._get_document_type(url),
9294
title=module["name"],
9395
created=datetime.fromtimestamp(module["contents"][0]["timecreated"], tz=timezone.utc),
9496
modified=datetime.fromtimestamp(module["contents"][0]["timemodified"], tz=timezone.utc),
95-
file_extension=self.normalize_url(module["contents"][0]["fileurl"]).split(".")[-1],
97+
extension=url.rsplit(".", 1)[1],
9698
)
9799

98100
def _get_external_urls(self) -> Iterator[DocumentInfo]:
@@ -124,12 +126,15 @@ def _get_external_urls(self) -> Iterator[DocumentInfo]:
124126
if content["course"] != self.config.course:
125127
continue
126128

129+
url = self.normalize_url(content["externalurl"])
130+
127131
yield DocumentInfo(
128-
url=self.normalize_url(content["externalurl"]),
129-
type=self._get_document_type(content["externalurl"]),
132+
url=url,
133+
type=self._get_document_type(url),
130134
title=content["name"],
131135
created=datetime.fromtimestamp(content["timemodified"], tz=timezone.utc),
132136
modified=datetime.fromtimestamp(content["timemodified"], tz=timezone.utc),
137+
extension=url.rsplit(".", 1)[1],
133138
)
134139

135140
@staticmethod
@@ -206,20 +211,11 @@ def document_needs_parsing(self, document: DocumentInfo) -> bool:
206211
def document_has_content(self, document: DocumentInfo) -> bool:
207212
"""Return whether the document has content."""
208213

209-
if document.file_extension == "docx":
214+
if document.extension == "docx":
210215
return True
211216

212217
return False
213218

214-
def get_content(self, document: DocumentInfo, content: bytes) -> Optional[str]:
215-
"""Get file content of docx circulars."""
216-
217-
def ignore_images(_image: Image) -> Dict:
218-
return {}
219-
220-
result = convert_to_html(io.BytesIO(content), convert_image=ignore_images)
221-
return typing.cast(str, result.value) # The generated HTML
222-
223219
@with_span(op="parse", pass_span=True)
224220
def parse_document(self, document: DocumentInfo, content: bytes, effective: date, span: Span) -> None: # type: ignore[override]
225221
"""Parse the document and store extracted data."""
@@ -246,6 +242,21 @@ def parse_document(self, document: DocumentInfo, content: bytes, effective: date
246242
# This cannot happen because only menus are provided by the API
247243
raise KeyError("Unknown parsable document type from the e-classroom")
248244

245+
@with_span(op="parse", pass_span=True)
246+
def get_content(self, document: DocumentInfo, content: bytes, span: Span) -> Optional[str]: # type: ignore[override]
247+
"""Convert content of DOCX circulars to HTML."""
248+
249+
def ignore_images(_image: Image) -> Dict:
250+
return {}
251+
252+
# Set basic Sentry span info
253+
span.set_tag("document.format", "docx")
254+
span.set_tag("document.type", document.type.value)
255+
256+
# Convert DOCX to HTML
257+
result = convert_to_html(io.BytesIO(content), convert_image=ignore_images)
258+
return typing.cast(str, result.value)
259+
249260
def _normalize_subject_name(self, name: str) -> Optional[str]:
250261
"""Normalize the subject name."""
251262

API/gimvicurnik/updaters/menu.py

+9-75
Original file line numberDiff line numberDiff line change
@@ -75,86 +75,20 @@ def get_document_title(self, document: DocumentInfo) -> str:
7575
raise KeyError("Unknown document type for menu")
7676

7777
def get_document_effective(self, document: DocumentInfo) -> datetime.date:
78-
"""Parse and return custom date formats from the document URL."""
79-
80-
short_month_to_number = {
81-
"jan": 1,
82-
"feb": 2,
83-
"mar": 3,
84-
"apr": 4,
85-
"maj": 5,
86-
"jun": 6,
87-
"jul": 7,
88-
"avg": 8,
89-
"sep": 9,
90-
"okt": 10,
91-
"nov": 11,
92-
"dec": 12,
93-
}
94-
95-
long_month_to_number = {
96-
"januar": 1,
97-
"februar": 2,
98-
"marec": 3,
99-
"april": 4,
100-
"maj": 5,
101-
"junij": 6,
102-
"julij": 7,
103-
"avgust": 8,
104-
"september": 9,
105-
"oktober": 10,
106-
"november": 11,
107-
"december": 12,
108-
}
109-
110-
url = document.url
111-
112-
# == FORMAT TYPE 1
113-
# Example: KOSILO-4jan-8jan-2021.pdf
114-
# Example: KOSILO-25jan-29jan-2021-PDF.pdf
115-
date = re.search(r"(?:KOSILO|MALICA)-(\d+)([a-z]+)-\d+[a-z]+-(\d+)(?i:-PDF)?\.[a-z]+", url) # fmt: skip
78+
"""Return the document effective date in a local timezone."""
79+
80+
# jedilnik-kosilo-YYYY-MM-DD(-popravek).pdf
81+
# jedilnik-malica-YYYY-MM-DD(-popravek).pdf
82+
date = re.search(r"jedilnik-(?:kosilo|malica)-(\d+)-(\d+)-(\d+)(?:-[\w-]*)?.pdf", document.url)
11683

11784
if date:
11885
return datetime.date(
119-
year=int(date.group(3)),
120-
month=short_month_to_number[date.group(2)],
121-
day=int(date.group(1)),
86+
year=int(date.group(1)),
87+
month=int(date.group(2)),
88+
day=int(date.group(3)),
12289
)
12390

124-
# == FORMAT TYPE 2
125-
# Example: 09-splet-oktober-1-teden-09-M.pdf
126-
# Example: 05-splet-februar-3-teden-M-PDF.pdf
127-
# Example: 04-splet-marec-2-teden-04-M-PDF-0.pdf
128-
# Example: 01-splet-september-4-teden-02-M-popravek.pdf
129-
# Example: 01-splet-januar1-teden-02-K.pdf
130-
# Example: 01-splet-september-2-teden-02.pdf
131-
# Example: 01-splet-september-2-teden-M-02.pdf
132-
date = re.search(r"\d+-splet-([a-z]+)-?(\d)-teden(?:-[MK])?-?\d*(?:-[MK])?-?\d?(?i:-PDF)?(?:-[a-z]+)?(?:-\d)?\.[a-z]+", url) # fmt: skip
133-
134-
if date:
135-
today = datetime.date.today()
136-
year = today.year
137-
138-
# Get week and month from URL
139-
week = int(date.group(2))
140-
month = long_month_to_number[date.group(1)]
141-
142-
# In case the menu is provided for the next year
143-
if today.month == 12 and month == 1:
144-
year += 1
145-
146-
# In case the menu is provided for the last year
147-
if today.month == 1 and month == 12:
148-
year -= 1
149-
150-
# Get start of nth week of the month
151-
first = datetime.date(year, month, 1)
152-
new = first + datetime.timedelta(weeks=week - 1, days=-first.weekday())
153-
154-
return new
155-
156-
# == UNKNOWN FORMAT
157-
raise MenuDateError("Unknown menu date URL format: " + url.rsplit("/", 1)[-1])
91+
raise MenuDateError("Unknown menu date URL format: " + document.url.rsplit("/", 1)[1])
15892

15993
def document_needs_parsing(self, document: DocumentInfo) -> bool:
16094
"""Return whether the document needs parsing."""

0 commit comments

Comments
 (0)