-
Notifications
You must be signed in to change notification settings.
Fork 2k
Fix lyrics Unicode corruption and escaped quotes in Genius plugin #6233
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 8 commits
a79a86d
1d49413
b831191
3f56d95
2c300fa
e5f75e9
9941ffd
f234686
8088797
9254732
4cfb1e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -190,17 +190,23 @@ def format_url(url: str, params: JSONDict | None) -> str: | |
| return f"{url}?{urlencode(params)}" | ||
|
|
||
| def get_text( | ||
| self, url: str, params: JSONDict | None = None, **kwargs | ||
| self, | ||
| url: str, | ||
| params: JSONDict | None = None, | ||
| force_utf8: bool = False, | ||
| **kwargs, | ||
| ) -> str: | ||
| """Return text / HTML data from the given URL. | ||
|
|
||
| Set the encoding to None to let requests handle it because some sites | ||
| set it incorrectly. | ||
| By default, trust the server's encoding and requests' apparent_encoding | ||
| detection. When force_utf8=True, default to UTF-8 if server doesn't | ||
| specify encoding (avoids MacRoman misdetection on some sites like Genius). | ||
| """ | ||
| url = self.format_url(url, params) | ||
| self.debug("Fetching HTML from {}", url) | ||
| r = self.get(url, **kwargs) | ||
| r.encoding = None | ||
| if force_utf8: | ||
| r.encoding = r.encoding or "utf-8" | ||
| return r.text | ||
|
|
||
| def get_json(self, url: str, params: JSONDict | None = None, **kwargs): | ||
|
|
@@ -544,6 +550,16 @@ class Genius(SearchBackend): | |
| def headers(self) -> dict[str, str]: | ||
| return {"Authorization": f"Bearer {self.config['genius_api_key']}"} | ||
|
|
||
| def get_text( | ||
| self, | ||
| url: str, | ||
| params: JSONDict | None = None, | ||
| force_utf8: bool = True, | ||
| **kwargs, | ||
| ) -> str: | ||
| """Force UTF-8 encoding for Genius to avoid MacRoman misdetection.""" | ||
| return super().get_text(url, params, force_utf8=force_utf8, **kwargs) | ||
|
|
||
| def search(self, artist: str, title: str) -> Iterable[SearchResult]: | ||
| search_data: GeniusAPI.Search = self.get_json( | ||
| self.SEARCH_URL, | ||
|
|
@@ -557,7 +573,10 @@ def search(self, artist: str, title: str) -> Iterable[SearchResult]: | |
| def scrape(cls, html: str) -> str | None: | ||
| if m := cls.LYRICS_IN_JSON_RE.search(html): | ||
| html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n") | ||
| return cls.get_soup(html_text).get_text().strip() | ||
| lyrics = cls.get_soup(html_text).get_text().strip() | ||
| # Genius embeds lyrics in JSON; escape sequences remain after parsing | ||
| lyrics = re.sub(r'\\+"', '"', lyrics) | ||
| return lyrics | ||
|
||
|
|
||
| return None | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.