Skip to content
29 changes: 24 additions & 5 deletions beetsplug/lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,17 +190,23 @@ def format_url(url: str, params: JSONDict | None) -> str:
return f"{url}?{urlencode(params)}"

def get_text(
    self,
    url: str,
    params: JSONDict | None = None,
    force_utf8: bool = False,
    **kwargs,
) -> str:
    """Return text / HTML data from the given URL.

    ``params`` are merged into ``url`` by ``format_url`` and any extra
    keyword arguments are passed through to ``self.get``.

    By default the response is decoded with whatever encoding the response
    object already carries. When ``force_utf8`` is true and the server's
    ``Content-Type`` header declares no charset, the body is decoded as
    UTF-8 instead (avoids MacRoman misdetection on some sites like Genius).
    A charset explicitly declared by the server is always respected.
    """
    url = self.format_url(url, params)
    self.debug("Fetching HTML from {}", url)
    # NOTE(review): ``r`` is presumably a ``requests.Response`` - confirm
    # against ``self.get``.
    r = self.get(url, **kwargs)
    if force_utf8:
        # Look at the header rather than at ``r.encoding``: requests fills in
        # ISO-8859-1 for text/* responses that declare no charset, so
        # ``r.encoding or "utf-8"`` would never fall back for HTML pages.
        content_type = r.headers.get("Content-Type") or ""
        if "charset" not in content_type.lower():
            r.encoding = "utf-8"
    return r.text

def get_json(self, url: str, params: JSONDict | None = None, **kwargs):
Expand Down Expand Up @@ -544,6 +550,16 @@ class Genius(SearchBackend):
def headers(self) -> dict[str, str]:
    """Build the ``Authorization`` header from the ``genius_api_key`` config entry."""
    api_key = self.config["genius_api_key"]
    return {"Authorization": f"Bearer {api_key}"}

def get_text(
    self,
    url: str,
    params: JSONDict | None = None,
    force_utf8: bool = True,
    **kwargs,
) -> str:
    """Fetch text through the parent ``get_text`` with UTF-8 forced by default.

    Genius pages are requested with ``force_utf8=True`` unless the caller
    says otherwise, to avoid MacRoman misdetection.
    """
    return super().get_text(
        url,
        params=params,
        force_utf8=force_utf8,
        **kwargs,
    )

def search(self, artist: str, title: str) -> Iterable[SearchResult]:
search_data: GeniusAPI.Search = self.get_json(
self.SEARCH_URL,
Expand All @@ -557,7 +573,10 @@ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
def scrape(cls, html: str) -> str | None:
if m := cls.LYRICS_IN_JSON_RE.search(html):
html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
return cls.get_soup(html_text).get_text().strip()
lyrics = cls.get_soup(html_text).get_text().strip()
# Genius embeds lyrics in JSON; escape sequences remain after parsing
lyrics = re.sub(r'\\+"', '"', lyrics)
return lyrics
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that we've sorted the encoding issue, I wonder whether this could possibly be solved by improving the pattern used by cls.remove_backslash?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@grillofran have you seen my comment by any chance?


return None

Expand Down
24 changes: 24 additions & 0 deletions test/plugins/lyrics_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,4 +576,28 @@ def backend(self) -> str:
""",
marks=[pytest.mark.xfail(reason="Tekstowo seems to be broken again")],
),
LyricsPage.make(
"https://genius.com/Caparezza-argenti-vive-lyrics",
"""
Ciao Dante, ti ricordi di me? Sono Filippo Argenti
Il vicino di casa che nella Commedia ponesti tra questi violenti
Sono quello che annega nel fango, pestato dai demoni intorno
Cos'è, vuoi provocarmi, sommo? Puoi solo provocarmi sonno!
""",
artist="Caparezza",
track_title="Argenti vive",
marks=[xfail_on_ci("Genius returns 403 FORBIDDEN in CI")],
),
LyricsPage.make(
"https://genius.com/Arctic-monkeys-r-u-mine-lyrics",
"""
I go crazy 'cause here isn't where I wanna be
And satisfaction feels like a distant memory
And I can't help myself, all I
Wanna hear her say is "Are you mine?"
""",
artist="Arctic Monkeys",
track_title="R U Mine?",
marks=[xfail_on_ci("Genius returns 403 FORBIDDEN in CI")],
),
]
Loading