|
| 1 | +import { PANDORA_LINK_REGEX } from '~/config/constants'; |
| 2 | +import { MetadataType, Parser } from '~/config/enum'; |
| 3 | +import { cacheSearchMetadata, getCachedSearchMetadata } from '~/services/cache'; |
| 4 | +import { fetchMetadata } from '~/services/metadata'; |
| 5 | +import type { SearchMetadata } from '~/services/search'; |
| 6 | +import { logger } from '~/utils/logger'; |
| 7 | +import { getCheerioDoc, linkedDataScript, metaTagContent } from '~/utils/scraper'; |
| 8 | + |
/**
 * Two-letter type codes that prefix Pandora IDs.
 *
 * The string values are the exact prefixes compared against the first two
 * characters of an ID when deciding how a Pandora page should be scraped.
 */
enum PandoraMetadataType {
  /** A single track. */
  Song = 'TR',
  /** An album. */
  Album = 'AL',
  /** An artist. */
  Artist = 'AR',
  /** Mapped to `MetadataType.Podcast`; links with this prefix carry an episode slug. */
  Podcast = 'PE',
  /** Mapped to `MetadataType.Show`; the main page of a podcast. */
  Show = 'PC',
}
| 16 | + |
/**
 * Lookup table translating a Pandora type prefix into the shared `MetadataType`.
 *
 * The explicit `Record` annotation makes the compiler require an entry for
 * every `PandoraMetadataType` member.
 */
const PANDORA_METADATA_TO_METADATA_TYPE: Readonly<Record<PandoraMetadataType, MetadataType>> = {
  [PandoraMetadataType.Song]: MetadataType.Song,
  [PandoraMetadataType.Album]: MetadataType.Album,
  [PandoraMetadataType.Artist]: MetadataType.Artist,
  [PandoraMetadataType.Podcast]: MetadataType.Podcast,
  [PandoraMetadataType.Show]: MetadataType.Show,
};
| 24 | + |
/**
 * Scrapes a Pandora page and builds the search metadata for it.
 *
 * Pandora IDs are prefixed with their type: `${two-letter-type}:${actual-id}`.
 * For Podcast and Show URLs the ID captured from the URL is the real one. For
 * Albums, Tracks and Artists the URL carries a hashed form
 * (e.g. `ALcdVpX6J57q54q` in the URL vs. `AL:49608296` actual), so the
 * canonical ID is read from the page's JSON Linked Data instead.
 *
 * @param id - ID captured from the Pandora URL. Its first two characters select
 *   the page type, and it is the key used for both the cache lookup and the
 *   cache write.
 * @param link - Pandora URL to fetch and scrape.
 * @returns The cached metadata when the lookup hits, otherwise the freshly
 *   scraped (and newly cached) metadata.
 * @throws Error prefixed with `[getPandoraMetadata] (<link>)` when the type
 *   prefix is unknown, the page lacks a title or image, or any step of
 *   fetching, scraping or caching fails.
 */
export const getPandoraMetadata = async (id: string, link: string) => {
  const cached = await getCachedSearchMetadata(id, Parser.Pandora);
  if (cached) {
    logger.info(`[Pandora] (${id}) metadata cache hit`);
    return cached;
  }

  try {
    const prefix = id.slice(0, 2);

    // Reject unknown prefixes up front so a malformed ID never costs a network request.
    const isKnownType = (Object.values(PandoraMetadataType) as string[]).includes(prefix);
    if (!isKnownType) {
      throw new Error('Unknown Pandora type (or malformed ID).');
    }
    const type = prefix as PandoraMetadataType;

    const html = await fetchMetadata(link);
    const doc = getCheerioDoc(html);

    // Starts as the URL-derived ID; music pages replace it with the canonical one below.
    let metadataId = id;
    let title, description, image;

    if (type === PandoraMetadataType.Podcast || type === PandoraMetadataType.Show) {
      // === Podcast Page ===
      // The Linked Data node is present but empty for podcast links, so fall back to og tags.

      // `og:title` is the name of the Podcast on the main Podcast page, and is
      // still the name of the Podcast (not the Episode) on an Episode page.
      const showTitle = metaTagContent(doc, 'og:title', 'property');
      if (!showTitle) {
        // Checked here because on episode pages the slug alone would otherwise
        // yield a truthy title and hide the missing tag.
        throw new Error('Pandora metadata not found');
      }

      image = metaTagContent(doc, 'og:image', 'property');

      // Podcast pages expose no description tag, so synthesize one from the show name.
      description = `Listen to the ${showTitle} podcast on Pandora.`;
      title = showTitle;

      if (type === PandoraMetadataType.Podcast) {
        // The Episode title is not available in any HEAD tag or script, so use the
        // slugified version from the URL with non-word characters turned into spaces.
        const episodeTitle = link
          .match(PANDORA_LINK_REGEX)?.[2]
          ?.replace(/[^\w]/g, ' ')
          .trim();

        // Skip the episode part entirely when the URL did not yield one.
        title = [episodeTitle, showTitle].filter(Boolean).join(' ');
      }
    } else {
      // === Music Page (Album / Artist / Song) ===
      // The JSON Linked Data script node is more straightforward for structured
      // data than regexing the og tags.
      const atts = linkedDataScript(doc);

      // Prefer the known-good ID from the linked data over the (possibly hashed)
      // URL one; `||` also falls back when `@id` is an empty string.
      metadataId = atts?.['@id'] || id;

      title = atts?.name;
      image = atts?.image;

      // There is no `og:description` tag and `twitter:description` is inconsistently
      // available, so `description` bakes in the Artist for the eventual query.
      const artistName = atts?.byArtist?.name;
      description = artistName ? [title, artistName].join(' ') : title;
    }

    if (!title || !image) {
      throw new Error('Pandora metadata not found');
    }

    const metadata = {
      id: metadataId,
      title: title.trim(),
      description,
      type: PANDORA_METADATA_TO_METADATA_TYPE[type],
      image,
    } as SearchMetadata;

    // Write under the same key the lookup above reads, otherwise pages whose
    // canonical ID differs from the URL ID would never produce a cache hit.
    await cacheSearchMetadata(id, Parser.Pandora, metadata);

    return metadata;
  } catch (err) {
    throw new Error(`[${getPandoraMetadata.name}] (${link}) ${err}`);
  }
};
| 123 | + |
/**
 * Picks the text used as the search query for a Pandora metadata record.
 *
 * Songs and albums use `description`; every other type uses `title`.
 *
 * @param metadata - Metadata record to derive the query from.
 * @returns `metadata.description` for songs and albums, otherwise `metadata.title`.
 */
export const getPandoraQueryFromMetadata = (metadata: SearchMetadata) => {
  const queriesByDescription =
    metadata.type === MetadataType.Album || metadata.type === MetadataType.Song;

  return queriesByDescription ? metadata.description : metadata.title;
};
0 commit comments