From 56638970663d8da4205b99964bb5e05afeb5acfc Mon Sep 17 00:00:00 2001 From: Zach Leatherman Date: Tue, 10 Dec 2024 15:57:25 -0600 Subject: [PATCH] Put escaped text into content directly, skip markdown conversion step --- src/Fetcher.js | 8 ++- src/Importer.js | 40 ++++++++++++--- test/sources/bluesky-test.xml | 2 + test/test.js | 97 +++++++++++++++++++++++++++++++++-- 4 files changed, 135 insertions(+), 12 deletions(-) create mode 100644 test/sources/bluesky-test.xml diff --git a/src/Fetcher.js b/src/Fetcher.js index 5422e85..6474bc1 100644 --- a/src/Fetcher.js +++ b/src/Fetcher.js @@ -18,6 +18,8 @@ const xmlParser = new XMLParser({ ignoreAttributes: false, allowBooleanAttributes: true, parseAttributeValue: true, + processEntities: false, // disable this, was causing inconsistencies in Bluesky entries + // htmlEntities: true, }); class Fetcher { @@ -58,6 +60,10 @@ class Fetcher { return base64Hash.replace(/[^A-Z0-9]/gi, "").slice(0, HASH_FILENAME_MAXLENGTH); } + static parseXml(content) { + return xmlParser.parse(content); + } + #cacheDuration = "0s"; #directoryManager; #assetsFolder = "assets"; @@ -249,7 +255,7 @@ class Fetcher { return EleventyFetch(url, opts).then(result => { if(opts.type === "xml") { - return xmlParser.parse(result); + return Fetcher.parseXml(result); } return result; diff --git a/src/Importer.js b/src/Importer.js index a022838..3994cc1 100644 --- a/src/Importer.js +++ b/src/Importer.js @@ -4,6 +4,7 @@ import fs from "graceful-fs"; import yaml from "js-yaml"; import kleur from "kleur"; import slugify from '@sindresorhus/slugify'; +import * as entities from "entities"; import { Logger } from "./Logger.js"; import { Fetcher } from "./Fetcher.js"; @@ -204,13 +205,27 @@ class Importer { } addDataOverride(type, url, data) { + let found = false; for(let source of this.getSourcesForType(type)) { source.setDataOverride(url, data); + found = true; } + + if(!found) { + throw new Error("addDataOverride(type) not found: " + type) + } + } + + static shouldUseMarkdownFileExtension(entry) { + return this.isText(entry) || this.isHtml(entry); } static shouldConvertToMarkdown(entry) { - return this.isHtml(entry) || entry.contentType === "text"; + return this.isHtml(entry); + } + + static isText(entry) { + return entry.contentType === "text"; } static isHtml(entry) { @@ -235,11 +250,13 @@ class Importer { } async getEntries(options = {}) { + let isWritingToMarkdown = options.contentType === "markdown"; + let entries = []; for(let source of this.sources) { for(let entry of await source.getEntries()) { let contentType = entry.contentType; - if(Importer.shouldConvertToMarkdown(entry) && options.contentType === "markdown") { + if(Importer.shouldUseMarkdownFileExtension(entry) && isWritingToMarkdown) { contentType = "markdown"; } @@ -257,14 +274,23 @@ class Importer { await this.fetchRelatedMedia(entry); if(Importer.isHtml(entry)) { - entry.content = await this.htmlTransformer.transform(entry.content, entry); + let decodedHtml = entities.decodeHTML(entry.content); + entry.content = await this.htmlTransformer.transform(decodedHtml, entry); } - if(Importer.shouldConvertToMarkdown(entry) && options.contentType === "markdown") { - await this.markdownService.asyncInit(); - entry.content = await this.markdownService.toMarkdown(entry.content, entry); + if(isWritingToMarkdown) { + if(Importer.isText(entry)) { + // _only_ decode newlines + entry.content = entry.content.split(" ").join("\n"); + } + + if(Importer.shouldConvertToMarkdown(entry)) { + await this.markdownService.asyncInit(); - entry.contentType = "markdown"; + entry.content = await this.markdownService.toMarkdown(entry.content, entry); + + entry.contentType = "markdown"; + } } return entry; diff --git a/test/sources/bluesky-test.xml b/test/sources/bluesky-test.xml new file mode 100644 index 0000000..cb90c61 --- /dev/null +++ b/test/sources/bluesky-test.xml @@ -0,0 +1,2 @@ + +🌍 https://zachleat.com 🎈🐀 Creator/Maintainer of @11ty.dev 🎉 Builder at Font Awesome 🏳️‍⚧️ Listen to Trans Folks 👋🏻 He/him/they 🐘 Mastodon https://zachleat.com/@zachleat ✅ Front of the Front-end ✅ Static Sites ✅ Web Components ✅ Web Performancehttps://bsky.app/profile/zachleat.com@zachleat.com - Zach Leathermanhttps://bsky.app/profile/zachleat.com/post/3lckusgtkuk2rtime to review my HTML wrapped 2024 Most used: <a> Doing work to reduce infrastructure bills: <picture> Underrated: <output> Misunderstood: <details> Tame but a small win: <search> Hope the design never calls for it: <dialog> Not today Satan: <canvas> Pure vibes: <noscript>05 Dec 2024 14:26 +0000at://did:plc:xpchjovbk6sxl3bv74z7cs54/app.bsky.feed.post/3lckusgtkuk2r diff --git a/test/test.js b/test/test.js index 57669e1..64a7dab 100644 --- a/test/test.js +++ b/test/test.js @@ -9,6 +9,11 @@ import { DataSource } from "../src/DataSource.js"; import { Persist } from "../src/Persist.js"; import { Fetcher } from "../src/Fetcher.js"; +function cleanContent(content) { + // trim extra whitespace (dirty workaround for trailing whitespace) + return content.split("\n").map(line => line.trim()).join("\n"); +} + const require = createRequire(import.meta.url); test("YouTube user", async (t) => { @@ -20,17 +25,62 @@ test("YouTube user", async (t) => { importer.addSource("youtubeUser", "UCskGTioqrMBcw8pd14_334A"); let stubContent = fs.readFileSync("./test/sources/youtube-user.xml"); - importer.addDataOverride("wordpress", "https://www.youtube.com/feeds/videos.xml?channel_id=UCskGTioqrMBcw8pd14_334A", stubContent); + importer.addDataOverride("youtube", "https://www.youtube.com/feeds/videos.xml?channel_id=UCskGTioqrMBcw8pd14_334A", Fetcher.parseXml(stubContent.toString("utf8"))); - let entries = await importer.getEntries(); + let entries = await importer.getEntries({ contentType: "markdown" }); assert.equal(entries.length, 15); let [post] = entries; + assert.deepEqual(Object.keys(post).sort(), ["authors", "content", "contentType", "date", "dateUpdated", "filePath", "title", "type", "url", "uuid"]); assert.equal(post.content.length, 812); + assert.equal(post.content, `CloudCannon is the Recommended CMS Partner of 11ty: + +https://cloudcannon.com/11tyconf/ +https://cloudcannon.com/blog/how-to-manage-hundreds-of-connected-websites-with-a-git-based-headless-cms/ + +This was a talk given at the 11ty International Symposium on Making Web Sites Real Good (2024): https://conf.11ty.dev/2024/managing-content-management/ + +If Jamstack has taught us anything, it’s that websites work best when they’re generated from folders full of flat files. Even massively interconnected websites! + +We talk through a classically Jamstacky approach to content management for large organizations: mounting shared layout and component repositories, creating a central content lake to aggregate content like news articles, and automating site builds and deployments when your content or dependencies change.`); + assert.equal(post.authors[0].name, "Eleventy"); }); +test("Bluesky posts", async (t) => { + let importer = new Importer(); + + importer.setVerbose(false); + importer.setDryRun(true); + + importer.addSource("bluesky", "zachleat.com"); + + let stubContent = fs.readFileSync("./test/sources/bluesky-test.xml"); + + importer.addDataOverride("bluesky", "https://bsky.app/profile/zachleat.com/rss", Fetcher.parseXml(stubContent.toString("utf8"))); + + let entries = await importer.getEntries({ contentType: "markdown" }); + assert.equal(entries.length, 1); + + let [post] = entries; + + assert.deepEqual(Object.keys(post).sort(), ["authors", "content", "contentType", "date", "filePath", "title", "type", "url", "uuid"]); + assert.equal(post.content.length, 323); + assert.equal(post.content, `time to review my HTML wrapped 2024 + +Most used: <a> +Doing work to reduce infrastructure bills: <picture> +Underrated: <output> +Misunderstood: <details> +Tame but a small win: <search> +Hope the design never calls for it: <dialog> +Not today Satan: <canvas> +Pure vibes: <noscript>`); + + assert.equal(post.authors[0].name, "@zachleat.com - Zach Leatherman"); +}); + test("WordPress import", async (t) => { let importer = new Importer(); @@ -51,12 +101,51 @@ test("WordPress import", async (t) => { importer.addDataOverride("wordpress", "https://blog.fontawesome.com/wp-json/wp/v2/categories/1", require("./sources/blog-awesome-categories.json")); importer.addDataOverride("wordpress", "https://blog.fontawesome.com/wp-json/wp/v2/users/155431370", require("./sources/blog-awesome-author.json")); - let entries = await importer.getEntries(); + let entries = await importer.getEntries({ contentType: "markdown" }); assert.equal(entries.length, 1); let [post] = entries; assert.deepEqual(Object.keys(post).sort(), ["authors", "content", "contentType", "date", "dateUpdated", "filePath", "metadata", "status", "title", "type", "url", "uuid"]); - assert.equal(post.content.length, 6134); + + assert.equal(cleanContent(post.content), `We’re so close to launching version 6, and we figured it was high time to make an official announcement. So, save the date for February. Font Awesome 6 will go beyond pure icon-imagination! + +![](assets/image-calendar-exclamation-2-eKNZqhhuChge.png) + +Save the date! February 2022 is just around the corner! + +So, what’s new? + +* * * + +## More Icons + +Font Awesome 6 contains over 7,000 new icons, so you’re sure to find what you need for your project. Plus, we’ve redesigned most of our icons from scratch, so they’re more consistent and easier to use. + +![](assets/image-icons-2-66KjmgCOuZQw.png) + +* * * + +## More Styles + +Font Awesome 6 includes five icons styles: solid, regular, light, duotone, and the new THIN style — not to mention all of our brand icons. And coming later in 2022 is the entirely new SHARP family of styles. + +![](assets/image-styles-2-SNjQOsXaJuRQ.png) + +* * * + +## More Ways to Use + +Font Awesome 6 makes it even easier to use icons where you want to. More plugins and packages to match your stack. Less time wrestling browser rendering. + +![](assets/image-awesome-2-1AOLfzrlbkMJ.png) + +* * * + +We’ll keep fine-tuning that sweet, sweet recipe until February. Believe us; the web’s going to have a new scrumpdillyicious secret ingredient! + +[Check Out the Beta!](https://fontawesome.com/v6.0)`); + + assert.equal(post.content.length, 1304); assert.equal(post.authors[0].name, "Matt Johnson"); });