Skip to content
3 changes: 3 additions & 0 deletions src/Crawler/ContentProcessor/BaseProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use Crawler\Export\Utils\OfflineUrlConverter;
use Crawler\FoundUrls;
use Crawler\ParsedUrl;
use Crawler\Utils;

abstract class BaseProcessor implements ContentProcessor
{
Expand Down Expand Up @@ -69,6 +70,8 @@ public function setDebugMode(bool $debugMode): void
*/
public function convertUrlToRelative(ParsedUrl $parsedBaseUrl, string $targetUrl, ?string $attribute = null): string
{
// normalize URL before parsing so that it matches what is used on FoundUrl
$targetUrl = Utils::normalizeUrl($targetUrl, $parsedBaseUrl->getFullUrl());
$urlConverter = new OfflineUrlConverter(
$this->crawler->getInitialParsedUrl(),
$parsedBaseUrl,
Expand Down
198 changes: 177 additions & 21 deletions src/Crawler/ContentProcessor/HtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ public function applyContentChangesForOfflineVersion(string &$content, int $cont
*/
private function updateHtmlPathsToRelative(string $html, ParsedUrl $parsedBaseUrl): string
{
$patternHrefSrc = '/(\.|<[a-z0-9]{1,10}[^>]*\s+)(href|src|component-url)\s*(=)\s*([\'"]?)([^\'">]+)\4([^>]*)/is';
$patternSrcset = '/(\.|<[a-z0-9]{1,10}[^>]*\s+)(imagesrcset|srcset|renderer-url)\s*(=)\s*([\'"]?)([^\'">]+)\4([^>]*)/is';
$patternHrefSrc = '/(\.|<[a-z0-9]{1,10}[^>]*\s+)(href|src|component-url)\s*(=)\s*({{quote}})({{no_quote}}[^{{quote}}{{quote_space}}>]+){{quote}}([^>]*)/is';
$patternSrcset = '/(\.|<[a-z0-9]{1,10}[^>]*\s+)(imagesrcset|srcset|renderer-url)\s*(=)\s*({{quote}})({{no_quote}}[^{{quote}}{{quote_space}}>]+){{quote}}([^>]*)/is';
$patternMetaUrl = '/(<meta[^>]*)(url)\s*(=)\s*([\'"]?)([^\'">]+)\4(")/im';
$escapedHref = '/(.)(href\\\\["\']|src\\\\["\'])([:=])(\\\\["\'])([^"\'\\\\]+)\\\\["\'](.)/is';

Expand Down Expand Up @@ -201,9 +201,9 @@ private function updateHtmlPathsToRelative(string $html, ParsedUrl $parsedBaseUr
return $start . $attributeRaw . $assignmentChar . $quote . $newValue . $quote . $end;
};

$html = preg_replace_callback($patternHrefSrc, $replaceCallback, $html);
$html = $this->pregPatternsReplaceCallback($patternHrefSrc, $replaceCallback, $html);
$html = $this->pregPatternsReplaceCallback($patternMetaUrl, $replaceCallback, $html);
$html = preg_replace_callback($patternSrcset, $replaceCallback, $html);
$html = preg_replace_callback($patternMetaUrl, $replaceCallback, $html);
$html = preg_replace_callback($escapedHref, $replaceCallback, $html);

return $html;
Expand All @@ -219,9 +219,10 @@ private function updateHtmlPathsToRelative(string $html, ParsedUrl $parsedBaseUr
*/
private function findHrefUrls(string $html, ParsedUrl $sourceUrl, FoundUrls $foundUrls, string $regexForHtmlExtensions): void
{
preg_match_all('/<a[^>]*\shref=["\']?([^#][^"\'\s>]+)["\'\s]?[^>]*>/is', $html, $matches);
$foundUrlsTxt = $matches[1];
$urls = $this->pregPatternsMatchAll('/<a[^>]*\shref={{quote}}((?!#){{no_quote}}[^{{quote}}{{quote_space}}>]+){{quote}}[^>]*>/is', $html);
$foundUrlsTxt = $urls;

// TODO: Document what this is trying to do before converting it to ->pregPatternsMatchAll
preg_match_all('/href\\\\["\'][:=]\\\\["\'](https?:\/\/[^"\'\\\\]+)\\\\["\']/i', $html, $matches);
$foundUrlsTxt = array_merge($foundUrlsTxt, $matches[1] ?? []);

Expand Down Expand Up @@ -264,12 +265,167 @@ private function findFonts(string $html, ParsedUrl $sourceUrl, FoundUrls $foundU
$sourceUrlWithoutFragment = $sourceUrl->getFullUrl(true, false);

// CSS @font-face
preg_match_all("/url\s*\(\s*['\"]?([^'\"\s>]+\.(eot|ttf|woff2|woff|otf)[^'\")]*)['\"]?\s*\)/is", $html, $matches);
$foundUrls->addUrlsFromTextArray($matches[1], $sourceUrlWithoutFragment, FoundUrl::SOURCE_CSS_URL);
$urls = $this->pregPatternsMatchAll('/url\s*\(\s*{{quote}}({{no_quote}}[^{{quote}}{{quote_space}}\)]+\.{{extensions:eot|ttf|woff2?|otf:\)}}){{quote_assert:\)}}[^\)]*\)/is', $html);
$foundUrls->addUrlsFromTextArray($urls, $sourceUrlWithoutFragment, FoundUrl::SOURCE_CSS_URL);

// <link href="...(eot|ttf|woff2|woff|otf)
preg_match_all('/<link\s+[^>]*href=["\']?([^"\' ]+\.(eot|ttf|woff2|woff|otf)[^"\' ]*)["\']?[^>]*>/is', $html, $matches);
$foundUrls->addUrlsFromTextArray($matches[1], $sourceUrlWithoutFragment, FoundUrl::SOURCE_LINK_HREF);
$urls = $this->pregPatternsMatchAll('/<link\s+[^>]*href={{quote}}({{no_quote}}[^{{quote}}{{quote_space}}>]+\.{{extensions:eot|ttf|woff2?|otf:>}}){{quote_assert:>}}[^>]*>/is', $html);
$foundUrls->addUrlsFromTextArray($urls, $sourceUrlWithoutFragment, FoundUrl::SOURCE_LINK_HREF);
}

/**
 * Expands a pseudo-regex template into concrete PCRE patterns, one per
 * attribute quoting style (double-quoted, single-quoted and, optionally,
 * unquoted).
 *
 * **Placeholders and their expansion:**
 *
 * - `{{quote}}` - `"` or `'` for the quoted variants, empty for the
 *   unquoted variant.
 * - `{{quote_space}}` - empty for the quoted variants, a single space for
 *   the unquoted variant (so a space ends an unquoted value).
 * - `{{no_quote}}` - empty for the quoted variants, the lookahead
 *   `(?!["'])` for the unquoted variant (the value must not start with a
 *   quote character).
 * - `{{extensions:<alternatives>:<chars>}}` - macro expanded before all other
 *   placeholders to
 *   `(?:<alternatives>)(?:\?[^{{quote}}{{quote_space}}<chars>]*)?`,
 *   i.e. one of the extensions optionally followed by a query string.
 * - `{{quote_assert:<chars>}}` - empty for the quoted variants, the lookahead
 *   `(?=[\s<chars>])` for the unquoted variant.
 *
 * Example: `[^{{quote}}{{quote_space}}>]` becomes `[^">]` / `[^'>]` for the
 * quoted variants and `[^ >]` for the unquoted variant.
 *
 * When the template contains no `{{quote}}` placeholder, a single pattern is
 * returned in which every remaining placeholder is replaced by an empty
 * string.
 *
 * NOTE(review): for the quoted variants `{{quote_assert:...}}` expands to
 * nothing, so templates that rely on it do not require the closing quote
 * right after the extension - confirm that this is intended.
 *
 * @param string $pattern_template Pseudo regex template with {{placeholders}}
 * @param array|null $options 'unquoted' => bool (default: true); null is
 *                            treated like an empty array
 * @return array Generated regex patterns (double-quoted, single-quoted,
 *               then unquoted when enabled)
 */
function pregPatternsGenerate(string $pattern_template, ?array $options = []): array
{
    // The signature allows null, but `null + array` throws a TypeError.
    $options = ($options ?? []) + [
        'unquoted' => true,
    ];

    // The macro must be expanded first: its expansion itself contains
    // {{quote}} and {{quote_space}}, which are resolved per variant below.
    $template = preg_replace_callback(
        '/{{extensions:([^:]+):([^}]+)}}/',
        static function (array $matches): string {
            return '(?:' . $matches[1] . ')(?:\?[^{{quote}}{{quote_space}}' . $matches[2] . ']*)?';
        },
        $pattern_template
    ) ?? $pattern_template;

    $pattern_search = [
        '/{{quote}}/',
        '/{{quote_space}}/',
        '/{{no_quote}}/',
        '/{{quote_assert:([^}]+)}}/',
    ];

    if (strpos($template, '{{quote}}') === false) {
        // No quoting placeholder: a single pattern with placeholders stripped.
        $variants = [
            ['', '', '', ''],
        ];
    } else {
        $variants = [
            ['"', '', '', ''],
            ["'", '', '', ''],
        ];

        if ($options['unquoted']) {
            // `\1` is the <chars> argument captured from {{quote_assert:<chars>}}.
            $variants[] = ['', ' ', '(?!["\'])', '(?=[\s\1])'];
        }
    }

    $patterns = [];
    foreach ($variants as $replace) {
        $patterns[] = preg_replace($pattern_search, $replace, $template);
    }

    return $patterns;
}


/**
 * Wrapper for preg_match_all() that runs every pattern generated by
 * pregPatternsGenerate() against the subject and merges the results.
 *
 * Like preg_match_all(), $matches is always overwritten: anything the
 * caller's variable held before the call is discarded. Matches of later
 * patterns are placed before matches of earlier patterns. A pattern whose
 * preg_match_all() call fails is skipped.
 *
 * @param string $pattern_template Pseudo regex template with {{placeholders}}
 * @param string $subject String to search in
 * @param array $options 'return' => int|null (capture group to return,
 *                       default: 1; null returns an empty array),
 *                       'unquoted' => bool
 * @param array|null $matches Reference to store merged matches (optional)
 * @param int $flags preg_match_all flags
 * @param int $offset Search offset
 * @return array Merged matches from the requested capture group
 */
function pregPatternsMatchAll(string $pattern_template, string $subject, array $options = [], ?array &$matches = null, int $flags = 0, int $offset = 0): array
{
    $options = $options + [
        'return' => 1,
    ];

    // Never merge into stale data from the caller's variable.
    $matches = null;
    $setOrder = ($flags & PREG_SET_ORDER) !== 0;

    foreach ($this->pregPatternsGenerate($pattern_template, $options) as $pattern) {
        $single_matches = [];
        if (preg_match_all($pattern, $subject, $single_matches, $flags, $offset) === false) {
            // Broken pattern or PCRE limit hit: ignore this variant only.
            continue;
        }

        if ($matches === null) {
            $matches = $single_matches;
            continue;
        }

        if ($setOrder) {
            // In set order every element is one complete match.
            $matches = array_merge($single_matches, $matches);
            continue;
        }

        // Pattern order: merge group by group, tolerating missing groups.
        foreach ($single_matches as $group => $values) {
            $matches[$group] = array_merge($values, $matches[$group] ?? []);
        }
    }

    $matches = $matches ?? [];

    $idx = $options['return'];
    if ($idx === null) {
        return [];
    }

    if ($setOrder) {
        return array_column($matches, $idx);
    }

    return empty($matches[$idx]) ? [] : $matches[$idx];
}

/**
 * Wrapper for preg_replace() that applies every pattern generated by
 * pregPatternsGenerate() to the subject, one after another.
 *
 * If preg_replace() fails for a pattern (returns null), that pattern is
 * skipped and the subject is left as it was.
 *
 * @param string $pattern_template Pseudo regex template with {{placeholders}}
 * @param string $replacement Replacement string
 * @param string $subject String to perform replacements on
 * @param array $options 'unquoted' => bool
 * @param int $limit Maximum replacements per pattern (-1 for unlimited)
 * @param int $count Receives the total number of replacements over all patterns
 * @return string Modified string
 */
function pregPatternsReplace(string $pattern_template, string $replacement, string $subject, array $options = [], int $limit = -1, int &$count = 0): string
{
    $total = 0;

    foreach ($this->pregPatternsGenerate($pattern_template, $options) as $pattern) {
        // preg_replace() overwrites its count argument, so count per pattern
        // and add the counts up.
        $replaced = 0;
        $result = preg_replace($pattern, $replacement, $subject, $limit, $replaced);
        if ($result === null) {
            // Keep the subject instead of violating the string return type.
            continue;
        }

        $subject = $result;
        $total += $replaced;
    }

    $count = $total;

    return $subject;
}

/**
 * Wrapper for preg_replace_callback() that applies every pattern generated
 * by pregPatternsGenerate() to the subject, one after another.
 *
 * If preg_replace_callback() fails for a pattern (returns null), that
 * pattern is skipped and the subject is left as it was.
 *
 * @param string $pattern_template Pseudo regex template with {{placeholders}}
 * @param callable $callback Callback function for replacements
 * @param string $subject String to perform replacements on
 * @param array $options 'unquoted' => bool
 * @param int $limit Maximum replacements per pattern (-1 for unlimited)
 * @param int $count Receives the total number of replacements over all patterns
 * @return string Modified string
 */
function pregPatternsReplaceCallback(string $pattern_template, callable $callback, string $subject, array $options = [], int $limit = -1, int &$count = 0): string
{
    $total = 0;

    foreach ($this->pregPatternsGenerate($pattern_template, $options) as $pattern) {
        // preg_replace_callback() overwrites its count argument, so count
        // per pattern and add the counts up.
        $replaced = 0;
        $result = preg_replace_callback($pattern, $callback, $subject, $limit, $replaced);
        if ($result === null) {
            // Keep the subject instead of violating the string return type.
            continue;
        }

        $subject = $result;
        $total += $replaced;
    }

    $count = $total;

    return $subject;
}

/**
Expand All @@ -283,24 +439,24 @@ private function findImages(string $html, ParsedUrl $sourceUrl, FoundUrls $found
$sourceUrlWithoutFragment = $sourceUrl->getFullUrl(true, false);

// <img src="..."
preg_match_all('/<img\s+[^>]*?src=["\']?([^"\'> ]+)["\']?[^>]*>/is', $html, $matches);
$foundUrls->addUrlsFromTextArray($matches[1], $sourceUrlWithoutFragment, FoundUrl::SOURCE_IMG_SRC);
$urls = $this->pregPatternsMatchAll('/<img\s+[^>]*?src={{quote}}({{no_quote}}[^{{quote}}{{quote_space}}>]+){{quote}}[^>]*>/is' , $html);
$foundUrls->addUrlsFromTextArray($urls, $sourceUrlWithoutFragment, FoundUrl::SOURCE_IMG_SRC);

// <input src="..."
preg_match_all('/<input\s+[^>]*?src=["\']?([^"\'> ]+\.[a-z0-9]{1,10})["\']?[^>]*>/is', $html, $matches);
$foundUrls->addUrlsFromTextArray($matches[1], $sourceUrlWithoutFragment, FoundUrl::SOURCE_INPUT_SRC);
$urls = $this->pregPatternsMatchAll('/<input\s+[^>]*?src={{quote}}({{no_quote}}[^{{quote}}{{quote_space}}>]+){{quote}}[^>]*>/is', $html);
$foundUrls->addUrlsFromTextArray($urls, $sourceUrlWithoutFragment, FoundUrl::SOURCE_INPUT_SRC);

// <link href="...(png|gif|jpg|jpeg|webp|avif|tif|bmp|svg)"
preg_match_all('/<link\s+[^>]*?href=["\']?([^"\'> ]+\.(png|gif|jpg|jpeg|webp|avif|tif|bmp|svg|ico)(|\?[^"\' ]))["\']?[^>]*>/is', $html, $matches);
$foundUrls->addUrlsFromTextArray($matches[1], $sourceUrlWithoutFragment, FoundUrl::SOURCE_LINK_HREF);
$urls = $this->pregPatternsMatchAll('/<link\s+[^>]*?href={{quote}}({{no_quote}}[^{{quote}}{{quote_space}}>]+\.{{extensions:png|gif|jpe?g|webp|avif|tiff?|bmp|svg|ico:>}}){{quote_assert:>}}[^>]*>/is', $html);
$foundUrls->addUrlsFromTextArray($urls, $sourceUrlWithoutFragment, FoundUrl::SOURCE_LINK_HREF);

// <source src="..."
preg_match_all('/<source\s+[^>]*?src=["\']([^"\'>]+)["\'][^>]*>/is', $html, $matches);
$foundUrls->addUrlsFromTextArray($matches[1], $sourceUrlWithoutFragment, FoundUrl::SOURCE_SOURCE_SRC);
$urls = $this->pregPatternsMatchAll('/<source\s+[^>]*?src={{quote}}({{no_quote}}[^{{quote}}{{quote_space}}>]+){{quote}}[^>]*>/is', $html);
$foundUrls->addUrlsFromTextArray($urls, $sourceUrlWithoutFragment, FoundUrl::SOURCE_SOURCE_SRC);

// CSS url()
preg_match_all("/url\s*\(\s*['\"]?([^'\")]+\.(jpg|jpeg|png|gif|bmp|tif|webp|avif)[^'\")]*)['\"]?\s*\)/is", $html, $matches);
$foundUrls->addUrlsFromTextArray($matches[1], $sourceUrlWithoutFragment, FoundUrl::SOURCE_CSS_URL);
$urls = $this->pregPatternsMatchAll('/url\s*\(\s*{{quote}}({{no_quote}}[^{{quote}}{{quote_space}}\)]+\.{{extensions:png|gif|jpe?g|webp|avif|tiff?|bmp|svg|ico:\)}}){{quote_assert:\)}}[^\)]*\)/is', $html);
$foundUrls->addUrlsFromTextArray($urls, $sourceUrlWithoutFragment, FoundUrl::SOURCE_CSS_URL);

// <picture><source srcset="..."><img src="..."></picture>
// <img srcset="..."
Expand Down Expand Up @@ -406,7 +562,7 @@ private function findScripts(string $html, ParsedUrl $sourceUrl, FoundUrls $foun
*/
private function findStylesheets(string $html, ParsedUrl $sourceUrl, FoundUrls $foundUrls): void
{
preg_match_all('/<link\s+[^>]*?href=["\']([^"\']+)["\'][^>]*>/is', $html, $matches);
$this->pregPatternsMatchAll('/<link\s+[^>]*?href={{quote}}({{no_quote}}[^{{quote}}{{quote_space}}>]+){{quote}}[^>]*>/is', $html, ['unquoted' => false, 'return' => null], $matches);
foreach ($matches[0] as $key => $match) {
if (stripos($match, 'rel=') !== false && stripos($match, 'stylesheet') === false) {
unset($matches[0][$key]);
Expand Down
14 changes: 12 additions & 2 deletions src/Crawler/Export/OfflineWebsiteExporter.php
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ class OfflineWebsiteExporter extends BaseExporter implements Exporter

protected bool $ignoreStoreFileError = false;

/**
* Convert all filenames to lowercase for offline export
* @var bool
*/
protected bool $offlineExportLowercase = false;

/**
* Replace HTML/JS/CSS content with `xxx -> bbb` or regexp in PREG format: `/card[0-9]/ -> card`
*
Expand Down Expand Up @@ -103,6 +109,9 @@ public function export(): void
// user-defined replaceQueryString will deactivate replacing query string with hash and use custom replacement
OfflineUrlConverter::setReplaceQueryString($this->replaceQueryString);

// set lowercase option for all URL conversions
OfflineUrlConverter::setLowercase($this->offlineExportLowercase);

// filter only relevant URLs with OK status codes
$exportedUrls = array_filter($visitedUrls, function (VisitedUrl $visitedUrl) {
return in_array($visitedUrl->statusCode, [200, 201, 301, 302, 303, 308]);
Expand Down Expand Up @@ -195,7 +204,7 @@ private function storeFile(VisitedUrl $visitedUrl): void
// same logic is in method convertUrlToRelative()
$storeFilePath = sprintf('%s/%s',
$this->offlineExportDirectory,
OfflineUrlConverter::sanitizeFilePath($this->getRelativeFilePathForFileByUrl($visitedUrl), false)
OfflineUrlConverter::sanitizeFilePath($this->getRelativeFilePathForFileByUrl($visitedUrl), false, $this->offlineExportLowercase)
);

$directoryPath = dirname($storeFilePath);
Expand Down Expand Up @@ -282,7 +291,7 @@ private function getRelativeFilePathForFileByUrl(VisitedUrl $visitedUrl): string
$visitedUrl->contentType === Crawler::CONTENT_TYPE_ID_IMAGE ? 'src' : 'href'
);

$relativeUrl = $urlConverter->convertUrlToRelative(false);
$relativeUrl = $urlConverter->convertUrlToRelative(false, $this->offlineExportLowercase);
$relativeTargetUrl = $urlConverter->getRelativeTargetUrl();
$relativePath = '';

Expand Down Expand Up @@ -335,6 +344,7 @@ public static function getOptions(): Options
new Option('--offline-export-no-auto-redirect-html', null, 'offlineExportNoAutoRedirectHtml', Type::BOOL, false, "Disable automatic creation of redirect HTML files for subfolders that contain an index.html file. This solves situations for URLs where sometimes the URL ends with a slash, sometimes it doesn't.", false, false),
new Option('--replace-content', null, 'replaceContent', Type::REPLACE_CONTENT, true, "Replace HTML/JS/CSS content with `foo -> bar` or regexp in PREG format: `/card[0-9]/i -> card`", null, true, true),
new Option('--replace-query-string', null, 'replaceQueryString', Type::REPLACE_CONTENT, true, "Instead of using a short hash instead of a query string in the filename, just replace some characters. You can use simple format 'foo -> bar' or regexp in PREG format, e.g. '/([a-z]+)=([^&]*)(&|$)/i -> $1__$2'", null, true, true),
new Option('--offline-export-lowercase', null, 'offlineExportLowercase', Type::BOOL, false, 'Convert all filenames to lowercase for offline export. Useful for case-insensitive filesystems.', false, false),
new Option('--ignore-store-file-error', null, 'ignoreStoreFileError', Type::BOOL, false, 'Ignores any file storing errors. The export process will continue.', false, false),
]));
return $options;
Expand Down
Loading