Skip to content

Commit 1792560

Browse files
authored
Merge pull request #5 from autogram-is/ESM
Optional filter for UrlSet classes
2 parents cb772ee + 1852061 commit 1792560

11 files changed

+255
-234
lines changed

README.md

+48-1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,51 @@ URL Tools is a helper library whose sole purpose is making that process just a l
77
- `ParsedUrl`, a wrapper for the standard WHATWG `URL` class that mixes in the domain and subdomain parsing from [`tldts`](https://www.npmjs.com/package/tldts), and provides a convenience serializer that preserves the object's individual properties, rather than flattening it to an href the way `URL.toJSON()` does.
88
- A collection of helper functions for URL filtering and normalizing operations, including sorting querystring parameters, stripping social sharing cruft, remapping 'ww1', 'ww2', etc. subdomains to a single canonical one, identifying web vs. non-web URLs, flagging URLs on public hosting like S3, and more.
99
- `NormalizedUrl`, a version of `ParsedUrl` that applies one of those normalizer functions automatically on instantiation. Functions that accept a `URL` or `ParsedURL` as a parameter can use `if (url instanceof NormalizedUrl)` to determine whether a given incoming URL has already been normalized. Yay.
10-
- `UrlSet`, `ParsedUrlSet`, and `NormalizedUrlSet`, a trio of Set classes that store, de-duplicate, and normalize piles of Urls in bulk. It's a bit fussy with bulk adding of relative URLs, but you can pass in a 'fallback base url' that helps in some circumstances.
10+
- `UrlSet`, `ParsedUrlSet`, and `NormalizedUrlSet`, a trio of Set classes that store, de-duplicate, filter, and normalize piles of Urls in bulk. It's a bit fussy with bulk adding of relative URLs, but you can pass in a 'fallback base url' that helps in some circumstances.
11+
12+
## Installation
13+
14+
`npm install @autogram/url-tools`
15+
16+
## Usage
17+
18+
`UrlSet` is the simplest example; toss URLs at it, and parsed URLs come out. Any that couldn't be parsed can be found in the `urlSet.unparsable` property.
19+
20+
```
21+
import { UrlSet } from '@autogram/url-tools';
22+
const rawUrls = [
23+
'http://example.com',
24+
'https://127.0.0.1',
25+
'tel:1-800-555-1212',
26+
'definitely-not-a-url'
27+
];
28+
29+
const us = new UrlSet(rawUrls);
30+
for (const url of us) {
31+
console.log(url.href);
32+
}
33+
// 'http://example.com', 'https://127.0.0.1', 'tel:1-800-555-1212'
34+
35+
console.log([...us.unparsable]); // ['definitely-not-a-url']
36+
```
37+
38+
Both `ParsedUrlSet` and `NormalizedUrlSet` can accept a `UrlFilter` function in their constructor options; incoming URLs rejected by that function are shunted to the Set's `parsedUrlSet.rejected` property and not added to the Set proper.
39+
40+
`NormalizedUrlSet` can rely on NormalizedUrl's aggressive defaults, or you can pass in a UrlMutator function to use as an override.
41+
42+
```
43+
import { NormalizedUrlSet, UrlFilters, UrlMutators } from '@autogram/url-tools';
44+
const options = {
45+
filter: UrlFilters.isValidWebUrl,
46+
normalizer: (u) => UrlMutators.forceProtocol(u, 'https')
47+
}
48+
49+
const ns = new NormalizedUrlSet(rawUrls);
50+
for (const n of ns) {
51+
console.log(n.href);
52+
}
53+
// 'http://example.com', 'https://127.0.0.1'
54+
55+
console.log([...ns.unparsable]); // ['definitely-not-a-url']
56+
console.log([...ns.rejected]); // ['tel:1-800-555-1212']
57+
```

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@autogram/url-tools",
3-
"version": "2.0.0",
3+
"version": "2.1.0",
44
"description": "Helper classes for validating, filtering, and normalizing URLs in bulk",
55
"exports": "./dist/source/index.js",
66
"types": "./dist/source/index.d.ts",

source/filters.ts

+35-44
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,44 @@
11
import { ParsedUrl } from './parsed-url.js';
22
import { regExpFromStringMatch, StringMatch } from './index.js';
33

4-
type UrlFilter = (url: ParsedUrl, options?: Record<string, unknown>) => boolean;
4+
/**
 * Reports whether a URL uses one of the web protocols.
 *
 * @param url - The URL whose `protocol` property is inspected.
 * @returns `true` when `url.protocol` is exactly `'http:'` or `'https:'`.
 */
export const isWebProtocol = function (url: ParsedUrl): boolean {
  const scheme = url.protocol;
  return scheme === 'http:' || scheme === 'https:';
};
58

6-
type MatchesPatternOptions = { pattern: StringMatch; property: string };
9+
/**
 * Reports whether a URL carries any embedded credentials.
 *
 * @param url - The URL whose `username` and `password` are inspected.
 * @returns `true` when the concatenation of `username` and `password` is
 *   non-empty, i.e. at least one of them is set.
 */
export const isAuthenticated = function (url: ParsedUrl): boolean {
  const credentials = url.username + url.password;
  return credentials.length !== 0;
};
712

8-
const UrlFilters = {
9-
IsWebProtocol(url: ParsedUrl): boolean {
10-
const webProtocols = ['http:', 'https:'];
11-
return webProtocols.includes(url.protocol);
12-
},
13-
IsAuthenticated(url: ParsedUrl): boolean {
14-
return (url.username + url.password).length > 0;
15-
},
16-
HasPublicSuffix(url: ParsedUrl): boolean {
17-
return url.publicSuffix.length > 0;
18-
},
19-
MatchesPattern(
20-
url: ParsedUrl,
21-
options: Partial<MatchesPatternOptions> = {},
22-
): boolean {
23-
options = {
24-
pattern: [],
25-
property: 'href',
26-
...options,
27-
};
13+
/**
 * Reports whether a URL has a non-empty public suffix.
 *
 * @param url - The URL whose `publicSuffix` property is inspected.
 * @returns `true` when `url.publicSuffix` has at least one character.
 */
export const hasPublicSuffix = function (url: ParsedUrl): boolean {
  const { publicSuffix } = url;
  return publicSuffix.length !== 0;
};
2816

29-
if (options.property && options.property in url) {
30-
const match = regExpFromStringMatch(options.pattern);
31-
return match.test(url.properties[options.property].toString());
32-
}
17+
export const matchesPattern = function (
18+
url: ParsedUrl,
19+
pattern: StringMatch = [],
20+
property = 'href',
21+
): boolean {
22+
if (pattern && property in url) {
23+
const match = regExpFromStringMatch(pattern);
24+
return match.test(url.properties[property].toString());
25+
}
3326

34-
return false;
35-
},
36-
IsSocialShareLink(url: ParsedUrl): boolean {
37-
return (
38-
(url.domain === 'twitter.com' &&
39-
url.pathname.startsWith('/intent/tweet')) || // Share links
40-
(url.domain === 'pinterest.com' &&
41-
url.pathname.startsWith('/pin/create/button')) ||
42-
(url.domain === 'linkedin.com' &&
43-
url.pathname.startsWith('/shareArticle')) ||
44-
(url.domain === 'reddit.com' && url.pathname.startsWith('/submit')) ||
45-
(url.domain === 'tumblr.com' &&
46-
url.pathname.startsWith('/widgets/share')) ||
47-
(url.domain === 'facebook.com' &&
48-
url.pathname.startsWith('/sharer/sharer.php'))
49-
);
50-
},
27+
return false;
5128
};
5229

53-
export { UrlFilter, UrlFilters };
30+
/**
 * Reports whether a URL points at a known social "share this" endpoint.
 *
 * A URL matches when its `domain` equals one of the listed domains and its
 * `pathname` starts with that domain's share-link path prefix.
 *
 * @param url - The URL whose `domain` and `pathname` are inspected.
 * @returns `true` when the URL matches one of the listed share endpoints.
 */
export const isSocialShareLink = function (url: ParsedUrl): boolean {
  // [domain, path prefix] pairs identifying share links.
  const shareEndpoints: Array<[string, string]> = [
    ['twitter.com', '/intent/tweet'],
    ['pinterest.com', '/pin/create/button'],
    ['linkedin.com', '/shareArticle'],
    ['reddit.com', '/submit'],
    ['tumblr.com', '/widgets/share'],
    ['facebook.com', '/sharer/sharer.php'],
  ];

  return shareEndpoints.some(
    ([domain, pathPrefix]) =>
      url.domain === domain && url.pathname.startsWith(pathPrefix),
  );
};

source/index.ts

+5-6
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
export { UrlMutator, UrlMutators } from './mutations.js';
2-
export { UrlFilter, UrlFilters } from './filters.js';
3-
export { ParsedUrl } from './parsed-url.js';
1+
export * as UrlMutators from './mutators.js';
2+
export * as UrlFilters from './filters.js';
3+
4+
export { ParsedUrl, UrlFilter, UrlMutator } from './parsed-url.js';
45
export { UrlSet, ParsedUrlSet, NormalizedUrlSet } from './url-set.js';
56
export { NormalizedUrl } from './normalized-url.js';
67

78
export type StringMatch = string | string[] | RegExp;
8-
export const regExpFromStringMatch = function (
9-
pattern: StringMatch
10-
): RegExp {
9+
export const regExpFromStringMatch = function (pattern: StringMatch): RegExp {
1110
if (typeof pattern === 'string') {
1211
return new RegExp(pattern);
1312
}

source/mutations.ts

-120
This file was deleted.

source/mutators.ts

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import { ParsedUrl } from './parsed-url.js';
2+
import { StringMatch, regExpFromStringMatch } from './index.js';
3+
4+
/**
 * Applies every mutator in this module, with its default arguments, to a URL.
 *
 * The steps run in a fixed order: lowercase hostname, force protocol, strip
 * authentication, strip anchor, strip index pages, strip port, strip
 * subdomains, strip trailing slash, strip query parameters, and finally sort
 * query parameters.
 *
 * @param url - The URL to normalize; it is passed through each mutator in turn.
 * @returns The value returned by the last mutator in the chain.
 */
export const defaultNormalizer = function (url: ParsedUrl): ParsedUrl {
  // Order matters: each step receives the result of the previous one.
  const pipeline: Array<(u: ParsedUrl) => ParsedUrl> = [
    (u) => forceLowercaseHostname(u),
    (u) => forceProtocol(u),
    (u) => stripAuthentication(u),
    (u) => stripAnchor(u),
    (u) => stripIndexPages(u),
    (u) => stripPort(u),
    (u) => stripSubdomains(u),
    (u) => stripTrailingSlash(u),
    (u) => stripQueryParameters(u),
    (u) => sortQueryParameters(u),
  ];

  let current = url;
  for (const step of pipeline) {
    current = step(current);
  }

  return current;
};
17+
18+
/**
 * Rewrites a URL's hostname in lower case.
 *
 * @param url - The URL to modify in place.
 * @returns The same `url` object, with `hostname` replaced by its
 *   `toLowerCase()` form.
 */
export const forceLowercaseHostname = function (url: ParsedUrl): ParsedUrl {
  const lowered = url.hostname.toLowerCase();
  url.hostname = lowered;
  return url;
};
22+
23+
/**
 * Overwrites a URL's protocol.
 *
 * @param url - The URL to modify in place.
 * @param options - The protocol value assigned to `url.protocol`;
 *   defaults to `'https'`.
 * @returns The same `url` object after the assignment.
 */
export const forceProtocol = function (
  url: ParsedUrl,
  options = 'https',
): ParsedUrl {
  const protocol = options;
  url.protocol = protocol;
  return url;
};
30+
31+
/**
 * Removes a trailing "index page" file name from a URL's path.
 *
 * Only a complete final path segment is removed: the path must end with one
 * of the index names, and that name must be preceded by `/` (or make up the
 * whole path). So `/blog/index.html` becomes `/blog/`, while
 * `/myindex.html` is left untouched.
 *
 * @param url - The URL to modify in place.
 * @param indexes - File names treated as index pages; defaults to
 *   `index.htm`, `index.html`, `default.aspx` and `index.php`.
 * @returns The same `url` object.
 */
export const stripIndexPages = function (
  url: ParsedUrl,
  indexes?: string[],
): ParsedUrl {
  const candidates = indexes ?? [
    'index.htm',
    'index.html',
    'default.aspx',
    'index.php',
  ];
  const path = url.pathname;

  for (const name of candidates) {
    if (name.length === 0 || !path.endsWith(name)) continue;

    const cut = path.length - name.length;
    // Require a segment boundary so "/myindex.html" is not mangled into "/my".
    if (cut === 0 || path[cut - 1] === '/') {
      // Slice off the tail rather than `replace()`, which would remove the
      // first occurrence of the name instead of the trailing one.
      url.pathname = path.slice(0, cut);
      break;
    }
  }

  return url;
};
44+
45+
/**
 * Clears a URL's fragment.
 *
 * @param url - The URL to modify in place.
 * @returns The same `url` object, with `hash` set to the empty string.
 */
export const stripAnchor = function (url: ParsedUrl): ParsedUrl {
  const emptyFragment = '';
  url.hash = emptyFragment;
  return url;
};
49+
50+
/**
 * Clears any credentials embedded in a URL.
 *
 * @param url - The URL to modify in place.
 * @returns The same `url` object, with `username` and then `password` set to
 *   the empty string.
 */
export const stripAuthentication = function (url: ParsedUrl): ParsedUrl {
  // Object.assign performs ordinary property assignment, in key order.
  return Object.assign(url, { username: '', password: '' });
};
55+
56+
/**
 * Clears a URL's explicit port.
 *
 * @param url - The URL to modify in place.
 * @returns The same `url` object, with `port` set to the empty string.
 */
export const stripPort = function (url: ParsedUrl): ParsedUrl {
  const noPort = '';
  url.port = noPort;
  return url;
};
60+
61+
/**
 * Deletes query parameters whose names match a pattern.
 *
 * @param url - The URL whose `searchParams` are modified in place.
 * @param options - Pattern that parameter names are tested against. The
 *   default strips names starting with `utm_` followed by word characters
 *   (e.g. `utm_source`), as well as names containing `src`, `referrer` or
 *   `referer`.
 * @returns The same `url` object.
 */
export const stripQueryParameters = function (
  url: ParsedUrl,
  // `\w+` (word characters), not `\s+` (whitespace): the latter can never
  // match real tracking parameters such as `utm_source`.
  options: StringMatch = /^utm_\w+|src|referrer|referer/,
): ParsedUrl {
  const stripList = regExpFromStringMatch(options);

  // Snapshot the distinct names first: URLSearchParams iterators are live, so
  // deleting while iterating causes the entry after each deletion to be
  // skipped.
  const names = new Set(url.searchParams.keys());
  for (const name of names) {
    // Reset in case the caller supplied a global/sticky RegExp, whose
    // `test()` is stateful across calls.
    stripList.lastIndex = 0;
    if (stripList.test(name)) {
      url.searchParams.delete(name);
    }
  }

  return url;
};
74+
75+
/**
 * Clears a URL's subdomain when it matches a pattern.
 *
 * @param url - The URL to modify in place.
 * @param options - Pattern the subdomain is tested against; the default
 *   matches subdomains beginning with `ww` followed by one or more `w` or
 *   digit characters (e.g. `www`, `ww2`).
 * @returns The same `url` object; `subdomain` is set to the empty string when
 *   the pattern matches and is left unchanged otherwise.
 */
export const stripSubdomains = function (
  url: ParsedUrl,
  options: StringMatch = /^ww[w\d]+/,
): ParsedUrl {
  const matcher = regExpFromStringMatch(options);
  const shouldStrip = matcher.test(url.subdomain);

  if (shouldStrip) {
    url.subdomain = '';
  }

  return url;
};
83+
84+
/**
 * Removes a single trailing `/` from a URL's path.
 *
 * @param url - The URL to modify in place.
 * @returns The same `url` object; when `pathname` ends with `/`, it is
 *   reassigned without that final character, otherwise it is not touched.
 */
export const stripTrailingSlash = function (url: ParsedUrl): ParsedUrl {
  const path = url.pathname;
  if (!path.endsWith('/')) {
    return url;
  }

  // The path is known to be non-empty here, so dropping the last character is
  // always well defined.
  url.pathname = path.slice(0, -1);
  return url;
};
91+
92+
/**
 * Sorts a URL's query parameters in place.
 *
 * @param url - The URL whose `searchParams.sort()` is invoked.
 * @returns The same `url` object.
 */
export const sortQueryParameters = function (url: ParsedUrl): ParsedUrl {
  const { searchParams } = url;
  searchParams.sort();
  return url;
};

source/normalized-url.ts

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import { URL } from 'node:url';
22
import { ParsedUrl } from './parsed-url.js';
3-
import { UrlMutators, UrlMutator } from './mutations.js';
3+
import * as UrlMutators from './mutators.js';
4+
import { UrlMutator } from './index.js';
45

56
export class NormalizedUrl extends ParsedUrl {
6-
static normalizer: UrlMutator = UrlMutators.DefaultNormalizer;
7+
static normalizer: UrlMutator = UrlMutators.defaultNormalizer;
78
original: string;
89

910
constructor(

0 commit comments

Comments
 (0)