// Source: motoko_url_kit/src/ComprehensiveDomainParser.mo (edjCase/motoko_url_kit, branch main)
import DomainSuffixData "./data/DomainSuffixData";
import Map "mo:core@1/Map";
import Domain "Domain";
import Result "mo:core@1/Result";
import Text "mo:core@1/Text";
import Iter "mo:core@1/Iter";
import Runtime "mo:core@1/Runtime";
import Array "mo:core@1/Array";
import Char "mo:core@1/Char";
import List "mo:core@1/List";

module {

  /// Domain parser backed by the comprehensive Public Suffix List.
  /// The compressed suffix data is expanded on the first call to `parse`, and the
  /// resulting map is kept on the instance and reused by every later call.
  ///
  /// ```motoko
  /// let parser = ComprehensiveDomainParser.ComprehensiveDomainParser();
  /// let result = parser.parse("blog.github.io");
  /// // result is #ok({ name = "blog"; suffix = "github.io"; subdomains = [] })
  /// ```
  public class ComprehensiveDomainParser() : Domain.DomainParser {
    // Lazily populated suffix map; stays `null` until the first `parse` call.
    var decompressedSuffixes : ?Map.Map<Text, Domain.SuffixEntry> = null;

    // Returns the cached suffix map, building and storing it on first use.
    func loadSuffixes() : Map.Map<Text, Domain.SuffixEntry> {
      let ?cached = decompressedSuffixes else {
        let fresh = decompressData(DomainSuffixData.value);
        decompressedSuffixes := ?fresh;
        return fresh;
      };
      cached;
    };

    /// Splits `domain` into name, suffix and subdomain components by delegating
    /// to `Domain.fromTextAdvanced` with the comprehensive suffix map.
    ///
    /// ```motoko
    /// let parser = ComprehensiveDomainParser.ComprehensiveDomainParser();
    /// let result = parser.parse("www.example.com");
    /// // result is #ok({ name = "example"; suffix = "com"; subdomains = ["www"] })
    /// ```
    public func parse(domain : Text) : Result.Result<Domain.Domain, Text> {
      Domain.fromTextAdvanced(domain, loadSuffixes());
    };

  };

  /// Expands the compact suffix encoding into a map keyed by segment name.
  /// The input is cut on `|`; each piece is handed to `parseSegment`, and the
  /// returned (name, entry) pair is stored in the result map.
  /// The parser uses this internally, but it is public so callers can build
  /// their own suffix maps.
  ///
  /// ```motoko
  /// let suffixMap = ComprehensiveDomainParser.decompressData(DomainSuffixData.value);
  /// // Returns a Map<Text, SuffixEntry> for efficient suffix lookups
  /// ```
  public func decompressData(compressed : Text) : Map.Map<Text, Domain.SuffixEntry> {
    let suffixTable = Map.empty<Text, Domain.SuffixEntry>();

    // One top-level segment per `|`-separated piece.
    for (rawSegment in Text.split(compressed, #char('|'))) {
      let parsed = parseSegment(rawSegment);
      Map.add(suffixTable, Text.compare, parsed.0, parsed.1);
    };

    suffixTable;
  };
  // Parses one encoded segment into its name and suffix entry.
  //
  // Segment shapes handled here:
  //   - no `>` and no marker : a terminal entry with no child rule
  //   - `name^a,b`           : terminal entry with a wildcard child rule and exceptions
  //   - `name*a,b`           : non-terminal entry with a wildcard child rule and exceptions
  //   - `parent>children`    : entry whose children come from the text after the first
  //                            top-level `>`; a trailing `!` on the parent marks it terminal
  //
  // Traps when a segment contains `>` only inside parentheses, or when a
  // wildcard/exception segment cannot be split into two parts.
  private func parseSegment(segment : Text) : (Text, Domain.SuffixEntry) {
    // Leaf segment: nothing to recurse into.
    if (not Text.contains(segment, #char('>'))) {
      // Cuts `segment` on `marker`; the first piece is the name, the second a
      // comma-separated exception list.
      func nameAndExceptions(marker : Char) : (Text, [Text]) {
        let pieces = Text.split(segment, #char(marker));
        switch (pieces.next()) {
          case (null) Runtime.trap("Invalid wildcard/exception format: " # segment);
          case (?name) {
            switch (pieces.next()) {
              case (null) Runtime.trap("Invalid wildcard/exception format: " # segment);
              case (?rawExceptions) {
                (name, Iter.toArray(Text.split(rawExceptions, #char(','))));
              };
            };
          };
        };
      };

      // `^` is checked before `*`, so a segment containing both is split on `^`.
      if (Text.contains(segment, #char('^'))) {
        let (name, exceptions) = nameAndExceptions('^');
        return (
          name,
          {
            isTerminal = true;
            childRule = #wildcardWithExceptions(exceptions);
          },
        );
      };
      if (Text.contains(segment, #char('*'))) {
        let (name, exceptions) = nameAndExceptions('*');
        return (
          name,
          {
            isTerminal = false;
            childRule = #wildcardWithExceptions(exceptions);
          },
        );
      };
      return (
        segment,
        {
          isTerminal = true;
          childRule = #none;
        },
      );
    };

    // Parent/children segment: cut at the first `>` that is not inside parentheses.
    let halves = switch (splitOnFirstTopLevelChar(segment, '>')) {
      case (?found) found;
      case (null) Runtime.trap("Invalid compressed format: " # segment);
    };
    let head = halves.before;
    let tail = halves.after;

    // A trailing `!` on the parent marks it as terminal and is not part of the name.
    let (parentName, parentIsTerminal) = switch (Text.stripEnd(head, #char('!'))) {
      case (?trimmed) (trimmed, true);
      case (null) (head, false);
    };

    let rule : Domain.SuffixChildRule = if (containsTopLevelChar(tail, '>')) {
      // Another top-level `>` means a chain: the tail is a single child that
      // itself has children.
      let nested = parseSegment(tail);
      let onlyChild = Map.empty<Text, Domain.SuffixEntry>();
      Map.add(onlyChild, Text.compare, nested.0, nested.1);
      #specific(onlyChild);
    } else {
      // Otherwise the tail lists the direct children (possibly several).
      parseChildRule(tail);
    };

    (
      parentName,
      {
        isTerminal = parentIsTerminal;
        childRule = rule;
      },
    );
  };

  // Reports whether `char` occurs in `text` outside of any parenthesised group.
  //
  // Nesting depth is tracked by counting `(` and `)`; only characters seen at
  // depth zero are compared against `char`. Parentheses themselves are consumed
  // by the depth tracking, so searching for `(` or `)` always yields `false`.
  // Scanning stops at the first match.
  //
  // Traps with a descriptive message when a `)` is met at depth zero. The depth
  // counter is a `Nat`, so decrementing it there would otherwise abort with an
  // opaque arithmetic-underflow trap.
  private func containsTopLevelChar(text : Text, char : Char) : Bool {
    var parenDepth : Nat = 0;

    // Iterate the characters directly; no intermediate array is needed here.
    for (c in text.chars()) {
      switch (c) {
        case ('(') parenDepth += 1;
        case (')') {
          if (parenDepth == 0) {
            Runtime.trap("Invalid compressed format: unbalanced ')' in " # text);
          };
          parenDepth -= 1;
        };
        case (_) {
          if (c == char and parenDepth == 0) {
            return true;
          };
        };
      };
    };
    false;
  };

  // Splits `text` at the first occurrence of `char` that lies outside parentheses.
  //
  // Returns `?{ before; after }` where `before` is everything left of the match
  // and `after` is everything right of it (the matched character is dropped).
  // Returns `null` when `char` never appears at nesting depth zero.
  //
  // Traps with a descriptive message when a `)` is met at depth zero. The depth
  // counter is a `Nat`, so decrementing it there would otherwise abort with an
  // opaque arithmetic-underflow trap.
  private func splitOnFirstTopLevelChar(text : Text, char : Char) : ?{
    before : Text;
    after : Text;
  } {
    // An array is needed so both halves can be sliced out by position.
    let chars = Text.toArray(text);
    var parenDepth : Nat = 0;
    var index = 0;

    for (c in chars.vals()) {
      switch (c) {
        case ('(') {
          parenDepth += 1;
        };
        case (')') {
          if (parenDepth == 0) {
            Runtime.trap("Invalid compressed format: unbalanced ')' in " # text);
          };
          parenDepth -= 1;
        };
        case (_) {
          if (c == char and parenDepth == 0) {
            return ?{
              before = Text.fromArray(Array.sliceToArray(chars, 0, index));
              // `index + 1` skips the separator itself.
              after = Text.fromArray(Array.sliceToArray(chars, index + 1, chars.size()));
            };
          };
        };
      };
      index += 1;
    };

    null;
  };

  // Turns the text after a parent's `>` into a child rule.
  //
  //   - no children                  -> `#none`
  //   - first child is exactly `*`   -> `#wildcardWithExceptions`, where every
  //                                     remaining child must start with `!`
  //                                     (the `!` is removed); traps otherwise
  //   - anything else                -> `#specific`, a map built by parsing each
  //                                     child with `parseSegment` after removing
  //                                     one pair of enclosing parentheses, if present
  private func parseChildRule(childrenText : Text) : Domain.SuffixChildRule {
    let items = splitChildren(childrenText);
    if (List.isEmpty(items)) {
      return #none;
    };

    if (List.at(items, 0) != "*") {
      // Explicit children: parse each one into the lookup map.
      let table = Map.empty<Text, Domain.SuffixEntry>();

      for (item in List.values(items)) {
        let isWrapped = Text.startsWith(item, #char('(')) and Text.endsWith(item, #char(')'));

        // Drop one enclosing pair of parentheses before recursing.
        let inner = if (isWrapped) {
          let ?noOpen = Text.stripStart(item, #char('(')) else Runtime.unreachable();
          let ?noClose = Text.stripEnd(noOpen, #char(')')) else Runtime.unreachable();
          noClose;
        } else {
          item;
        };

        let parsed = parseSegment(inner);
        Map.add(table, Text.compare, parsed.0, parsed.1);
      };
      return #specific(table);
    };

    // Wildcard: everything after the leading `*` is an exception written as `!name`.
    let collected = List.empty<Text>();
    for (raw in Iter.drop(List.values(items), 1)) {
      switch (Text.stripStart(raw, #char('!'))) {
        case (?name) List.add(collected, name);
        case (null) Runtime.trap("Invalid exception format: " # raw);
      };
    };
    #wildcardWithExceptions(List.toArray(collected));
  };

  // Splits `text` on commas that lie outside parentheses.
  //
  // Parentheses are kept in the emitted pieces, and commas inside a
  // parenthesised group stay part of their piece. Empty pieces (leading,
  // trailing, or doubled commas) are skipped, so empty input yields an empty list.
  //
  // Traps with a descriptive message when a `)` is met at depth zero. The depth
  // counter is a `Nat`, so decrementing it there would otherwise abort with an
  // opaque arithmetic-underflow trap.
  private func splitChildren(text : Text) : List.List<Text> {
    let result = List.empty<Text>();
    var current = "";
    var parenDepth : Nat = 0;

    // Iterate the characters directly; no intermediate array is needed here.
    for (char in text.chars()) {
      switch (char) {
        case ('(') {
          parenDepth += 1;
          current #= Char.toText(char);
        };
        case (')') {
          if (parenDepth == 0) {
            Runtime.trap("Invalid compressed format: unbalanced ')' in " # text);
          };
          parenDepth -= 1;
          current #= Char.toText(char);
        };
        case (',') {
          if (parenDepth == 0) {
            // Top-level separator: flush the piece collected so far, if any.
            if (current != "") {
              List.add(result, current);
              current := "";
            };
          } else {
            // Comma inside a group belongs to the current piece.
            current #= Char.toText(char);
          };
        };
        case (_) {
          current #= Char.toText(char);
        };
      };
    };

    // Flush the final piece; there is no trailing separator to trigger it.
    if (current != "") {
      List.add(result, current);
    };

    result;
  };
};