1
1
# coding: utf-8
2
+ # frozen_string_literal: true
2
3
require 'cgi'
3
4
4
5
module RDF
@@ -28,27 +29,27 @@ class URI
28
29
include RDF ::Resource
29
30
30
31
# IRI components
#
# Building blocks for the IRI grammar (the production names match the ABNF
# of RFC 3987 / RFC 3986). Every Regexp is frozen so it can be shared safely.

# Contents of a character class (a String, not a Regexp) covering the
# non-ASCII "ucschar" ranges. It is kept as a String so it can be spliced
# into other bracket expressions; the layout whitespace is stripped below.
UCSCHAR = %(
  \\u00A0-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFEF
  \\u{10000}-\\u{1FFFD}\\u{20000}-\\u{2FFFD}\\u{30000}-\\u{3FFFD}
  \\u{40000}-\\u{4FFFD}\\u{50000}-\\u{5FFFD}\\u{60000}-\\u{6FFFD}
  \\u{70000}-\\u{7FFFD}\\u{80000}-\\u{8FFFD}\\u{90000}-\\u{9FFFD}
  \\u{A0000}-\\u{AFFFD}\\u{B0000}-\\u{BFFFD}\\u{C0000}-\\u{CFFFD}
  \\u{D0000}-\\u{DFFFD}\\u{E1000}-\\u{EFFFD}
).gsub(/\s+/, '').freeze
IPRIVATE = Regexp.compile("[\\uE000-\\uF8FF\\u{F0000}-\\u{FFFFD}\\u{100000}-\\u{10FFFD}]").freeze
# The hyphen is placed last in the class so it is a literal: written as
# `+-.` it forms the range U+002B..U+002E, which also admits ",".
SCHEME = Regexp.compile("[A-Za-z](?:[A-Za-z0-9+.-])*").freeze
PORT = Regexp.compile("[0-9]*").freeze
IP_literal = Regexp.compile("\\[[0-9A-Fa-f:\\.]*\\]").freeze # Simplified, no IPvFuture
PCT_ENCODED = Regexp.compile("%[0-9A-Fa-f][0-9A-Fa-f]").freeze
GEN_DELIMS = Regexp.compile(%q{[:/\?\#\[\]@]}).freeze
SUB_DELIMS = Regexp.compile(%q{[!\$&'\(\)\*\+,;=]}).freeze
RESERVED = Regexp.union(GEN_DELIMS, SUB_DELIMS).freeze
UNRESERVED = Regexp.compile('[A-Za-z0-9._~-]').freeze

IUNRESERVED = Regexp.union(UNRESERVED, Regexp.compile("[#{UCSCHAR}]")).freeze

# ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
# Only ":" and "@" belong in the final class; a "|" inside brackets would be
# a literal pipe, not an alternation.
IPCHAR = Regexp.union(IUNRESERVED, PCT_ENCODED, SUB_DELIMS, /[:@]/).freeze

IQUERY = Regexp.compile("(?:#{IPCHAR}|#{IPRIVATE}|/|\\?)*").freeze
54
55
@@ -65,7 +66,7 @@ class URI
65
66
# An empty path matches the empty pattern.
IPATH_EMPTY = Regexp.compile("").freeze

# Registered name: any run of iunreserved / pct-encoded / sub-delims.
IREG_NAME = Regexp.compile("(?:(?:#{IUNRESERVED})|(?:#{PCT_ENCODED})|(?:#{SUB_DELIMS}))*").freeze
# A host is either a bracketed IP literal or a registered name.
IHOST = Regexp.union(IP_literal, IREG_NAME).freeze
# User information: as IREG_NAME, additionally allowing ":".
IUSERINFO = Regexp.compile("(?:(?:#{IUNRESERVED})|(?:#{PCT_ENCODED})|(?:#{SUB_DELIMS})|:)*").freeze
# Authority: optional "userinfo@", the host, optional ":port".
IAUTHORITY = Regexp.compile("(?:#{IUSERINFO}@)?#{IHOST}(?::#{PORT})?").freeze
71
72
@@ -116,7 +117,21 @@ class URI
116
117
# Note: not all reserved characters need to be escaped in SPARQL/Turtle, but they must be unescaped when encountered
PN_ESCAPE_CHARS = /[~\.!\$&'\(\)\*\+,;=\/\?\#@%]/.freeze
# A backslash followed by one of the characters above, or by "-" or "_".
PN_ESCAPES = /\\#{Regexp.union(PN_ESCAPE_CHARS, /[\-_]/)}/.freeze
119
-
120
+
121
# For URI encoding
#
# Each ENCODE_* pattern is a *negated* character class: it matches the
# characters that must be percent-encoded in the given component, i.e.
# everything that is not listed.

# iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" )
ENCODE_USER =
  ENCODE_PASSWORD = Regexp.compile("[^A-Za-z0-9\._~#{UCSCHAR}!$&'\(\)\*\+,;=:-]").freeze
# isegment = *ipchar
# ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
# NOTE(review): the grammar above allows "@", but the class below does not
# list it, so "@" in a segment is percent-encoded - confirm this is intended.
ENCODE_ISEGMENT = Regexp.compile("[^A-Za-z0-9\._~#{UCSCHAR}!$&'\(\)\*\+,;=:-]").freeze
# isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims / "@" )
# NOTE(review): same remark about "@" as for ENCODE_ISEGMENT.
ENCODE_ISEGMENT_NC = Regexp.compile("[^A-Za-z0-9\._~#{UCSCHAR}!$&'\(\)\*\+,;=-]").freeze
# iquery = *( ipchar / iprivate / "/" / "?" )
ENCODE_IQUERY = Regexp.compile("[^A-Za-z0-9\._~#{UCSCHAR}\\uE000-\\uF8FF\\u{F0000}-\\u{FFFFD}\\u{100000}-\\u{10FFFD}/?=]").freeze
# ifragment = *( ipchar / "/" / "?" )
ENCODE_IFRAGMENT = Regexp.compile("[^A-Za-z0-9\._~#{UCSCHAR}/?]").freeze
134
+
120
135
##
121
136
# Cache size may be set through {RDF.config} using `uri_cache_size`.
122
137
#
@@ -170,7 +185,7 @@ def self.parse(str)
170
185
# @return [String] normalized path
171
186
# @see http://tools.ietf.org/html/rfc3986#section-5.2.4
172
187
def self . normalize_path ( path )
173
- output , input = "" , path . to_s
188
+ output , input = String . new , path . to_s
174
189
if input . encoding != Encoding ::ASCII_8BIT
175
190
input = input . dup . force_encoding ( Encoding ::ASCII_8BIT )
176
191
end
@@ -353,7 +368,7 @@ def length
353
368
# @return [Boolean] `true` or `false`
354
369
# @since 0.3.9
355
370
def valid?
  # Regexp#match? already returns a strict true/false (and avoids building a
  # MatchData), so no `|| false` coercion is required.
  RDF::URI::IRI.match?(to_s)
end
358
373
359
374
##
@@ -920,7 +935,7 @@ def scheme=(value)
920
935
# Return normalized version of scheme, if any
921
936
# @return [String]
922
937
def normalized_scheme
  # Strip surrounding whitespace and lowercase the scheme; nil when there is
  # no scheme. No percent-encoding pass is applied here.
  scheme.strip.downcase if scheme
end
925
940
926
941
##
@@ -946,7 +961,7 @@ def user=(value)
946
961
# Normalized version of user
947
962
# @return [String]
948
963
def normalized_user
  # Decode any existing escapes, then re-encode exactly the characters that
  # ENCODE_USER matches; the result is tagged as UTF-8. Returns nil when no
  # user is set.
  URI.encode(CGI.unescape(user), ENCODE_USER).force_encoding(Encoding::UTF_8) if user
end
951
966
952
967
##
@@ -972,7 +987,7 @@ def password=(value)
972
987
# Normalized version of password
973
988
# @return [String]
974
989
def normalized_password
  # Decode any existing escapes, then re-encode exactly the characters that
  # ENCODE_PASSWORD matches; the result is tagged as UTF-8. Returns nil when
  # no password is set.
  URI.encode(CGI.unescape(password), ENCODE_PASSWORD).force_encoding(Encoding::UTF_8) if password
end
977
992
978
993
HOST_FROM_AUTHORITY_RE = /(?:[^@]+@)?([^:]+)(?::.*)?$/ . freeze
@@ -1000,7 +1015,7 @@ def host=(value)
1000
1015
# @return [String]
1001
1016
def normalized_host
  # Remove all trailing '.' characters, then lowercase. Returns nil when
  # there is no host.
  host.sub(/\.*$/, '').downcase if host
end
1005
1020
1006
1021
PORT_FROM_AUTHORITY_RE = /:(\d +)$/ . freeze
@@ -1028,12 +1043,8 @@ def port=(value)
1028
1043
# @return [String]
1029
1044
def normalized_port
  return unless port

  np = port.to_i
  # A port equal to the scheme's entry in PORT_MAPPING is redundant and is
  # dropped (nil); any other port is returned as an Integer.
  PORT_MAPPING[normalized_scheme] == np ? nil : np
end
1039
1050
@@ -1064,30 +1075,36 @@ def path=(value)
1064
1075
# Normalized version of path
1065
1076
# @return [String]
1066
1077
def normalized_path
1078
+ if normalized_scheme == "urn"
1079
+ # Special-case URI. Normalize the NID component only
1080
+ nid , p = path . to_s . split ( ':' , 2 )
1081
+ return "#{ nid . downcase } :#{ p } "
1082
+ end
1083
+
1067
1084
segments = path . to_s . split ( '/' , -1 ) # preserve null segments
1068
1085
1069
1086
norm_segs = case
1070
1087
when authority
1071
1088
# ipath-abempty
1072
- segments . map { |s | normalize_segment ( s , ISEGMENT ) }
1089
+ segments . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) }
1073
1090
when segments [ 0 ] . nil?
1074
1091
# ipath-absolute
1075
1092
res = [ nil ]
1076
- res << normalize_segment ( segments [ 1 ] , ISEGMENT_NZ ) if segments . length > 1
1077
- res += segments [ 2 ..-1 ] . map { |s | normalize_segment ( s , ISEGMENT ) } if segments . length > 2
1093
+ res << normalize_segment ( segments [ 1 ] , ENCODE_ISEGMENT ) if segments . length > 1
1094
+ res += segments [ 2 ..-1 ] . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) } if segments . length > 2
1078
1095
res
1079
1096
when segments [ 0 ] . to_s . index ( ':' )
1080
1097
# ipath-noscheme
1081
1098
res = [ ]
1082
- res << normalize_segment ( segments [ 0 ] , ISEGMENT_NZ_NC )
1083
- res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ISEGMENT ) } if segments . length > 1
1099
+ res << normalize_segment ( segments [ 0 ] , ENCODE_ISEGMENT_NC )
1100
+ res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) } if segments . length > 1
1084
1101
res
1085
1102
when segments [ 0 ]
1086
1103
# ipath-rootless
1087
1104
# ipath-noscheme
1088
1105
res = [ ]
1089
- res << normalize_segment ( segments [ 0 ] , ISEGMENT_NZ )
1090
- res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ISEGMENT ) } if segments . length > 1
1106
+ res << normalize_segment ( segments [ 0 ] , ENCODE_ISEGMENT )
1107
+ res += segments [ 1 ..-1 ] . map { |s | normalize_segment ( s , ENCODE_ISEGMENT ) } if segments . length > 1
1091
1108
res
1092
1109
else
1093
1110
# Should be empty
@@ -1096,7 +1113,7 @@ def normalized_path
1096
1113
1097
1114
res = self . class . normalize_path ( norm_segs . join ( "/" ) )
1098
1115
# Special rules for specific protocols having empty paths
1099
- normalize_segment ( res . empty? ? ( %w( http https ftp tftp ) . include? ( normalized_scheme ) ? '/' : "" ) : res , IHIER_PART )
1116
+ ( res . empty? && %w( http https ftp tftp ) . include? ( normalized_scheme ) ) ? '/' : res
1100
1117
end
1101
1118
1102
1119
##
@@ -1120,7 +1137,7 @@ def query=(value)
1120
1137
# Normalized version of query
1121
1138
# @return [String]
1122
1139
def normalized_query
  # ENCODE_IQUERY matches the characters that must be percent-encoded in a
  # query; returns nil when there is no query.
  normalize_segment(query, ENCODE_IQUERY) if query
end
1125
1142
1126
1143
##
@@ -1144,7 +1161,7 @@ def fragment=(value)
1144
1161
# Normalized version of fragment
1145
1162
# @return [String]
1146
1163
def normalized_fragment
  # ENCODE_IFRAGMENT matches the characters that must be percent-encoded in
  # a fragment; returns nil when there is no fragment.
  normalize_segment(fragment, ENCODE_IFRAGMENT) if fragment
end
1149
1166
1150
1167
##
@@ -1274,15 +1291,15 @@ def query_values=(value)
1274
1291
self . query = case value
1275
1292
when Array , Hash
1276
1293
value . map do |( k , v ) |
1277
- k = normalize_segment ( k . to_s , UNRESERVED )
1294
+ k = normalize_segment ( k . to_s , /[^A-Za-z0-9 \. _~-]/ )
1278
1295
if v . nil?
1279
1296
k
1280
1297
else
1281
1298
Array ( v ) . map do |vv |
1282
1299
if vv === TrueClass
1283
1300
k
1284
1301
else
1285
- "#{ k } =#{ normalize_segment ( vv . to_s , UNRESERVED ) } "
1302
+ "#{ k } =#{ normalize_segment ( vv . to_s , /[^A-Za-z0-9 \. _~-]/ ) } "
1286
1303
end
1287
1304
end . join ( "&" )
1288
1305
end
@@ -1331,15 +1348,15 @@ def self._load(data)
1331
1348
# Normalize a segment using a character range
1332
1349
#
1333
1350
# @param [String] value
1334
# @param [Regexp] expr matches characters to be encoded
1335
1352
# @param [Boolean] downcase
1336
1353
# @return [String]
1337
1354
def normalize_segment(value, expr, downcase = false)
  return unless value

  # Work on a UTF-8 tagged copy so the caller's string is never mutated.
  decoded = CGI.unescape(value.dup.force_encoding(Encoding::UTF_8))
  # NOTE(review): CGI.unescape also turns "+" into a space (form decoding);
  # confirm that is acceptable for every component routed through here.
  decoded.downcase! if downcase
  # `expr` matches the characters to percent-encode, so it is passed straight
  # through rather than wrapped in a negated character class.
  URI.encode(decoded, expr).force_encoding(Encoding::UTF_8)
end
1345
1362
@@ -1364,7 +1381,7 @@ def format_authority
1364
1381
def self . encode ( str , expr )
1365
1382
str . gsub ( expr ) do
1366
1383
us = $&
1367
- tmp = ''
1384
+ tmp = String . new
1368
1385
us . each_byte do |uc |
1369
1386
tmp << sprintf ( '%%%02X' , uc )
1370
1387
end
0 commit comments