/*  Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (c)  2009-2015, VU University Amsterdam
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

:- module(uri,
          [ % Component access
            uri_components/2,           % ?URI, ?Components
            uri_data/3,                 % ?Field, +Components, ?Data
            uri_data/4,                 % +Field, +Components, +Data, -New

            % Normalization and resolution
            uri_normalized/2,           % +URI, -NormalizedURI
            iri_normalized/2,           % +IRI, -NormalizedIRI
            uri_normalized_iri/2,       % +URI, -NormalizedIRI
            uri_normalized/3,           % +URI, +Base, -NormalizedURI
            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
            uri_resolve/3,              % +URI, +Base, -AbsURI
            uri_is_global/1,            % +URI
            uri_query_components/2,     % ?QueryString, ?NameValueList
            uri_authority_components/2, % ?Authority, ?Components
            uri_authority_data/3,       % ?Field, ?Components, ?Data

            % Encoding
            uri_encoded/3,              % +Component, ?Value, ?Encoded
            uri_file_name/2,            % ?URI, ?Path
            uri_iri/2                   % ?URI, ?IRI
          ]).
:- use_foreign_library(foreign(uri)).

/** <module> Process URIs

This library provides high-performance C-based primitives for
manipulating URIs. We decided for a C-based implementation for the much
better performance on raw character manipulation. Notably, URI handling
primitives are used in time-critical parts of RDF processing. This
implementation is based on RFC-3986:

        http://labs.apache.org/webarch/uri/rfc/rfc3986.html

The URI processing in this library is rather liberal. That is, we break
URIs according to the rules, but we do not validate that the components
are valid. Also, percent-decoding for IRIs is liberal. It first tries
UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.

Earlier experience has shown that strict enforcement of the URI syntax
results in many errors that are accepted by many other web-document
processing tools.
*/

%!  uri_components(+URI, -Components) is det.
%!  uri_components(-URI, +Components) is det.
%
%   Break a URI into its 5 basic components according to the
%   RFC-3986 regular expression:
%
%   ==
%   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
%    12            3  4          5       6  7        8 9
%   ==
%
%   @param Components is a term uri_components(Scheme, Authority,
%   Path, Search, Fragment). If a URI is *parsed*, i.e., using mode
%   (+,-), components that are not found are left _uninstantiated_
%   (variable). See uri_data/3 for accessing this structure.

%!  uri_data(?Field, +Components, ?Data) is semidet.
%
%   Provide access to the uri_components structure. Data is the
%   argument of the term uri_components(Scheme, Authority, Path,
%   Search, Fragment) that is selected by Field. Defined field-names
%   are: =scheme=, =authority=, =path=, =search= and =fragment=

uri_data(scheme,    uri_components(Scheme, _, _, _, _),    Scheme).
uri_data(authority, uri_components(_, Authority, _, _, _), Authority).
uri_data(path,      uri_components(_, _, Path, _, _),      Path).
uri_data(search,    uri_components(_, _, _, Search, _),    Search).
uri_data(fragment,  uri_components(_, _, _, _, Fragment),  Fragment).

%!  uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
%
%   NewComponents is the same as Components with Field set to Data.
%   All other fields are shared between the two terms.

uri_data(scheme,
         uri_components(_, Authority, Path, Search, Fragment),
         Scheme,
         uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(authority,
         uri_components(Scheme, _, Path, Search, Fragment),
         Authority,
         uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(path,
         uri_components(Scheme, Authority, _, Search, Fragment),
         Path,
         uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(search,
         uri_components(Scheme, Authority, Path, _, Fragment),
         Search,
         uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(fragment,
         uri_components(Scheme, Authority, Path, Search, _),
         Fragment,
         uri_components(Scheme, Authority, Path, Search, Fragment)).

%!  uri_normalized(+URI, -NormalizedURI) is det.
%
%   NormalizedURI is the normalized form of URI. Normalization is
%   syntactic and involves the following steps:
%
%     * 6.2.2.1. Case Normalization
%     * 6.2.2.2. Percent-Encoding Normalization
%     * 6.2.2.3. Path Segment Normalization

%!  iri_normalized(+IRI, -NormalizedIRI) is det.
%
%   NormalizedIRI is the normalized form of IRI.
%   Normalization is syntactic and involves the following steps:
%
%     * 6.2.2.1. Case Normalization
%     * 6.2.2.3. Path Segment Normalization
%
%   @see    This is similar to uri_normalized/2, but does not do
%           normalization of %-escapes.

%!  uri_normalized_iri(+URI, -NormalizedIRI) is det.
%
%   As uri_normalized/2, but percent-encoding is translated into IRI
%   Unicode characters. The translation is liberal: valid UTF-8
%   sequences of %-encoded bytes are mapped to the Unicode
%   character. Other %XX-sequences are mapped to the corresponding
%   ISO-Latin-1 character and sole % characters are left untouched.
%
%   @see uri_iri/2.


%!  uri_is_global(+URI) is semidet.
%
%   True if URI has a scheme. The semantics is the same as the code
%   below, but the implementation is more efficient as it does not
%   need to parse the other components, nor needs to bind the
%   scheme.
%
%   ==
%   uri_is_global(URI) :-
%           uri_components(URI, Components),
%           uri_data(scheme, Components, Scheme),
%           nonvar(Scheme).
%   ==

%!  uri_resolve(+URI, +Base, -GlobalURI) is det.
%
%   Resolve a possibly local URI relative to Base. This implements
%   http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform

%!  uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
%
%   NormalizedGlobalURI is the normalized global version of URI.
%   Behaves as if defined by:
%
%   ==
%   uri_normalized(URI, Base, NormalizedGlobalURI) :-
%           uri_resolve(URI, Base, GlobalURI),
%           uri_normalized(GlobalURI, NormalizedGlobalURI).
%   ==

%!  iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det.
%
%   NormalizedGlobalIRI is the normalized global version of IRI.
%   This is similar to uri_normalized/3, but does not do %-escape
%   normalization.

%!  uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
%
%   NormalizedGlobalIRI is the normalized global IRI of URI.
%   Behaves as if defined by:
%
%   ==
%   uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
%           uri_resolve(URI, Base, GlobalURI),
%           uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
%   ==

%!  uri_query_components(+String, -Query) is det.
%!  uri_query_components(-String, +Query) is det.
%
%   Perform encoding and decoding of a URI query string. Query is a
%   list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
%   query elements of the forms Name(Value) and Name-Value are also
%   accepted to enhance interoperability with the option and pairs
%   libraries. E.g.
%
%   ==
%   ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
%   QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
%
%   ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
%   Q = [a=b, c='d+w', n='VU Amsterdam'].
%   ==


%!  uri_authority_components(+Authority, -Components) is det.
%!  uri_authority_components(-Authority, +Components) is det.
%
%   Break-down the authority component of a URI. The fields of the
%   structure Components can be accessed using uri_authority_data/3.

%!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
%
%   Provide access to the uri_authority structure. Components is a
%   term uri_authority(User, Password, Host, Port) and Data is the
%   argument selected by Field. Defined field-names are: =user=,
%   =password=, =host= and =port=

uri_authority_data(user,     uri_authority(U, _, _, _), U).
uri_authority_data(password, uri_authority(_, P, _, _), P).
uri_authority_data(host,     uri_authority(_, _, H, _), H).
uri_authority_data(port,     uri_authority(_, _, _, P), P).


%!  uri_encoded(+Component, +Value, -Encoded) is det.
%!  uri_encoded(+Component, -Value, +Encoded) is det.
%
%   Encoded is the URI encoding for Value. When encoding
%   (Value->Encoded), Component specifies the URI component where
%   the value is used. It is one of =query_value=, =fragment= or
%   =path=.
%   Besides alphanumerical characters, the following
%   characters are passed verbatim (the set is split in logical
%   groups according to RFC3986).
%
%       $ query_value, fragment :
%       "-._~" | "!$'()*,;" | ":@" | "/?"
%       $ path :
%       "-._~" | "!$&'()*,;=" | ":@" | "/"


%!  uri_iri(+URI, -IRI) is det.
%!  uri_iri(-URI, +IRI) is det.
%
%   Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
%   a fully expanded Unicode string. Unicode strings are first
%   encoded into UTF-8, after which %-encoding takes place.
%
%   @error syntax_error(Culprit) in mode (+,-) if URI is not a
%   legally percent-encoded UTF-8 string.


%!  uri_file_name(+URI, -FileName) is semidet.
%!  uri_file_name(-URI, +FileName) is det.
%
%   Convert between a URI and a local file_name. This protocol is
%   covered by RFC 1738. Please note that file-URIs use _absolute_
%   paths. The mode (-, +) translates a possible relative path into
%   an absolute one.
%
%   In mode (+,-) the conversion only succeeds if the scheme of URI
%   is =file= and its authority is either empty (or absent) or
%   =localhost=. The predicate fails if both arguments are unbound.

uri_file_name(URI, FileName) :-
    (   nonvar(URI)
    ->  % URI -> file name: take the URI apart and decode its path.
        uri_components(URI, Components),
        Components = uri_components(Scheme, Authority, EncodedPath, _, _),
        Scheme == file,
        (   Authority = ''              % also binds an absent authority
        ->  true
        ;   Authority = localhost
        ),
        uri_encoded(path, DecodedPath, EncodedPath),
        delete_leading_slash(DecodedPath, FileName)
    ;   nonvar(FileName)
    ->  % File name -> URI: make the path absolute and encode it.
        absolute_file_name(FileName, AbsolutePath),
        ensure_leading_slash(AbsolutePath, SlashPath),
        uri_encoded(path, SlashPath, EncodedSlashPath),
        uri_components(URI,
                       uri_components(file, '', EncodedSlashPath, _, _))
    ).

%!  ensure_leading_slash(+WinPath, -Path).
%!  delete_leading_slash(+Path, -WinPath).
%
%   Deal with the fact that absolute paths in Windows start with a
%   drive letter rather than a /. For URIs we need a path that
%   starts with a /.
ensure_leading_slash(Path, SlashPath) :-
    (   sub_atom(Path, 0, _, _, /)          % Path already starts with a /
    ->  SlashPath = Path
    ;   atom_concat(/, Path, SlashPath)     % prepend the missing /
    ).

% The first clause is only compiled on Windows: it strips the leading /
% when the remainder is itself an absolute file name. The final clause
% is the fall-through on every platform and returns Path unchanged.

:- if(current_prolog_flag(windows, true)).
delete_leading_slash(Path, WinPath) :-
    atom_concat(/, WinPath, Path),
    is_absolute_file_name(WinPath),
    !.
:- endif.
delete_leading_slash(Path, Path).


                 /*******************************
                 *            SANDBOX           *
                 *******************************/

% Add the URI primitives listed below to the multifile predicate
% sandbox:safe_primitive/1.

:- multifile sandbox:safe_primitive/1.

sandbox:safe_primitive(uri:uri_components(_,_)).
sandbox:safe_primitive(uri:uri_normalized(_,_)).
sandbox:safe_primitive(uri:iri_normalized(_,_)).
sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
sandbox:safe_primitive(uri:uri_is_global(_)).
sandbox:safe_primitive(uri:uri_query_components(_,_)).
sandbox:safe_primitive(uri:uri_authority_components(_,_)).
sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
sandbox:safe_primitive(uri:uri_iri(_,_)).