View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2015, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   % Module interface of library(uri).  The trailing comment on each
   % export gives the argument modes of that predicate.
   :- module(uri,
             [ uri_components/2,           % ?URI, ?Components
               uri_data/3,                 % ?Field, +Components, ?Data
               uri_data/4,                 % +Field, +Components, +Data, -New

               uri_normalized/2,           % +URI, -NormalizedURI
               iri_normalized/2,           % +IRI, -NormalizedIRI
               uri_normalized_iri/2,       % +URI, -NormalizedIRI
               uri_normalized/3,           % +URI, +Base, -NormalizedURI
               iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
               uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
               uri_resolve/3,              % +URI, +Base, -AbsURI
               uri_is_global/1,            % +URI
               uri_query_components/2,     % ?QueryString, ?NameValueList
               uri_authority_components/2, % ?Authority, ?Components
               uri_authority_data/3,       % ?Field, ?Components, ?Data
                                           % Encoding
               uri_encoded/3,              % +Component, ?Value, ?Encoded
               uri_file_name/2,            % ?URI, ?Path
               uri_iri/2                   % ?URI, ?IRI
             ]).

   % Exported predicates that have no Prolog clauses in this file are
   % presumably provided by this foreign library -- confirm against the
   % C sources.
   :- use_foreign_library(foreign(uri)).
   /** <module> Process URIs

   This library provides high-performance C-based primitives for
   manipulating URIs. We chose a C-based implementation because of its
   much better performance on raw character manipulation. Notably, URI
   handling primitives are used in time-critical parts of RDF processing.
   This implementation is based on RFC-3986:

           http://labs.apache.org/webarch/uri/rfc/rfc3986.html

   The URI processing in this library is rather liberal. That is, we break
   URIs according to the rules, but we do not validate that the components
   are valid. Also, percent-decoding for IRIs is liberal. It first tries
   UTF-8, then ISO-Latin-1, and finally accepts %-characters verbatim.

   Earlier experience has shown that strict enforcement of the URI syntax
   results in many errors on input that is accepted by many other
   web-document processing tools.
   */
   77
   %!  uri_components(+URI, -Components) is det.
   %!  uri_components(-URI, +Components) is det.
   %
   %   Break a URI into its 5 basic components according to the
   %   RFC-3986 regular expression:
   %
   %       ==
   %       ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
   %        12            3  4          5       6  7        8 9
   %       ==
   %
   %   @param Components is a term uri_components(Scheme, Authority,
   %   Path, Search, Fragment). If a URI is *parsed*, i.e., using mode
   %   (+,-), components that are not found are left _uninstantiated_
   %   (variable). See uri_data/3 for accessing this structure.
   93
   %!  uri_data(+Field, +Components, ?Data) is semidet.
   %!  uri_data(-Field, +Components, ?Data) is nondet.
   %
   %   Provide access to the uri_components/5 structure: Data is the
   %   value held in the slot named Field.  The defined field names are
   %   =scheme=, =authority=, =path=, =search= and =fragment=.  Each
   %   clause below pairs one field name with its argument position, so
   %   an unbound Field enumerates the fields in that order.

   uri_data(scheme,    uri_components(Scheme, _, _, _, _),    Scheme).
   uri_data(authority, uri_components(_, Authority, _, _, _), Authority).
   uri_data(path,      uri_components(_, _, Path, _, _),      Path).
   uri_data(search,    uri_components(_, _, _, Search, _),    Search).
   uri_data(fragment,  uri_components(_, _, _, _, Fragment),  Fragment).
  104
  %!  uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
  %
  %   NewComponents is a copy of the uri_components/5 term Components
  %   in which the slot named Field holds Data; the four other slots
  %   are shared with Components.  Field is one of =scheme=,
  %   =authority=, =path=, =search= or =fragment=.

  uri_data(scheme,    uri_components(_, Authority, Path, Search, Fragment),
           Scheme,
           uri_components(Scheme, Authority, Path, Search, Fragment)).
  uri_data(authority, uri_components(Scheme, _, Path, Search, Fragment),
           Authority,
           uri_components(Scheme, Authority, Path, Search, Fragment)).
  uri_data(path,      uri_components(Scheme, Authority, _, Search, Fragment),
           Path,
           uri_components(Scheme, Authority, Path, Search, Fragment)).
  uri_data(search,    uri_components(Scheme, Authority, Path, _, Fragment),
           Search,
           uri_components(Scheme, Authority, Path, Search, Fragment)).
  uri_data(fragment,  uri_components(Scheme, Authority, Path, Search, _),
           Fragment,
           uri_components(Scheme, Authority, Path, Search, Fragment)).
  119
  %!  uri_normalized(+URI, -NormalizedURI) is det.
  %
  %   NormalizedURI is the normalized form of URI. Normalization is
  %   syntactic and involves the following steps:
  %
  %       * 6.2.2.1. Case Normalization
  %       * 6.2.2.2. Percent-Encoding Normalization
  %       * 6.2.2.3. Path Segment Normalization
  128
  %!  iri_normalized(+IRI, -NormalizedIRI) is det.
  %
  %   NormalizedIRI is the normalized form of IRI. Normalization is
  %   syntactic and involves the following steps:
  %
  %       * 6.2.2.1. Case Normalization
  %       * 6.2.2.3. Path Segment Normalization
  %
  %   @see    This is similar to uri_normalized/2, but does not do
  %           normalization of %-escapes.
  139
  %!  uri_normalized_iri(+URI, -NormalizedIRI) is det.
  %
  %   As uri_normalized/2, but percent-encoding is translated into IRI
  %   Unicode characters. The translation is liberal: valid UTF-8
  %   sequences of %-encoded bytes are mapped to the Unicode
  %   character. Other %XX-sequences are mapped to the corresponding
  %   ISO-Latin-1 character and sole % characters are left untouched.
  %
  %   @see uri_iri/2.
  149
  150
  %!  uri_is_global(+URI) is semidet.
  %
  %   True if URI has a scheme. The semantics are the same as the code
  %   below, but the implementation is more efficient as it does not
  %   need to parse the other components, nor needs to bind the
  %   scheme.
  %
  %   ==
  %   uri_is_global(URI) :-
  %           uri_components(URI, Components),
  %           uri_data(scheme, Components, Scheme),
  %           nonvar(Scheme).
  %   ==
  164
  %!  uri_resolve(+URI, +Base, -GlobalURI) is det.
  %
  %   Resolve a possibly local URI relative to Base. This implements
  %   http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
  169
  %!  uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
  %
  %   NormalizedGlobalURI is the normalized global version of URI.
  %   Behaves as if defined by:
  %
  %   ==
  %   uri_normalized(URI, Base, NormalizedGlobalURI) :-
  %           uri_resolve(URI, Base, GlobalURI),
  %           uri_normalized(GlobalURI, NormalizedGlobalURI).
  %   ==
  180
  %!  iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det.
  %
  %   NormalizedGlobalIRI is the normalized global version of IRI.
  %   This is similar to uri_normalized/3, but does not do %-escape
  %   normalization.
  186
  %!  uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
  %
  %   NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
  %   as if defined by:
  %
  %   ==
  %   uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
  %           uri_resolve(URI, Base, GlobalURI),
  %           uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
  %   ==
  197
  %!  uri_query_components(+String, -Query) is det.
  %!  uri_query_components(-String, +Query) is det.
  %
  %   Perform encoding and decoding of a URI query string. Query is a
  %   list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
  %   query elements of the forms Name(Value) and Name-Value are also
  %   accepted to enhance interoperability with the option and pairs
  %   libraries.  E.g.
  %
  %   ==
  %   ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
  %   QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
  %
  %   ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
  %   Q = [a=b, c='d+w', n='VU Amsterdam'].
  %   ==
  214
  215
  %!  uri_authority_components(+Authority, -Components) is det.
  %!  uri_authority_components(-Authority, +Components) is det.
  %
  %   Break down the authority component of a URI. The fields of the
  %   structure Components can be accessed using uri_authority_data/3.
  221
  %!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
  %
  %   Provide access to the uri_authority/4 structure: Data is the
  %   value held in the slot named Field.  The defined field names are
  %   =user=, =password=, =host= and =port=; each clause below pairs
  %   one field name with its argument position.

  uri_authority_data(user,     uri_authority(User, _, _, _),     User).
  uri_authority_data(password, uri_authority(_, Password, _, _), Password).
  uri_authority_data(host,     uri_authority(_, _, Host, _),     Host).
  uri_authority_data(port,     uri_authority(_, _, _, Port),     Port).
  231
  232
  %!  uri_encoded(+Component, +Value, -Encoded) is det.
  %!  uri_encoded(+Component, -Value, +Encoded) is det.
  %
  %   Encoded is the URI encoding for Value. When encoding
  %   (Value->Encoded), Component specifies the URI component where
  %   the value is used. It is one of =query_value=, =fragment= or
  %   =path=. Besides alphanumerical characters, the following
  %   characters are passed verbatim (the set is split in logical
  %   groups according to RFC3986).
  %
  %       $ query_value, fragment :
  %       "-._~" | "!$'()*,;" | ":@" | "/?"
  %       $ path :
  %       "-._~" | "!$&'()*,;=" | ":@" | "/"
  247
  248
  %!  uri_iri(+URI, -IRI) is det.
  %!  uri_iri(-URI, +IRI) is det.
  %
  %   Convert between a URI, encoded in US-ASCII, and an IRI. An IRI is
  %   a fully expanded Unicode string. Unicode strings are first
  %   encoded into UTF-8, after which %-encoding takes place.
  %
  %   @error syntax_error(Culprit) in mode (+,-) if URI is not a
  %   legally percent-encoded UTF-8 string.
  258
  259
  %!  uri_file_name(+URI, -FileName) is semidet.
  %!  uri_file_name(-URI, +FileName) is det.
  %
  %   Convert between a URI and a local file_name. This protocol is
  %   covered by RFC 1738. Please note that file-URIs use _absolute_
  %   paths.
  %
  %   In mode (+,-) the call succeeds only if URI has the scheme
  %   =file= and an authority that is absent, empty or =localhost=.
  %   The path is then percent-decoded and passed through
  %   delete_leading_slash/2.  In mode (-,+) FileName is first made
  %   absolute using absolute_file_name/2, so a relative path is
  %   translated into an absolute one, given a leading slash and
  %   percent-encoded.  The call fails if both arguments are unbound.

  uri_file_name(URI, FileName) :-
      nonvar(URI),
      !,
      uri_components(URI, Components),
      Components = uri_components(Scheme, Authority, EncodedPath, _, _),
      Scheme == file,                 % fails, without binding, if no scheme
      (   Authority = ''              % also binds an absent authority to ''
      ->  true
      ;   Authority = localhost
      ),
      uri_encoded(path, DecodedPath, EncodedPath),
      delete_leading_slash(DecodedPath, FileName).
  uri_file_name(URI, FileName) :-
      nonvar(FileName),
      !,
      absolute_file_name(FileName, AbsolutePath),
      ensure_leading_slash(AbsolutePath, SlashedPath),
      uri_encoded(path, SlashedPath, EncodedPath),
      % Search and fragment are left unbound.
      uri_components(URI, uri_components(file, '', EncodedPath, _, _)).
  290
  %!  ensure_leading_slash(+WinPath, -Path).
  %!  delete_leading_slash(+Path, -WinPath).
  %
  %   Absolute paths in Windows start with a drive letter rather than
  %   a /, whereas the path of a URI must start with a /.  These two
  %   helpers translate between the two notations.
  %   ensure_leading_slash/2 leaves a path that already starts with /
  %   unchanged and prepends a / to any other path.

  ensure_leading_slash(Path, SlashPath) :-
      sub_atom(Path, 0, 1, _, '/'),
      !,
      % Unify after the cut so a bound SlashPath cannot reach clause 2.
      SlashPath = Path.
  ensure_leading_slash(Path, SlashPath) :-
      atom_concat('/', Path, SlashPath).
  303
  % delete_leading_slash(+Path, -WinPath)
  %
  % On Windows, WinPath is Path without its leading / when what remains
  % is an absolute file name; in every other case WinPath is Path
  % itself.  On other platforms WinPath is always Path.
  :- if(current_prolog_flag(windows, true)).
  delete_leading_slash(Path, WinPath) :-
      (   atom_concat('/', WinPath, Path),
          is_absolute_file_name(WinPath)
      ->  true
      ;   WinPath = Path
      ).
  :- else.
  delete_leading_slash(Path, Path).
  :- endif.
  311
  312
                   /*******************************
                   *            SANDBOX           *
                   *******************************/
  316
  % Declare predicates of this module as safe primitives for the
  % sandbox module by adding clauses to the multifile predicate
  % sandbox:safe_primitive/1.  Of the exports, uri_data/3,4,
  % uri_authority_data/3 and uri_file_name/2 are not listed.
  % NOTE(review): uri_file_name/2 is presumably left out because it
  % calls absolute_file_name/2 -- confirm.
  :- multifile sandbox:safe_primitive/1.

  sandbox:safe_primitive(uri:uri_components(_,_)).
  sandbox:safe_primitive(uri:uri_normalized(_,_)).
  sandbox:safe_primitive(uri:iri_normalized(_,_)).
  sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
  sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
  sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
  sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
  sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
  sandbox:safe_primitive(uri:uri_is_global(_)).
  sandbox:safe_primitive(uri:uri_query_components(_,_)).
  sandbox:safe_primitive(uri:uri_authority_components(_,_)).
  sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
  sandbox:safe_primitive(uri:uri_iri(_,_)).