View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2000-2015, University of Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(url,
   36          [ parse_url/2,                % +URL, -Parts | -URL +Parts
   37            parse_url/3,                % +URL|URI, +BaseURL, -Parts
   38                                        % -URL, +BaseURL, +Parts
   39            is_absolute_url/1,          % +URL
   40            global_url/3,               % +Local, +Base, -Global
   41            http_location/2,            % ?Parts, ?Location
   42            www_form_encode/2,          % Value <-> Encoded
   43            parse_url_search/2,         % Form-data <-> Form fields
   44
   45            url_iri/2,                  % ?URL, ?IRI
   46
   47            file_name_to_url/2,         % ?FileName, ?URL
   48
   49            set_url_encoding/2          % ?Old, +New
   50          ]).   51:- use_module(library(lists)).   52:- use_module(library(error)).   53:- use_module(library(utf8)).

Analysing and constructing URL

This library deals with the analysis and construction of a URL, Uniform Resource Locator. URL is the basis for communicating locations of resources (data) on the web. A URL consists of a protocol identifier (e.g. HTTP, FTP), and a protocol-specific syntax further defining the location. URLs are standardized in RFC-1738.

The implementation in this library covers only a small portion of the defined protocols. Though the initial implementation followed RFC-1738 strictly, the current is more relaxed to deal with frequent violations of the standard encountered in practical use.

author: - Jan Wielemaker; - Lukas Faulstich
deprecated: -
New code should use library(uri), provided by the clib package. */

   74                 /*******************************
   75                 *            GLOBALISE         *
   76                 *******************************/

global_url(+URL, +Base, -Global) is det

Translate a possibly relative URL into an absolute one.

Errors: - syntax_error(illegal_url) if URL is not legal.

   %!  global_url(+URL, +BaseURL, -Global) is det.
   %
   %   Translate the possibly relative URL into the absolute URL Global,
   %   using BaseURL as the reference.  The cases are tried in order:
   %
   %     1. URL is absolute and holds no '%': returned unchanged.
   %     2. URL starts with '//': the protocol of BaseURL is prepended.
   %     3. URL starts with '#': it is appended to BaseURL, avoiding a
   %        doubled '#' if BaseURL already ends in one.
   %     4. Otherwise URL is parsed relative to BaseURL and rebuilt.
   %
   %   @error syntax_error(illegal_url) if URL cannot be parsed.
   global_url(URL, _BaseURL, Global) :-
       is_absolute_url(URL),
       \+ sub_atom(URL, _, _, _, '%'),     % escapes go through the parser
       !,
       Global = URL.
   global_url(URL, BaseURL, Global) :-
       sub_atom(URL, 0, _, _, '//'),
       !,
       parse_url(BaseURL, [], BaseParts),
       memberchk(protocol(Scheme), BaseParts),
       atomic_list_concat([Scheme, :, URL], Global).
   global_url(URL, BaseURL, Global) :-
       sub_atom(URL, 0, _, _, #),
       !,
       (   sub_atom(BaseURL, _, _, 0, #)
       ->  sub_atom(URL, 1, _, 0, Fragment),
           atom_concat(BaseURL, Fragment, Global)
       ;   atom_concat(BaseURL, URL, Global)
       ).
   global_url(URL, BaseURL, Global) :-
       parse_url(URL, BaseURL, Parts),
       !,
       phrase(curl(Parts), Codes),
       atom_codes(Global, Codes).
   global_url(URL, _BaseURL, _Global) :-
       throw(error(syntax_error(illegal_url), URL)).

is_absolute_url(+URL): True if URL is an absolute URL. That is, a URL that starts with a protocol identifier.

  %!  is_absolute_url(+URL) is semidet.
  %
  %   True if URL starts with a protocol identifier.  The four most
  %   common prefixes are checked directly on the atom; anything else
  %   is converted to codes and handed to absolute_url//0.
  is_absolute_url(URL) :-
      (   member(Prefix, ['http://', 'https://', 'ftp://', 'file://']),
          sub_atom(URL, 0, _, _, Prefix)
      ->  true
      ;   atom_codes(URL, Codes),
          phrase(absolute_url, Codes, _)
      ->  true
      ).
  125
  126
  127                 /*******************************
  128                 *        CREATE URL/URI        *
  129                 *******************************/

http_location(?Parts, ?Location)

Construct or analyze an HTTP location. This is similar to parse_url/2, but only deals with the location part of an HTTP URL. That is, the path, search and fragment specifiers. In the HTTP protocol, the first line of a message is

<Action> <Location> HTTP/<version>

Arguments:

Location

- Atom or list of character codes.

  %!  http_location(?Parts, ?Location) is semidet.
  %
  %   Convert between a parts list and an HTTP location.
  %
  %     - If Parts is bound, Location is unified with the atom built
  %       by curi//1 (first solution only).
  %     - Else if Location is an atom, it is parsed into Parts.
  %     - Else Location must be a proper list of codes to parse.
  http_location(Parts, Location) :-
      (   nonvar(Parts)                   % Parts --> Location
      ->  once(phrase(curi(Parts), String)),
          atom_codes(Location, String)
      ;   atom(Location)                  % Location --> Parts
      ->  atom_codes(Location, Codes),
          phrase(http_location(Parts), Codes)
      ;   is_list(Location),              % LocationCodes --> Parts
          phrase(http_location(Parts), Location)
      ).
  158
  159
  %!  curl(+Attributes)// is det.
  %
  %   Generate the codes of a URL from Attributes.  If Attributes holds
  %   protocol(Protocol), "Protocol:" is emitted followed by the layout
  %   for that protocol.  Otherwise the http layout is emitted without
  %   a scheme prefix.
  curl(A) -->
      (   { memberchk(protocol(Protocol), A) }
      ->  catomic(Protocol),
          ":",
          curl(Protocol, A)
      ;   curl(http, A)
      ).

  %!  curl(+Protocol, +Attributes)// is det.
  %
  %   Emit the part after "Protocol:".  A file URL is "//" plus the
  %   path.  Every other protocol gets "//", optional user, host,
  %   optional port, path, search and fragment.
  %
  %   The file clause used to read `( "//" -> cpath(A) ; cpath(A) )`.
  %   This grammar is only used to generate, so the literal "//"
  %   always unifies with the unbound output and the else-branch could
  %   never run.  It has been dropped.
  curl(file, A) -->
      !,
      "//",
      cpath(A).
  curl(_, A) -->
      "//",
      cuser(A),
      chost(A),
      cport(A),
      cpath(A),
      csearch(A),
      cfragment(A).

  %!  curi(+Attributes)// is det.
  %
  %   Emit the path and search part of an HTTP location.  The fragment
  %   is not emitted.
  curi(A) -->
      cpath(A),
      csearch(A).
  188
  %!  cpath(+Attributes)// is det.
  %
  %   Emit the percent-encoded path(Path) member of Attributes, or
  %   nothing if there is none.  The characters "/+:," stay unescaped.
  cpath(A) -->
      { memberchk(path(Path), A) },
      !,
      { atom_codes(Path, Codes) },
      www_encode(Codes, [0'/, 0'+, 0':, 0',]).
  cpath(_) -->
      [].

  %!  cuser(+Attributes)// is det.
  %
  %   Emit "User@" for the user(User) member of Attributes, or nothing
  %   if there is none.  The ":" is left unescaped.
  cuser(A) -->
      { memberchk(user(User), A) },
      !,
      { atom_codes(User, Codes) },
      www_encode(Codes, [0':]),
      "@".
  cuser(_) -->
      [].

  %!  chost(+Attributes)// is det.
  %
  %   Emit the percent-encoded host(Host) member of Attributes, or
  %   nothing if there is none.
  chost(A) -->
      { memberchk(host(Host), A) },
      !,
      { atom_codes(Host, Codes) },
      www_encode(Codes, []).
  chost(_) -->
      [].
  210
  %!  cport(+Attributes)// is det.
  %
  %   Emit ":Port" for the port(Port) member of Attributes, or nothing
  %   if there is no port or the port is the implied default.
  %
  %   Port 80 is the default only for http.  It used to be dropped
  %   for every protocol, so e.g. [protocol(https), port(80)] produced
  %   a URL that addresses the https default port instead.  Port 80 is
  %   now omitted only if the protocol is http or no protocol is given
  %   (curl//1 uses the http layout in that case).
  cport(A) -->
      { memberchk(port(Port), A),
        \+ implied_port(A, Port)
      },
      !,
      { number_codes(Port, Codes) },
      ":",
      www_encode(Codes, []).
  cport(_) -->
      [].

  %   implied_port(+Attributes, +Port) is semidet.
  %
  %   True if Port is 80 and the protocol in Attributes is http or
  %   absent.
  implied_port(A, Port) :-
      Port == 80,
      (   memberchk(protocol(Protocol), A)
      ->  Protocol == http
      ;   true
      ).
  218
  219
  %!  catomic(+Atomic)// is det.
  %
  %   Emit the text of Atomic.  Written as a plain predicate: the codes
  %   of Atomic are the difference between Codes0 and Rest.
  catomic(Atomic, Codes0, Rest) :-
      atom_codes(Atomic, Text),
      append(Text, Rest, Codes0).

csearch(+Attributes)//

  %!  csearch(+Attributes)// is det.
  %
  %   Emit "?Name=Value&..." for the search(Parameters) member of
  %   Attributes, or nothing if there is none.
  csearch(A) -->
      { memberchk(search(Parameters), A) },
      !,
      csearch(Parameters, [0'?]).
  csearch(_) -->
      [].

  %!  csearch(+Parameters, +Separator)// is det.
  %
  %   Emit each parameter preceded by a separator: the codes Separator
  %   before the first one and "&" before every later one.
  csearch([], _) -->
      [].
  csearch([Param|Params], Sep) -->
      codes(Sep),
      cparam(Param),
      csearch(Params, [0'&]).

  %!  cparam(+Parameter)// is det.
  %
  %   Emit one parameter.  Accepted forms are Name=Value, a compound
  %   Name(Value), or a bare Name (emitted without "=").  A compound
  %   that does not have exactly one argument makes the rule fail.
  cparam(Param) -->
      (   { Param = (Name=Value) }
      ->  cname(Name),
          "=",
          cvalue(Value)
      ;   { compound(Param) }             % allow to feed Name(Value)
      ->  { Param =.. [Name,Value] },
          cname(Name),
          "=",
          cvalue(Value)
      ;   cname(Param)
      ).

  %!  codes(+Codes)// is det.
  %
  %   Emit the list Codes literally.
  codes(Codes, In, Out) :-
      append(Codes, Out, In).

  %!  cname(+Name)// is det.
  %
  %   Emit the percent-encoded text of Name.
  cname(Name) -->
      { atom_codes(Name, Codes) },
      www_encode(Codes, []).

cvalue(+Value)// is det: Construct a string from Value. Value is either atomic or a code-list.

  %!  cvalue(+Value)// is det.
  %
  %   Emit the percent-encoded text of Value.  Value is either atomic
  %   or a list of codes.
  %
  %   @error type_error(codes, Value) if Value is neither.
  cvalue(Value) -->
      { (   atomic(Value)
        ->  atom_codes(Value, Codes)
        ;   must_be(codes, Value),
            Codes = Value
        )
      },
      www_encode(Codes, []).

cfragment(+Attributes)//

  %!  cfragment(+Attributes)// is det.
  %
  %   Emit "#Fragment" (percent-encoded) for the fragment(Fragment)
  %   member of Attributes, or nothing if there is none.
  cfragment(A) -->
      (   { memberchk(fragment(Frag), A) }
      ->  { atom_codes(Frag, Codes) },
          "#",
          www_encode(Codes, [])
      ;   []
      ).
  291
  292
  293                 /*******************************
  294                 *            PARSING           *
  295                 *******************************/

parse_url(?URL, ?Attributes) is det

Construct or analyse a URL. URL is an atom holding a URL or a variable. Attributes is a list of components. Each component is of the format Name(Value). Defined components are:

protocol(Protocol): The used protocol. This is, after the optional url:, an identifier separated from the remainder of the URL using :. parse_url/2 assumes the http protocol if no protocol is specified and the URL can be parsed as a valid HTTP url. In addition to the RFC-1738 specified protocols, the file protocol is supported as well.
host(Host): Host-name or IP-address on which the resource is located. Supported by all network-based protocols.
port(Port): Integer port-number to access on Host. This only appears if the port is explicitly specified in the URL. Implicit default ports (e.g., 80 for HTTP) do not appear in the part-list.
path(Path): (File-) path addressed by the URL. This is supported for the ftp, http and file protocols. If no path appears, the library generates the path /.
search(ListOfNameValue): Search-specification of HTTP URL. This is the part after the ?, normally used to transfer data from HTML forms that use the GET protocol. In the URL it consists of a www-form-encoded list of Name=Value pairs. This is mapped to a list of Prolog Name=Value terms with decoded names and values.
fragment(Fragment): Fragment specification of HTTP URL. This is the part after the # character.

The example below illustrates all of this for an HTTP URL.

?- parse_url('http://www.xyz.org/hello?msg=Hello+World%21#x',
       P).

P = [ protocol(http),
      host('www.xyz.org'),
      fragment(x),
      search([ msg = 'Hello World!'
             ]),
      path('/hello')
    ]

By instantiating the parts-list this predicate can be used to create a URL.

  %!  parse_url(?URL, ?Attributes) is semidet.
  %
  %   If URL is bound, parse it into Attributes using url//1.
  %   Otherwise build URL as an atom from Attributes using curl//1,
  %   keeping only the first generated text.
  parse_url(URL, Attributes) :-
      (   nonvar(URL)
      ->  atom_codes(URL, Codes),
          phrase(url(Attributes), Codes)
      ;   once(phrase(curl(Attributes), Codes)),
          atom_codes(URL, Codes)
      ).

parse_url(+URL, +BaseURL, -Attributes) is det: Similar to parse_url/2 for relative URLs. If URL is relative, it is resolved using the absolute URL BaseURL.

  %!  parse_url(+URL, +BaseURL, -Attributes) is semidet.
  %!  parse_url(-URL, +BaseURL, +Attributes) is semidet.
  %
  %   As parse_url/2, resolving a relative URL against BaseURL.
  %
  %   Parsing: an absolute URL is parsed on its own.  A relative URL
  %   is resolved by parse_relative_url/3.  Generating (URL unbound):
  %   the path in Attributes, if any, is resolved against the path of
  %   BaseURL, the result is placed in front of both attribute lists
  %   and the URL is built with curl//1.
  parse_url(URL, BaseURL, Attributes) :-
      nonvar(URL),
      !,
      atom_codes(URL, Codes),
      (   phrase(absolute_url, Codes, _)
      ->  phrase(url(Attributes), Codes)
      ;   parse_relative_url(Codes, BaseURL, Attributes)
      ).
  parse_url(URL, BaseURL, Attributes) :-
      parse_url(BaseURL, BaseAttributes),
      memberchk(path(BasePath), BaseAttributes),
      (   memberchk(path(LocalPath), Attributes)
      ->  globalise_path(LocalPath, BasePath, Path)
      ;   Path = BasePath
      ),
      append([path(Path)|Attributes], BaseAttributes, AllAttributes),
      phrase(curl(AllAttributes), Codes),
      atom_codes(URL, Codes).

  %   parse_relative_url(+Codes, +BaseURL, -Attributes) is semidet.
  %
  %   Resolve the relative URL in Codes.  BaseURL is either atomic (and
  %   parsed first) or already a list of attributes.  The base path is
  %   taken out, the base search and fragment are discarded, and the
  %   local path is resolved against the base path.  The cut commits to
  %   the first way of selecting the two path terms.
  parse_relative_url(Codes, BaseURL, Attributes) :-
      (   atomic(BaseURL)
      ->  parse_url(BaseURL, Base0)
      ;   Base0 = BaseURL
      ),
      select(path(BasePath), Base0, Base1),
      delete(Base1, search(_), Base2),
      delete(Base2, fragment(_), Base),
      phrase(relative_uri(Rel0), Codes),
      select(path(LocalPath), Rel0, Rel),
      !,
      globalise_path(LocalPath, BasePath, Path),
      append(Base, [path(Path)|Rel], Attributes).

globalise_path(+LocalPath, +RelativeTo, -FullPath) is det: The first clause deals with the standard URL /... global paths. The second with file://drive:path on MS-Windows. This is a bit of a kludge, but unfortunately common practice is -especially on Windows- not always following the standard.

  %!  globalise_path(+LocalPath, +RelativeTo, -FullPath) is det.
  %
  %   FullPath is LocalPath resolved against the path RelativeTo.  A
  %   LocalPath that starts with "/" or that is_absolute_file_name/1
  %   accepts is returned unchanged.
  globalise_path(LocalPath, _, LocalPath) :-
      (   sub_atom(LocalPath, 0, _, _, /)
      ;   is_absolute_file_name(LocalPath)
      ),
      !.
  globalise_path(Local, Base, Path) :-
      base_dir(Base, BaseDir),
      make_path(BaseDir, Local, Path).

  %!  base_dir(+BasePath, -BaseDir) is det.
  %
  %   BaseDir is BasePath without its trailing "/" if it has one, and
  %   the directory part of BasePath otherwise.
  base_dir(BasePath, BaseDir) :-
      atom_concat(BaseDir, /, BasePath),
      !.
  base_dir(BasePath, BaseDir) :-
      file_directory_name(BasePath, BaseDir).

  %!  make_path(+Dir, +Local, -Path) is det.
  %
  %   Join Dir and Local.  Each leading "../" of Local moves one
  %   directory up, for as long as Dir still has a distinct parent.
  make_path(Dir, Local, Path) :-
      (   atom_concat('../', Rest, Local),
          file_directory_name(Dir, Parent),
          Parent \== Dir
      ->  make_path(Parent, Rest, Path)
      ;   Dir = (/)                       % avoid a doubled slash
      ->  atom_concat(/, Local, Path)
      ;   atomic_list_concat([Dir, /, Local], Path)
      ).

absolute_url//: True if the input describes an absolute URL. This means it starts with a URL schema. We demand a schema of length > 1 to avoid confusion with Windows drive letters.

  %!  absolute_url// is semidet.
  %
  %   True if the input starts with a schema of at least two
  %   characters followed by ":".  A one-letter schema is rejected so
  %   that a Windows drive letter is not mistaken for one.
  absolute_url -->
      lwalpha(_Initial),
      schema_chars(More),
      { More \== [] },
      ":",
      !.
  450
  451
  452                 /*******************************
  453                 *           SEQUENCES          *
  454                 *******************************/
  455
  %!  digits(-Digits)// is det.
  %
  %   Digits is the (possibly empty) list of leading digit codes.
  digits(Digits) -->
      digits(Digits, []).

  %!  digits(-Digits, ?Tail)// is det.
  %
  %   Difference-list version: Digits ends in Tail.  As many digits as
  %   possible are taken.
  digits(Digits, Tail) -->
      (   { Digits = [D|More] },
          digit(D)
      ->  digits(More, Tail)
      ;   { Digits = Tail }
      ).

  %!  digit(?Code)// is semidet.
  %
  %   Match one code for which code_type(Code, digit) holds.
  digit(Code) -->
      [Code],
      { code_type(Code, digit) }.
  468
  469                 /*******************************
  470                 *            RFC-3986          *
  471                 *******************************/

url(-Parts)//

  %!  url(-Parts)// is semidet.
  %
  %   Parse a URL.  If the input starts with "Schema:", Parts holds
  %   protocol(Schema), the hierarchical part and the optional search
  %   and fragment.  Otherwise the input is read as an http URL
  %   without protocol: authority plus path only.
  url([protocol(Schema)|Parts]) -->
      schema(Schema),
      ":",
      !,
      hier_part(Schema, Parts, Tail),
      query_and_fragment(Tail).
  url([protocol(http)|Parts]) -->         % implicit HTTP
      authority(Parts, [path(Path)]),
      path_abempty(Path).

  %!  relative_uri(-Parts)// is semidet.
  %
  %   Parse a relative reference: relative part, optional search and
  %   optional fragment.
  relative_uri(Parts) -->
      relative_part(Parts, Tail),
      query_and_fragment(Tail).

  %!  relative_part(-Parts, ?Tail)// is semidet.
  %
  %   After "//", parse authority and path.  Otherwise parse a single
  %   path, trying absolute, schema-less and empty in that order.
  relative_part(Parts, Tail) -->
      "//",
      !,
      authority(Parts, [path(Path)|Tail]),
      path_abempty(Path).
  relative_part([path(Path)|T], T) -->
      (   path_absolute(Path)
      ->  []
      ;   path_noschema(Path)
      ->  []
      ;   path_empty(Path)
      ).

  %!  http_location(-Parts)// is semidet.
  %
  %   Parse the location of an HTTP request line: path, optional
  %   search and optional fragment.
  http_location([path(Path)|Tail]) -->
      path_abempty(Path),
      query_and_fragment(Tail).

  %   query_and_fragment(-Parts)// is det.
  %
  %   Parts holds the optional search(_) and fragment(_) terms.
  query_and_fragment(Parts) -->
      query(Parts, Tail),
      fragment(Tail, []).

schema(-Atom)//

Schema is case-insensitive and the canonical version is lowercase.

Schema ::= ALPHA *(ALPHA|DIGIT|"+"|"-"|".")

  %!  schema(-Schema)// is semidet.
  %
  %   Parse a schema into the atom Schema: one ASCII letter followed
  %   by any number of schema characters.
  schema(Schema) -->
      lwalpha(First),
      schema_chars(Rest),
      { atom_codes(Schema, [First|Rest]) }.

  %!  schema_chars(-Codes)// is det.
  %
  %   Codes is the longest run of schema characters.
  schema_chars(Codes) -->
      (   { Codes = [H|T] },
          schema_char(H)
      ->  schema_chars(T)
      ;   { Codes = [] }
      ).

  %!  schema_char(-Code)// is semidet.
  %
  %   Match one ASCII letter, digit or one of "+-.".  Letters are
  %   mapped through code_type(Code, to_lower(Char)); other characters
  %   are returned as they are.
  schema_char(H) -->
      [C],
      { C < 128,
        schema_code(C, H)
      }.

  schema_code(C, H) :-
      code_type(C, alpha),
      !,
      code_type(H, to_lower(C)).
  schema_code(C, C) :-
      (   code_type(C, digit)
      ;   schema_extra(C)
      ),
      !.

  %!  schema_extra(?Code) is nondet.
  %
  %   Punctuation allowed in a schema.
  schema_extra(0'+).
  schema_extra(0'-).
  schema_extra(0'.).      % 0'

hier_part(+Schema, -Parts, ?Tail)//

  %!  hier_part(+Schema, -Parts, ?Tail)// is semidet.
  %
  %   Parse what follows "Schema:".  Parts ends in Tail.
  %
  %     - `file`: "//" is required, then a path.  A Windows drive
  %       path is tried first, then absolute, rootless and empty.
  %     - any other schema followed by "//": authority and path.
  %     - otherwise: a path, tried as absolute, rootless, then empty.
  hier_part(file, [path(Path)|Tail], Tail) -->
      !,
      "//",
      (   win_drive_path(Path)
      ->  []
      ;   path_absolute(Path)
      ->  []
      ;   path_rootless(Path)
      ->  []
      ;   path_empty(Path)
      ).
  hier_part(_, Parts, Tail) -->
      (   "//"
      ->  authority(Parts, [path(Path)|Tail]),
          path_abempty(Path)
      ;   { Parts = [path(Path)|Tail] },
          (   path_absolute(Path)
          ->  []
          ;   path_rootless(Path)
          ->  []
          ;   path_empty(Path)
          )
      ).
  568
  %!  authority(-Parts, ?Tail)// is semidet.
  %
  %   Parse "[User@]Host[:Port]".  Parts starts with user(User) only if
  %   the "User@" prefix is present, then host(Host), then what port//2
  %   adds, and ends in Tail.
  authority(Parts, Tail) -->
      (   user_info_chars(UserCodes),
          "@"
      ->  { atom_codes(User, UserCodes),
            Parts = [user(User),host(Host)|Rest]
          }
      ;   { Parts = [host(Host)|Rest] }
      ),
      host(Host),
      port(Rest, Tail).

  %!  user_info_chars(-Codes)// is det.
  %
  %   Codes is the longest run of user-info characters.
  user_info_chars(Codes) -->
      (   { Codes = [H|T] },
          user_info_char(H)
      ->  user_info_chars(T)
      ;   { Codes = [] }
      ).

  %!  user_info_char(-Code)// is semidet.
  %
  %   Any pchar//1 except a literal "@".
  user_info_char(C) -->
      \+ "@",
      pchar(C).
  591
  %!  host(-Host)// is semidet.
  %
  %   Parse a host as the atom Host: an IPv4 address if
  %   ip4_address//1 accepts the input, a registered name otherwise.
  %   IPv6 literals are not handled (TBD).
  %host(Host) --> ip_literal(Host), !.           % TBD: IP6 addresses
  host(Host) -->
      (   ip4_address(Host)
      ->  []
      ;   reg_name(Host)
      ).

  %!  ip4_address(-Atom)// is semidet.
  %
  %   Intended to parse four dot-separated numbers below 256.
  %
  %   NOTE(review): the tails handed to the first three i256_chars//2
  %   calls are already bound to [0'.|_], so the `T = []` test there
  %   cannot succeed, and the dots are never consumed from the input.
  %   As written this rule therefore looks unable to succeed, and
  %   host//1 ends up using reg_name//1.  Confirm before relying on it.
  ip4_address(Atom) -->
      i256_chars(Codes, [0'.|Rest1]),
      i256_chars(Rest1, [0'.|Rest2]),
      i256_chars(Rest2, [0'.|Rest3]),
      i256_chars(Rest3, []),
      { atom_codes(Atom, Codes) }.

  %!  i256_chars(-Codes, ?Tail)// is semidet.
  %
  %   Read digits into Codes (ending in Tail) and check, without
  %   binding anything, that closing Tail with [] leaves a non-empty
  %   number below 256.
  i256_chars(Codes, Tail) -->
      digits(Codes, Tail),
      { \+ \+ below_256(Codes, Tail) }.

  below_256(Codes, Tail) :-
      Tail = [],
      Codes \== [],
      number_codes(I, Codes),
      I < 256.

  %!  reg_name(-Host)// is det.
  %
  %   Host is the atom of the longest run of registered-name
  %   characters.
  reg_name(Host) -->
      reg_name_chars(Codes),
      { atom_codes(Host, Codes) }.

  reg_name_chars(Codes) -->
      (   { Codes = [H|T] },
          reg_name_char(H)
      ->  reg_name_chars(T)
      ;   { Codes = [] }
      ).

  %!  reg_name_char(-Code)// is semidet.
  %
  %   Any pchar//1 whose decoded code is neither ":" nor "@".
  reg_name_char(C) -->
      pchar(C),
      { \+ ( C == 0':
           ; C == 0'@
           )
      }.
  628
  %!  port(-Parts, ?Tail)// is semidet.
  %
  %   If the input continues with ":", at least one digit must follow
  %   and Parts is [port(Port)|Tail] with Port a number.  Otherwise
  %   nothing is consumed and Parts is Tail.
  port(Parts, Tail) -->
      (   { Parts = [port(Port)|Tail] },
          ":"
      ->  digit(First),
          digits(More),
          { number_codes(Port, [First|More]) }
      ;   { Parts = Tail }
      ).
  637
  %!  path_abempty(-Path)// is det.
  %
  %   Parse zero or more "/segment" parts.  An empty path gives '/'.
  path_abempty(Path) -->
      segments_chars(Codes, []),
      { (   Codes == []
        ->  Path = '/'
        ;   atom_codes(Path, Codes)
        )
      }.

  %!  win_drive_path(-Path)// is semidet.
  %
  %   Parse "X:" with X an ASCII letter, an optional "/", and at least
  %   one segment.  Path keeps the drive prefix.
  win_drive_path(Path) -->
      drive_letter(Drive),
      ":",
      (   "/"
      ->  { Codes = [Drive, 0':, 0'/|Segments] }
      ;   { Codes = [Drive, 0':|Segments] }
      ),
      nonempty_segments(Segments),
      { atom_codes(Path, Codes) }.

  %!  path_absolute(-Path)// is semidet.
  %
  %   Parse "/" followed by a non-empty first segment and any further
  %   segments.
  path_absolute(Path) -->
      "/",
      nonempty_segments(Segments),
      { atom_codes(Path, [0'/|Segments]) }.

  %!  path_noschema(-Path)// is semidet.
  %
  %   Parse a path whose first segment holds no ":".
  path_noschema(Path) -->
      segment_nz_nc_chars(Codes, Rest),
      segments_chars(Rest, []),
      { atom_codes(Path, Codes) }.

  %!  path_rootless(-Path)// is semidet.
  %
  %   Parse a path that starts with a non-empty segment.
  path_rootless(Path) -->
      nonempty_segments(Segments),
      { atom_codes(Path, Segments) }.

  %!  path_empty(-Path)// is det.
  %
  %   Consume nothing; the path is '/'.
  path_empty('/') -->
      [].

  %   nonempty_segments(-Codes)// is semidet.
  %
  %   A non-empty segment followed by zero or more "/segment" parts.
  nonempty_segments(Codes) -->
      segment_nz_chars(Codes, Rest),
      segments_chars(Rest, []).
  676
  %!  segments_chars(-Codes, ?Tail)// is det.
  %
  %   Parse zero or more "/segment" parts into Codes, ending in Tail.
  segments_chars(Codes, Tail) -->
      (   { Codes = [0'/|Segment] },
          "/"
      ->  segment_chars(Segment, Rest),
          segments_chars(Rest, Tail)
      ;   { Codes = Tail }
      ).

  %!  segment_chars(-Codes, ?Tail)// is det.
  %
  %   Codes, ending in Tail, is the longest run of pchar//1 codes.
  segment_chars(Codes, Tail) -->
      (   { Codes = [H|More] },
          pchar(H)
      ->  segment_chars(More, Tail)
      ;   { Codes = Tail }
      ).

  %!  segment_nz_chars(-Codes, ?Tail)// is semidet.
  %
  %   As segment_chars//2, but at least one character is required.
  segment_nz_chars([H|More], Tail) -->
      pchar(H),
      segment_chars(More, Tail).

  %!  segment_nz_nc_chars(-Codes, ?Tail)// is det.
  %
  %   Longest run of pchar//1 codes that stops at a literal ":".
  %   Despite the name, an empty run is accepted.
  segment_nz_nc_chars(Codes, Tail) -->
      (   { Codes = [H|More] },
          segment_nz_nc_char(H)
      ->  segment_nz_nc_chars(More, Tail)
      ;   { Codes = Tail }
      ).

  %!  segment_nz_nc_char(-Code)// is semidet.
  %
  %   Any pchar//1 except a literal ":".
  segment_nz_nc_char(C) -->
      \+ ":",
      pchar(C).

query(-Parts, ?Tail)// is det: Extract &Name=Value, ...

  %!  query(-Parts, ?Tail)// is det.
  %
  %   If the input continues with "?", Parts is [search(Params)|Tail].
  %   Otherwise nothing is consumed and Parts is Tail.
  query(Parts, Tail) -->
      (   { Parts = [search(Params)|Tail] },
          "?"
      ->  search(Params)
      ;   { Parts = Tail }
      ).

  %!  search(-Parameters)// is det.
  %
  %   Parse parameters separated by search_sep//0.
  search(Params) -->
      (   { Params = [Param|More] },
          parameter(Param)
      ->  (   search_sep
          ->  search(More)
          ;   { More = [] }
          )
      ;   { Params = [] }
      ).

  %!  parameter(-Param)// is det.
  %
  %   Parse one parameter: Name=Value if an "=" follows the name, the
  %   bare atom Name otherwise.
  parameter(Param) -->
      search_chars(NameCodes),
      { atom_codes(Name, NameCodes) },
      (   "="
      ->  search_value_chars(ValueCodes),
          { atom_codes(Value, ValueCodes),
            Param = (Name = Value)
          }
      ;   { Param = Name }
      ).
  741
  %!  search_chars(-Codes)// is det.
  %
  %   Codes is the longest run of parameter-name characters.
  search_chars(Codes) -->
      (   { Codes = [C|T] },
          search_char(C)
      ->  search_chars(T)
      ;   { Codes = [] }
      ).

  %!  search_char(-Code)// is semidet.
  %
  %   A fragment_char//1 that is neither a separator nor "=".
  search_char(C) -->
      \+ search_sep,
      \+ "=",
      fragment_char(C).

  %!  search_value_chars(-Codes)// is det.
  %
  %   Codes is the longest run of parameter-value characters.
  search_value_chars(Codes) -->
      (   { Codes = [C|T] },
          search_value_char(C)
      ->  search_value_chars(T)
      ;   { Codes = [] }
      ).

  %!  search_value_char(-Code)// is semidet.
  %
  %   A fragment_char//1 that is not a separator; "=" is allowed.
  search_value_char(C) -->
      \+ search_sep,
      fragment_char(C).

search_sep// is semidet

Matches a search-parameter separator. Traditionally, this is the &-char, but these days there are `newstyle' ;-char separators.

See also: - http://perldoc.perl.org/CGI.html
To be done: - This should be configurable

  %!  search_sep// is semidet.
  %
  %   Match one parameter separator: "&" or ";".
  search_sep -->
      (   "&"
      ->  []
      ;   ";"
      ).

fragment(-Fragment, ?Tail)//: Extract the fragment (the part after the # character)

  %!  fragment(-Parts, ?Tail)// is det.
  %
  %   If the input continues with "#", Parts is
  %   [fragment(Fragment)|Tail].  Otherwise nothing is consumed and
  %   Parts is Tail.
  fragment(Parts, Tail) -->
      (   { Parts = [fragment(Fragment)|Tail] },
          "#"
      ->  fragment_chars(Codes),
          { atom_codes(Fragment, Codes) }
      ;   { Parts = Tail }
      ).

  %!  fragment_chars(-Codes)// is det.
  %
  %   Codes is the longest run of fragment characters.
  fragment_chars(Codes) -->
      (   { Codes = [H|T] },
          fragment_char(H)
      ->  fragment_chars(T)
      ;   { Codes = [] }
      ).

fragment_char(-Char): Find a fragment character.

  %!  fragment_char(-Code)// is semidet.
  %
  %   Match one fragment character: any pchar//1, or one of "/", "?",
  %   "[" and "]".  The brackets are accepted although RFC 3986 does
  %   not allow them.
  fragment_char(C) -->
      pchar(C),
      !.
  fragment_char(C) -->
      [C],
      { fragment_extra(C) },
      !.

  fragment_extra(0'/).
  fragment_extra(0'?).
  fragment_extra(0'[).                    % Not according to RFC 3986!
  fragment_extra(0']).
  804
  805
  806                 /*******************************
  807                 *      CHARACTER CLASSES       *
  808                 *******************************/

pchar(-Code)//: unreserved|pct_encoded|sub_delim|":"|"@"
Performs UTF-8 decoding of percent encoded strings.

  %!  pchar(-Code)// is semidet.
  %
  %   Match one path character and return its decoded code:
  %
  %     - "+" is returned as a space;
  %     - an unreserved character, a sub-delimiter, ":" or "@" is
  %       returned as it is;
  %     - anything else must be a percent escape, decoded by
  %       percent_coded//1.
  pchar(0'\s) -->
      "+",
      !.
  pchar(C) -->
      [C],
      { literal_pchar(C) },
      !.
  pchar(C) -->
      percent_coded(C).

  literal_pchar(C) :-
      unreserved(C).
  literal_pchar(C) :-
      sub_delim(C).
  literal_pchar(C) :-
      (   C == 0':
      ;   C == 0'@
      ).

lwalpha(-C)//: Demand alpha, return as lowercase

  %!  lwalpha(-Code)// is semidet.
  %
  %   Match one ASCII character of type `alpha` and map it through
  %   code_type(Code, to_lower(Char)).
  lwalpha(Lower) -->
      [Char],
      { ascii_alpha(Char),
        code_type(Lower, to_lower(Char))
      }.

  %!  drive_letter(-Code)// is semidet.
  %
  %   Match one ASCII character of type `alpha`, returned unchanged.
  drive_letter(Char) -->
      [Char],
      { ascii_alpha(Char) }.

  ascii_alpha(Char) :-
      Char < 128,
      code_type(Char, alpha).
  844
  845
  846                 /*******************************
  847                 *      RESERVED CHARACTERS     *
  848                 *******************************/

sub_delim(?Code): Sub-delimiters

  %!  sub_delim(?Code) is nondet.
  %
  %   Code is one of the RFC 3986 sub-delimiters "!$&'()*+,;=".
  %   Kept as a fact table so that a call with a bound Code is
  %   resolved by first-argument indexing.
  sub_delim(0'!).
  sub_delim(0'$).
  sub_delim(0'&).
  sub_delim(0'').
  sub_delim(0'().
  sub_delim(0')).
  sub_delim(0'*).
  sub_delim(0'+).
  sub_delim(0',).
  sub_delim(0';).
  sub_delim(0'=).

unreserved(+C): Characters that can be represented without percent escaping RFC 3986, section 2.3

  %!  unreserved(+Code) is semidet.
  %
  %   Code can be written without percent escaping (RFC 3986, section
  %   2.3).  The facts are generated when the file is loaded: the
  %   placeholder term unreserved(map) below is replaced by one
  %   unreserved(Code) fact for every answer of unreserved_/1.
  term_expansion(unreserved(map), Facts) :-
      findall(unreserved(Code), unreserved_(Code), Facts).

  %   unreserved_(-Code) is nondet.
  %
  %   Enumerate the codes in 1..128 of type `alnum`, then "-._~".
  unreserved_(Code) :-
      (   between(1, 128, Code),
          code_type(Code, alnum)
      ;   member(Code, [0'-, 0'., 0'_, 0'~])
      ).

  unreserved(map).                        % Expanded
  884
  885
  886                 /*******************************
  887                 *              FORMS           *
  888                 *******************************/
  889
  890/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  891Encoding/decoding of form-fields  using   the  popular  www-form-encoded
  892encoding used with the HTTP GET.
  893- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

www_form_encode(+Value, -XWWWFormEncoded) is det

www_form_encode(-Value, +XWWWFormEncoded) is det

En/decode to/from application/x-www-form-urlencoded. Encoding percent-encodes every character except the RFC 3986 unreserved ones, i.e. ASCII alnum (see code_type/2) and the characters "-._~". Newline is mapped to %0D%0A. When decoding, %0D%0A appears as a single newline (10) character.

Note that a space is encoded as %20 instead of +. Decoding decodes both to a space.

deprecated: - Use uri_encoded/3 for new code.

  %!  www_form_encode(+Value, -Encoded) is det.
  %!  www_form_encode(-Value, +Encoded) is det.
  %
  %   If Value is atomic, Encoded is its percent-encoded form.
  %   Otherwise Encoded is decoded into the atom Value.
  www_form_encode(Value, Encoded) :-
      (   atomic(Value)
      ->  atom_codes(Value, Plain),
          phrase(www_encode(Plain, []), Escaped),
          atom_codes(Encoded, Escaped)
      ;   atom_codes(Encoded, Escaped),
          phrase(www_decode(Plain), Escaped),
          atom_codes(Value, Plain)
      ).

www_encode(+Codes, +ExtraUnescaped)//

  %!  www_encode(+Codes, +ExtraUnescaped)// is det.
  %
  %   Emit Codes percent-encoded.  A newline, written as "\r\n" or as a
  %   bare "\n", is emitted as "%0D%0A".  Codes in ExtraUnescaped are
  %   emitted as they are, in addition to the unreserved ones.
  www_encode(Codes, Extra) -->
      { newline_prefix(Codes, Rest) },
      !,
      "%0D%0A",
      www_encode(Rest, Extra).
  www_encode([Code|Rest], Extra) -->
      percent_encode(Code, Extra),
      www_encode(Rest, Extra).
  www_encode([], _) -->
      [].

  %   newline_prefix(+Codes, -Rest) is semidet.
  %
  %   Codes starts with "\r\n" or "\n"; Rest is what follows.
  newline_prefix([0'\r, 0'\n|Rest], Rest) :-
      !.
  newline_prefix([0'\n|Rest], Rest).
  935
  %!  percent_encode(+Code, +ExtraUnescaped)// is det.
  %
  %   Emit Code.  The first matching rule applies:
  %
  %     1. unreserved, or listed in ExtraUnescaped: emitted as it is;
  %     2. up to 127: one %XX escape;
  %     3. the `url_encoding` flag is `utf8`: one %XX escape for each
  %        byte of the UTF-8 encoding;
  %     4. up to 255: one %XX escape;
  %     5. otherwise a representation error is raised.
  %
  %   A space is written as %20.  Writing it as "+" is deprecated and
  %   not done here.
  %
  %   @error representation_error(url_character)
  percent_encode(C, Extra) -->
      (   { unreserved(C)
          ; memberchk(C, Extra)
          }
      ->  [C]
      ;   { C =< 127 }
      ->  percent_byte(C)
      ;   { current_prolog_flag(url_encoding, utf8) }
      ->  { phrase(utf8_codes([C]), Bytes) },     % Unicode characters
          percent_bytes(Bytes)
      ;   { C =< 255 }
      ->  percent_byte(C)
      ;   { representation_error(url_character) }
      ).
  962
  %!  percent_bytes(+Bytes)// is det.
  %
  %   Emit one %XX escape for every element of Bytes.
  percent_bytes([]) -->
      [].
  percent_bytes([Byte|Bytes]) -->
      percent_byte(Byte),
      percent_bytes(Bytes).

  %!  percent_byte(?Byte)// is semidet.
  %
  %   Relate Byte to a "%XX" escape.  If Byte is bound, the two hex
  %   digits are derived from it.  Otherwise Byte is computed from the
  %   two hex digits found in the input.
  percent_byte(Byte) -->
      [0'%, Hi, Lo],
      { (   nonvar(Byte)
        ->  HiValue is (Byte >> 4) /\ 0xf,
            LoValue is Byte /\ 0xf,
            code_type(Hi, xdigit(HiValue)),
            code_type(Lo, xdigit(LoValue))
        ;   code_type(Hi, xdigit(HiValue)),
            code_type(Lo, xdigit(LoValue)),
            Byte is (HiValue << 4) + LoValue
        )
      }.
  980
  %!  percent_coded(-Code)// is semidet.
  %
  %   Read one percent escape and decode it:
  %
  %     - %0D directly followed by %0A (or %0a) gives a newline (10);
  %     - a byte of 0xC0 or more, together with the continuation
  %       escapes that follow, gives the code that utf8_codes//1
  %       decodes from those bytes;
  %     - anything else gives the byte itself.
  percent_coded(C) -->
      percent_byte(Byte),
      !,
      decoded_byte(Byte, C).

  decoded_byte(13, C) -->                 % %0D%0A --> \n
      "%0",
      ( "A" ; "a" ),
      !,
      { C = 10 }.
  decoded_byte(Lead, C) -->
      { Lead >= 0xc0 },                   % UTF-8 lead-in
      utf8_cont(Cont),
      { phrase(utf8_codes([C]), [Lead|Cont]) },
      !.
  decoded_byte(Byte, Byte) -->
      [].

www_decode(-Codes)//

 %!  www_decode(-Codes)// is det.
 %
 %   Decode form-encoded input into Codes.  "+" becomes a space, a
 %   percent escape is decoded by percent_coded//1 and any other
 %   character is copied.
 www_decode(Codes) -->
     (   { Codes = [C|T] },
         decoded_code(C)
     ->  www_decode(T)
     ;   { Codes = [] }
     ).

 decoded_code(0'\s) -->
     "+",
     !.
 decoded_code(C) -->
     percent_coded(C),
     !.
 decoded_code(C) -->
     [C].

 %!  utf8_cont(-Bytes)// is det.
 %
 %   Bytes is the longest run of percent escapes whose value lies in
 %   0x80..0xBF.
 utf8_cont(Bytes) -->
     (   { Bytes = [H|T] },
         percent_byte(H),
         { between(0x80, 0xbf, H) }
     ->  utf8_cont(T)
     ;   { Bytes = [] }
     ).

set_url_encoding(?Old, +New) is semidet

Query and set the encoding for URLs. The default is utf8. The only other defined value is iso_latin_1.

To be done: - Having a global flag is highly inconvenient, but a work-around for old sites using ISO Latin 1 encoding.

 %   The Prolog flag `url_encoding` holds the encoding used for
 %   characters above 127.  It is created with the value `utf8`.
 :- create_prolog_flag(url_encoding, utf8, [type(atom)]).

 %!  set_url_encoding(?Old, +New) is semidet.
 %
 %   Unify Old with the current encoding and make New the current
 %   encoding.  If New equals the current value, nothing is checked or
 %   changed.
 %
 %   @error type/domain error from must_be/2 if New is neither `utf8`
 %          nor `iso_latin_1`.
 set_url_encoding(Old, New) :-
     current_prolog_flag(url_encoding, Old),
     change_url_encoding(Old, New).

 change_url_encoding(Old, New) :-
     Old == New,
     !.
 change_url_encoding(_, New) :-
     must_be(oneof([utf8, iso_latin_1]), New),
     set_prolog_flag(url_encoding, New).
 1041
 1042
 1043                 /*******************************
 1044                 *       IRI PROCESSING         *
 1045                 *******************************/

url_iri(+Encoded, -Decoded) is det
url_iri(-Encoded, +Decoded) is det: Convert between a URL, encoding in US-ASCII and an IRI. An IRI is a fully expanded Unicode string. Unicode strings are first encoded into UTF-8, after which %-encoding takes place.

 %!  url_iri(+Encoded, -Decoded) is det.
 %!  url_iri(-Encoded, +Decoded) is det.
 %
 %   Convert between a percent-encoded URL and an IRI.
 %
 %   Decoding (Encoded bound): if Encoded holds no '%' it is returned
 %   unchanged.  Otherwise the escapes are replaced by their bytes and
 %   the byte sequence is decoded as UTF-8.  Encoding (Encoded
 %   unbound): the IRI is percent-encoded, leaving "/:?#&=" unescaped.
 url_iri(Encoded, Decoded) :-
     (   nonvar(Encoded)
     ->  (   sub_atom(Encoded, _, _, _, '%')
         ->  atom_codes(Encoded, Codes),
             unescape_precent(Codes, UTF8),
             phrase(utf8_codes(Unicodes), UTF8),
             atom_codes(Decoded, Unicodes)
         ;   Decoded = Encoded
         )
     ;   atom_codes(Decoded, IRICodes),
         atom_codes('/:?#&=', Unescaped),
         phrase(www_encode(IRICodes, Unescaped), UrlCodes),
         atom_codes(Encoded, UrlCodes)
     ).
 1069
 1070
 %!  unescape_precent(+Codes, -Bytes) is det.
 %
 %   Replace every "%XX" escape in Codes by the byte it denotes and
 %   copy all other codes.
 %
 %   The cut now comes after the two hex-digit checks.  Before, a '%'
 %   followed by two characters that are not hex digits made the
 %   predicate (and with it url_iri/2) fail, whereas a '%' with fewer
 %   than two characters after it was copied.  Both kinds of invalid
 %   escape are now copied literally.
 unescape_precent([], []).
 unescape_precent([0'%,C1,C2|T0], [H|T]) :-     %'
     code_type(C1, xdigit(D1)),
     code_type(C2, xdigit(D2)),
     !,
     H is D1*16 + D2,
     unescape_precent(T0, T).
 unescape_precent([H|T0], [H|T]) :-
     unescape_precent(T0, T).
 1080
 1081
 1082                 /*******************************
 1083                 *           FORM DATA          *
 1084                 *******************************/

parse_url_search(?Spec, ?Fields:list(Name=Value)) is det: Construct or analyze an HTTP search specification. This deals with form data using the MIME-type application/x-www-form-urlencoded as used in HTTP GET requests.

 %!  parse_url_search(?Spec, ?Fields:list(Name=Value)) is det.
 %
 %   Convert between an HTTP search specification and a list of
 %   fields.
 %
 %     - Spec is a proper list: it is parsed as a list of codes.
 %     - Spec is atomic: its text is parsed.
 %     - Otherwise Fields must be a list and Spec is unified with the
 %       codes generated from it (without a leading "?").
 %
 %   The list test now comes first.  In SWI-Prolog 7 the empty list is
 %   atomic, so with the atomic test first an empty code list was
 %   converted by atom_codes/2 to the text "[]" and parsed as a
 %   parameter named '[]'.  It is now parsed as empty input, giving
 %   the same result as the empty atom.
 %
 %   @error type_error(list, Fields) in generation mode.
 parse_url_search(Spec, Fields) :-
     is_list(Spec),
     !,
     phrase(search(Fields), Spec).
 parse_url_search(Spec, Fields) :-
     atomic(Spec),
     !,
     atom_codes(Spec, Codes),
     phrase(search(Fields), Codes).
 parse_url_search(Codes, Fields) :-
     must_be(list, Fields),
     phrase(csearch(Fields, []), Codes).
 1105
 1106
 1107                 /*******************************
 1108                 *          FILE URLs           *
 1109                 *******************************/

file_name_to_url(+File, -URL) is det

file_name_to_url(-File, +URL) is semidet

Translate between a filename and a file:// URL.

To be done: - Current implementation does not deal with paths that need special encoding.

 1119file_name_to_url(File, FileURL) :-
 1120    nonvar(File),
 1121    !,
 1122    absolute_file_name(File, Path),
 1123    atom_concat('file://', Path, FileURL),
 1124    !.
 1125file_name_to_url(File, FileURL) :-
 1126    atom_concat('file://', File, FileURL),
 1127    !