/*  Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (c)  2000-2015, University of Amsterdam
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

:- module(url,
          [ parse_url/2,                % +URL, -Parts | -URL +Parts
            parse_url/3,                % +URL|URI, +BaseURL, -Parts
                                        % -URL, +BaseURL, +Parts
            is_absolute_url/1,          % +URL
            global_url/3,               % +Local, +Base, -Global
            http_location/2,            % ?Parts, ?Location
            www_form_encode/2,          % Value <-> Encoded
            parse_url_search/2,         % Form-data <-> Form fields

            url_iri/2,                  % ?URL, ?IRI

            file_name_to_url/2,         % ?FileName, ?URL

            set_url_encoding/2          % ?Old, +New
          ]).
:- use_module(library(lists)).
:- use_module(library(error)).
:- use_module(library(utf8)).

/** <module> Analysing and constructing URL

This library deals with the analysis and construction of a URL,
Universal Resource Locator. URL is the basis for communicating locations
of resources (data) on the web. A URL consists of a protocol identifier
(e.g. HTTP, FTP, and a protocol-specific syntax further defining the
location. URLs are standardized in RFC-1738.

The implementation in this library covers only a small portion of the
defined protocols.  Though the initial implementation followed RFC-1738
strictly, the current is more relaxed to deal with frequent violations
of the standard encountered in practical use.

@author Jan Wielemaker
@author Lukas Faulstich
@deprecated New code should use library(uri), provided by the =clib=
            package.
*/

                 /*******************************
                 *            GLOBALISE         *
                 *******************************/

%!  global_url(+URL, +Base, -Global) is det.
%
%   Translate a possibly relative URL  into   an  absolute  one.
%
%   @error syntax_error(illegal_url) if URL is not legal.

global_url(URL, BaseURL, Global) :-
    (   is_absolute_url(URL),
        \+ sub_atom(URL, _, _, _, '%')      % may have escape, use general
    ->  Global = URL
    ;   sub_atom(URL, 0, _, _, '//')
    ->  parse_url(BaseURL, [], Attributes),
        memberchk(protocol(Proto), Attributes),
        atomic_list_concat([Proto, :, URL], Global)
    ;   sub_atom(URL, 0, _, _, #)
    ->  (   sub_atom(BaseURL, _, _, 0, #)
        ->  sub_atom(URL, 1, _, 0, NoHash),
            atom_concat(BaseURL, NoHash, Global)
        ;   atom_concat(BaseURL, URL, Global)
        )
    ;   parse_url(URL, BaseURL, Attributes)
    ->  phrase(curl(Attributes), Chars),
        atom_codes(Global, Chars)
    ;   throw(error(syntax_error(illegal_url), URL))
    ).

%!  is_absolute_url(+URL)
%
%   True if URL is an absolute URL. That  is, a URL that starts with
%   a protocol identifier.

is_absolute_url(URL) :-
    sub_atom(URL, 0, _, _, 'http://'),
    !.
is_absolute_url(URL) :-
    sub_atom(URL, 0, _, _, 'https://'),
    !.
is_absolute_url(URL) :-
    sub_atom(URL, 0, _, _, 'ftp://'),
    !.
is_absolute_url(URL) :-
    sub_atom(URL, 0, _, _, 'file://'),
    !.
is_absolute_url(URL) :-
    atom_codes(URL, Codes),
    phrase(absolute_url, Codes, _),
    !.


                 /*******************************
                 *        CREATE URL/URI        *
                 *******************************/

%!  http_location(?Parts, ?Location)
%
%   Construct or analyze an  HTTP  location.   This  is  similar  to
%   parse_url/2, but only deals with the   location  part of an HTTP
%   URL. That is, the path, search   and fragment specifiers. In the
%   HTTP protocol, the first line of a message is
%
%       ==
%       <Action> <Location> HTTP/<version>
%       ==
%
%   @param Location Atom or list of character codes.

http_location(Parts, Location) :-       % Parts --> Location
    nonvar(Parts),
    !,
    phrase(curi(Parts), String),
    !,
    atom_codes(Location, String).
http_location(Parts, Location) :-       % Location --> Parts
    atom(Location),
    !,
    atom_codes(Location, Codes),
    phrase(http_location(Parts), Codes).
http_location(Parts, Codes) :-          % LocationCodes --> Parts
    is_list(Codes),
    phrase(http_location(Parts), Codes).


curl(A) -->
    { memberchk(protocol(Protocol), A)
    },
    !,
    catomic(Protocol),
    ":",
    curl(Protocol, A).
curl(A) -->
    curl(http, A).

curl(file, A) -->
    !,
    (   "//"
    ->  cpath(A)
    ;   cpath(A)
    ).
curl(_, A) -->
    "//",
    cuser(A),
    chost(A),
    cport(A),
    cpath(A),
    csearch(A),
    cfragment(A).

curi(A) -->
    cpath(A),
    csearch(A).

cpath(A) -->
    (   { memberchk(path(Path), A) }
    ->  { atom_codes(Path, Codes) },
        www_encode(Codes, [0'/, 0'+, 0':, 0',])
    ;   ""
    ).

cuser(A) -->
    (   { memberchk(user(User), A) }
    ->  { atom_codes(User, Codes) },
        www_encode(Codes, [0':]),
        "@"
    ;   ""
    ).

chost(A) -->
    (   { memberchk(host(Host), A) }
    ->  { atom_codes(Host, Codes) },
        www_encode(Codes, [])
    ;   ""
    ).

cport(A) -->
    (   { memberchk(port(Port), A), Port \== 80 }
    ->  { number_codes(Port, Codes) },
        ":",
        www_encode(Codes, [])
    ;   ""
    ).


catomic(A, In, Out) :-
    atom_codes(A, Codes),
    append(Codes, Out, In).

%!  csearch(+Attributes)//

csearch(A)-->
    (   { memberchk(search(Parameters), A) }
    ->  csearch(Parameters, [0'?])
    ;   []
    ).

csearch([], _) -->
    [].
csearch([Parameter|Parameters], Sep) -->
    !,
    codes(Sep),
    cparam(Parameter),
    csearch(Parameters, [0'&]).

cparam(Name=Value) -->
    !,
    cname(Name),
    "=",
    cvalue(Value).
cparam(NameValue) -->                   % allow to feed Name(Value)
    { compound(NameValue),
      !,
      NameValue =.. [Name,Value]
    },
    cname(Name),
    "=",
    cvalue(Value).
cparam(Name)-->
    cname(Name).

codes([]) --> [].
codes([H|T]) --> [H], codes(T).

cname(Atom) -->
    { atom_codes(Atom, Codes) },
    www_encode(Codes, []).

%!  cvalue(+Value)// is det.
%
%   Construct a string from  Value.  Value   is  either  atomic or a
%   code-list.

cvalue(Value) -->
    { atomic(Value),
      !,
      atom_codes(Value, Codes)
    },
    www_encode(Codes, []).
cvalue(Codes) -->
    { must_be(codes, Codes)
    },
    www_encode(Codes, []).


%!  cfragment(+Attributes)//

cfragment(A) -->
    { memberchk(fragment(Frag), A),
      !,
      atom_codes(Frag, Codes)
    },
    "#",
    www_encode(Codes, []).
cfragment(_) -->
    "".


                 /*******************************
                 *            PARSING           *
                 *******************************/

%!  parse_url(?URL, ?Attributes) is det.
%
%   Construct or analyse a URL. URL is an   atom  holding a URL or a
%   variable. Attributes is a list of  components. Each component is
%   of the format Name(Value). Defined components are:
%
%       * protocol(Protocol)
%       The used protocol. This is, after  the optional =|url:|=, an
%       identifier separated from the remainder of  the URL using :.
%       parse_url/2 assumes the =http= protocol   if  no protocol is
%       specified and the URL can be parsed  as a valid HTTP url. In
%       addition to the RFC-1738  specified   protocols,  the =file=
%       protocol is supported as well.
%
%       * host(Host)
%       Host-name or IP-address on which   the  resource is located.
%       Supported by all network-based protocols.
%
%       * port(Port)
%       Integer port-number to access on   the \arg{Host}. This only
%       appears if the port is  explicitly   specified  in  the URL.
%       Implicit default ports (e.g., 80 for   HTTP) do _not_ appear
%       in the part-list.
%
%       * path(Path)
%       (File-) path addressed by the URL. This is supported for the
%       =ftp=, =http= and =file= protocols. If  no path appears, the
%       library generates the path =|/|=.
%
%       * search(ListOfNameValue)
%       Search-specification of HTTP URL. This is the part after the
%       =|?|=, normally used to transfer data   from HTML forms that
%       use the =GET=  protocol.  In  the   URL  it  consists  of  a
%       www-form-encoded list of Name=Value pairs. This is mapped to
%       a list of Prolog Name=Value  terms   with  decoded names and
%       values.
%
%       * fragment(Fragment)
%       Fragment specification of HTTP URL. This   is the part after
%       the =|#|= character.
%
%   The example below illustrates all of this for an HTTP URL.
%
%       ==
%       ?- parse_url('http://www.xyz.org/hello?msg=Hello+World%21#x',
%              P).
%
%       P = [ protocol(http),
%             host('www.xyz.org'),
%             fragment(x),
%             search([ msg = 'Hello World!'
%                    ]),
%             path('/hello')
%           ]
%       ==
%
%   By instantiating the parts-list this predicate   can  be used to
%   create a URL.

parse_url(URL, Attributes) :-
    nonvar(URL),
    !,
    atom_codes(URL, Codes),
    phrase(url(Attributes), Codes).
parse_url(URL, Attributes) :-
    phrase(curl(Attributes), Codes),
    !,
    atom_codes(URL, Codes).

%!  parse_url(+URL, +BaseURL, -Attributes) is det.
%
%   Similar to parse_url/2 for relative URLs.  If URL is relative,
%   it is resolved using the absolute URL BaseURL.

parse_url(URL, BaseURL, Attributes) :-
    nonvar(URL),
    !,
    atom_codes(URL, Codes),
    (   phrase(absolute_url, Codes, _)
    ->  phrase(url(Attributes), Codes)
    ;   (   atomic(BaseURL)
        ->  parse_url(BaseURL, BaseA0)
        ;   BaseA0 = BaseURL
        ),
        select(path(BasePath), BaseA0, BaseA1),
        delete(BaseA1, search(_), BaseA2),
        delete(BaseA2, fragment(_), BaseA3),
        phrase(relative_uri(URIA0), Codes),
        select(path(LocalPath), URIA0, URIA1),
        !,
        globalise_path(LocalPath, BasePath, Path),
        append(BaseA3, [path(Path)|URIA1], Attributes)
    ).
parse_url(URL, BaseURL, Attributes) :-
    parse_url(BaseURL, BaseAttributes),
    memberchk(path(BasePath), BaseAttributes),
    (   memberchk(path(LocalPath), Attributes)
    ->  globalise_path(LocalPath, BasePath, Path)
    ;   Path = BasePath
    ),
    append([path(Path)|Attributes], BaseAttributes, GlobalAttributes),
    phrase(curl(GlobalAttributes), Chars),
    atom_codes(URL, Chars).


%!  globalise_path(+LocalPath, +RelativeTo, -FullPath) is det.
%
%   The first clause deals with the  standard URL /... global paths.
%   The second with file://drive:path on MS-Windows.   This is a bit
%   of a cludge, but unfortunately common practice is -especially on
%   Windows- not always following the standard

globalise_path(LocalPath, _, LocalPath) :-
    sub_atom(LocalPath, 0, _, _, /),
    !.
globalise_path(LocalPath, _, LocalPath) :-
    is_absolute_file_name(LocalPath),
    !.
globalise_path(Local, Base, Path) :-
    base_dir(Base, BaseDir),
    make_path(BaseDir, Local, Path).

base_dir(BasePath, BaseDir) :-
    (   atom_concat(BaseDir, /, BasePath)
    ->  true
    ;   file_directory_name(BasePath, BaseDir)
    ).

make_path(Dir, Local, Path) :-
    atom_concat('../', L2, Local),
    file_directory_name(Dir, Parent),
    Parent \== Dir,
    !,
    make_path(Parent, L2, Path).
make_path(/, Local, Path) :-
    !,
    atom_concat(/, Local, Path).
make_path(Dir, Local, Path) :-
    atomic_list_concat([Dir, /, Local], Path).


%!  absolute_url//
%
%   True if the input  describes  an   absolute  URL.  This means it
%   starts with a URL schema. We demand a   schema  of length > 1 to
%   avoid confusion with Windows drive letters.

absolute_url -->
    lwalpha(_First),
    schema_chars(Rest),
    { Rest \== [] },
    ":",
    !.


                 /*******************************
                 *           SEQUENCES          *
                 *******************************/

digits(L) -->
    digits(L, []).

digits([C|T0], T) -->
    digit(C),
    !,
    digits(T0, T).
digits(T, T) -->
    [].


digit(C, [C|T], T) :- code_type(C, digit).

                 /*******************************
                 *            RFC-3986          *
                 *******************************/

%!  uri(-Parts)//

url([protocol(Schema)|Parts]) -->
    schema(Schema),
    ":",
    !,
    hier_part(Schema, Parts, P2),
    query(P2, P3),
    fragment(P3, []).
url([protocol(http)|Parts]) -->         % implicit HTTP
    authority(Parts, [path(Path)]),
    path_abempty(Path).

relative_uri(Parts) -->
    relative_part(Parts, P2),
    query(P2, P3),
    fragment(P3, []).

relative_part(Parts, Tail) -->
    "//",
    !,
    authority(Parts, [path(Path)|Tail]),
    path_abempty(Path).
relative_part([path(Path)|T], T) -->
    (   path_absolute(Path)
    ;   path_noschema(Path)
    ;   path_empty(Path)
    ),
    !.

http_location([path(Path)|P2]) -->
    path_abempty(Path),
    query(P2, P3),
    fragment(P3, []).

%!  schema(-Atom)//
%
%   Schema  is  case-insensitive  and  the    canonical  version  is
%   lowercase.
%
%   ==
%   Schema ::= ALPHA *(ALPHA|DIGIT|"+"|"-"|".")
%   ==

schema(Schema) -->
    lwalpha(C0),
    schema_chars(Codes),
    { atom_codes(Schema, [C0|Codes]) }.

schema_chars([H|T]) -->
    schema_char(H),
    !,
    schema_chars(T).
schema_chars([]) -->
    [].

schema_char(H) -->
    [C],
    { C < 128,
      (   code_type(C, alpha)
      ->  code_type(H, to_lower(C))
      ;   code_type(C, digit)
      ->  H = C
      ;   schema_extra(C)
      ->  H = C
      )
    }.

schema_extra(0'+).
schema_extra(0'-).
schema_extra(0'.).      % 0'


%!  hier_part(+Schema, -Parts, ?Tail)//

hier_part(file, [path(Path)|Tail], Tail) -->
    !,
    "//",
    (   win_drive_path(Path)
    ;   path_absolute(Path)
    ;   path_rootless(Path)
    ;   path_empty(Path)
    ),
    !.
hier_part(_, Parts, Tail) -->
    "//",
    !,
    authority(Parts, [path(Path)|Tail]),
    path_abempty(Path).
hier_part(_, [path(Path)|T], T) -->
    (   path_absolute(Path)
    ;   path_rootless(Path)
    ;   path_empty(Path)
    ),
    !.

authority(Parts, Tail) -->
    user_info_chars(UserChars),
    "@",
    !,
    { atom_codes(User, UserChars),
      Parts = [user(User),host(Host)|T0]
    },
    host(Host),
    port(T0,Tail).
authority([host(Host)|T0], Tail) -->
    host(Host),
    port(T0, Tail).

user_info_chars([H|T]) -->
    user_info_char(H),
    !,
    user_info_chars(T).
user_info_chars([]) -->
    [].

user_info_char(_) --> "@", !, {fail}.
user_info_char(C) --> pchar(C).

%host(Host) --> ip_literal(Host), !.            % TBD: IP6 addresses
host(Host) --> ip4_address(Host), !.
host(Host) --> reg_name(Host).

ip4_address(Atom) -->
    i256_chars(Chars, [0'.|T0]),
    i256_chars(T0, [0'.|T1]),
    i256_chars(T1, [0'.|T2]),
    i256_chars(T2, []),
    { atom_codes(Atom, Chars) }.

i256_chars(Chars, T) -->
    digits(Chars, T),
    { \+ \+ ( T = [],
              Chars \== [],
              number_codes(I, Chars),
              I < 256
            )
    }.

reg_name(Host) -->
    reg_name_chars(Chars),
    { atom_codes(Host, Chars) }.

reg_name_chars([H|T]) -->
    reg_name_char(H),
    !,
    reg_name_chars(T).
reg_name_chars([]) -->
    [].

reg_name_char(C) -->
    pchar(C),
    { C \== 0':,
      C \== 0'@
    }.

port([port(Port)|T], T) -->
    ":",
    !,
    digit(D0),
    digits(Ds),
    { number_codes(Port, [D0|Ds]) }.
port(T, T) -->
    [].

path_abempty(Path) -->
    segments_chars(Chars, []),
    {   Chars == []
    ->  Path = '/'
    ;   atom_codes(Path, Chars)
    }.


win_drive_path(Path) -->
    drive_letter(C0),
    ":",
    (   "/"
    ->  {Codes = [C0, 0':, 0'/|Chars]}
    ;   {Codes = [C0, 0':|Chars]}
    ),
    segment_nz_chars(Chars, T0),
    segments_chars(T0, []),
    { atom_codes(Path, Codes) }.


path_absolute(Path) -->
    "/",
    segment_nz_chars(Chars, T0),
    segments_chars(T0, []),
    { atom_codes(Path, [0'/| Chars]) }.

path_noschema(Path) -->
    segment_nz_nc_chars(Chars, T0),
    segments_chars(T0, []),
    { atom_codes(Path, Chars) }.

path_rootless(Path) -->
    segment_nz_chars(Chars, T0),
    segments_chars(T0, []),
    { atom_codes(Path, Chars) }.

path_empty('/') -->
    "".

segments_chars([0'/|Chars], T) -->      % 0'
    "/",
    !,
    segment_chars(Chars, T0),
    segments_chars(T0, T).
segments_chars(T, T) -->
    [].

segment_chars([H|T0], T) -->
    pchar(H),
    !,
    segment_chars(T0, T).
segment_chars(T, T) -->
    [].

segment_nz_chars([H|T0], T) -->
    pchar(H),
    segment_chars(T0, T).

segment_nz_nc_chars([H|T0], T) -->
    segment_nz_nc_char(H),
    !,
    segment_nz_nc_chars(T0, T).
segment_nz_nc_chars(T, T) -->
    [].

segment_nz_nc_char(_) --> ":", !, {fail}.
segment_nz_nc_char(C) --> pchar(C).


%!  query(-Parts, ?Tail)// is det.
%
%   Extract &Name=Value, ...

query([search(Params)|T], T) -->
    "?",
    !,
    search(Params).
query(T,T) -->
    [].

search([Parameter|Parameters])-->
    parameter(Parameter),
    !,
    (   search_sep
    ->  search(Parameters)
    ;   { Parameters = [] }
    ).
search([]) -->
    [].

parameter(Param)-->
    !,
    search_chars(NameS),
    { atom_codes(Name, NameS)
    },
    (   "="
    ->  search_value_chars(ValueS),
        { atom_codes(Value, ValueS),
          Param = (Name = Value)
        }
    ;   { Param = Name
        }
    ).

search_chars([C|T]) -->
    search_char(C),
    !,
    search_chars(T).
search_chars([]) -->
    [].

search_char(_) --> search_sep, !, { fail }.
search_char(_) --> "=", !, { fail }.
search_char(C) --> fragment_char(C).

search_value_chars([C|T]) -->
    search_value_char(C),
    !,
    search_value_chars(T).
search_value_chars([]) -->
    [].

search_value_char(_) --> search_sep, !, { fail }.
search_value_char(C) --> fragment_char(C).

%!  search_sep// is semidet.
%
%   Matches a search-parameter separator.  Traditionally, this is the
%   &-char, but these days there are `newstyle' ;-char separators.
%
%   @see http://perldoc.perl.org/CGI.html
%   @tbd This should be configurable

search_sep --> "&", !.
search_sep --> ";".


%!  fragment(-Fragment, ?Tail)//
%
%   Extract the fragment (after the =#=)

fragment([fragment(Fragment)|T], T) -->
    "#",
    !,
    fragment_chars(Codes),
    { atom_codes(Fragment, Codes) }.
fragment(T, T) -->
    [].

fragment_chars([H|T]) -->
    fragment_char(H),
    !,
    fragment_chars(T).
fragment_chars([]) -->
    [].


%!  fragment_char(-Char)
%
%   Find a fragment character.

fragment_char(C)   --> pchar(C), !.
fragment_char(0'/) --> "/", !.
fragment_char(0'?) --> "?", !.
fragment_char(0'[) --> "[", !.          % Not according RDF3986!
fragment_char(0']) --> "]", !.


                 /*******************************
                 *      CHARACTER CLASSES       *
                 *******************************/

%!  pchar(-Code)//
%
%   unreserved|pct_encoded|sub_delim|":"|"@"
%
%   Performs UTF-8 decoding of percent encoded strings.

pchar(0'\s) --> "+", !.
pchar(C) -->
    [C],
    {   unreserved(C)
    ;   sub_delim(C)
    ;   C == 0':
    ;   C == 0'@
    },
    !.
pchar(C) -->
    percent_coded(C).

%!  lwalpha(-C)//
%
%   Demand alpha, return as lowercase

lwalpha(H) -->
    [C],
    { C < 128,
      code_type(C, alpha),
      code_type(H, to_lower(C))
    }.

drive_letter(C) -->
    [C],
    { C < 128,
      code_type(C, alpha)
    }.


                 /*******************************
                 *      RESERVED CHARACTERS     *
                 *******************************/

%!  sub_delim(?Code)
%
%   Sub-delimiters

sub_delim(0'!).
sub_delim(0'$).
sub_delim(0'&).
sub_delim(0'').
sub_delim(0'().
sub_delim(0')).
sub_delim(0'*).
sub_delim(0'+).
sub_delim(0',).
sub_delim(0';).
sub_delim(0'=).


%!  unreserved(+C)
%
%   Characters that can be represented without percent escaping
%   RFC 3986, section 2.3

term_expansion(unreserved(map), Clauses) :-
    findall(unreserved(C), unreserved_(C), Clauses).

unreserved_(C) :-
    between(1, 128, C),
    code_type(C, alnum).
unreserved_(0'-).
unreserved_(0'.).
unreserved_(0'_).
unreserved_(0'~).                       % 0'

unreserved(map).                        % Expanded


                 /*******************************
                 *              FORMS           *
                 *******************************/

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Encoding/decoding of form-fields  using   the  popular  www-form-encoded
encoding used with the HTTP GET.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

%!  www_form_encode(+Value, -XWWWFormEncoded) is det.
%!  www_form_encode(-Value, +XWWWFormEncoded) is det.
%
%   En/decode   to/from   application/x-www-form-encoded.   Encoding
%   encodes all characters  except  RFC   3986  _unreserved_  (ASCII
%   =alnum= (see code_type/2)), and  one   of  "-._~"  using percent
%   encoding.  Newline  is  mapped  to  =|%OD%OA|=.  When  decoding,
%   newlines appear as a single newline (10) character.
%
%   Note that a space  is  encoded   as  =|%20|=  instead  of =|+|=.
%   Decoding decodes both to a space.
%
%   @deprecated Use uri_encoded/3 for new code.

www_form_encode(Value, Encoded) :-
    atomic(Value),
    !,
    atom_codes(Value, Codes),
    phrase(www_encode(Codes, []), EncCodes),
    atom_codes(Encoded, EncCodes).
www_form_encode(Value, Encoded) :-
    atom_codes(Encoded, EncCodes),
    phrase(www_decode(Codes), EncCodes),
    atom_codes(Value, Codes).

%!  www_encode(+Codes, +ExtraUnescaped)//

www_encode([0'\r, 0'\n|T], Extra) -->
    !,
    "%0D%0A",
    www_encode(T, Extra).
www_encode([0'\n|T], Extra) -->
    !,
    "%0D%0A",
    www_encode(T, Extra).
www_encode([H|T], Extra) -->
    percent_encode(H, Extra),
    www_encode(T, Extra).
www_encode([], _) -->
    "".

percent_encode(C, _Extra) -->
    { unreserved(C) },
    !,
    [C].
percent_encode(C, Extra) -->
    { memberchk(C, Extra) },
    !,
    [C].
%percent_encode(0' , _) --> !, "+".     % Deprecated: use %20
percent_encode(C, _) -->
    { C =< 127 },
    !,
    percent_byte(C).
percent_encode(C, _) -->                % Unicode characters
    { current_prolog_flag(url_encoding, utf8),
      !,
      phrase(utf8_codes([C]), Bytes)
    },
    percent_bytes(Bytes).
percent_encode(C, _) -->
    { C =< 255 },
    !,
    percent_byte(C).
percent_encode(_C, _) -->
    { representation_error(url_character)
    }.

percent_bytes([]) -->
    "".
percent_bytes([H|T]) -->
    percent_byte(H),
    percent_bytes(T).

percent_byte(C) -->
    [0'%, D1, D2],
    {   nonvar(C)
    ->  Dv1 is (C>>4 /\ 0xf),
        Dv2 is (C /\ 0xf),
        code_type(D1, xdigit(Dv1)),
        code_type(D2, xdigit(Dv2))
    ;   code_type(D1, xdigit(Dv1)),
        code_type(D2, xdigit(Dv2)),
        C is ((Dv1)<<4) + Dv2
    }.

percent_coded(C) -->
    percent_byte(C0),
    !,
    (   { C0 == 13                  % %0D%0A --> \n
        },
        "%0",
        ( "A" ; "a" )
    ->  { C = 10
        }
    ;   { C0 >= 0xc0 },             % UTF-8 lead-in
        utf8_cont(Cs),
        { phrase(utf8_codes([C]), [C0|Cs]) }
    ->  []
    ;   { C = C0
        }
    ).

%!  www_decode(-Codes)//

www_decode([0' |T]) -->
    "+",
    !,
    www_decode(T).
www_decode([C|T]) -->
    percent_coded(C),
    !,
    www_decode(T).
www_decode([C|T]) -->
    [C],
    !,
    www_decode(T).
www_decode([]) -->
    [].

utf8_cont([H|T]) -->
    percent_byte(H),
    { between(0x80, 0xbf, H) },
    !,
    utf8_cont(T).
utf8_cont([]) -->
    [].


%!  set_url_encoding(?Old, +New) is semidet.
%
%   Query and set the encoding for URLs.  The default is =utf8=.
%   The only other defined value is =iso_latin_1=.
%
%   @tbd    Having a global flag is highly inconvenient, but a
%           work-around for old sites using ISO Latin 1 encoding.

:- create_prolog_flag(url_encoding, utf8, [type(atom)]).

set_url_encoding(Old, New) :-
    current_prolog_flag(url_encoding, Old),
    (   Old == New
    ->  true
    ;   must_be(oneof([utf8, iso_latin_1]), New),
        set_prolog_flag(url_encoding, New)
    ).


                 /*******************************
                 *       IRI PROCESSING         *
                 *******************************/

%!  url_iri(+Encoded, -Decoded) is det.
%!  url_iri(-Encoded, +Decoded) is det.
%
%   Convert between a URL, encoding in US-ASCII   and an IRI. An IRI
%   is a fully expanded Unicode string.   Unicode  strings are first
%   encoded into UTF-8, after which %-encoding takes place.

url_iri(Encoded, Decoded) :-
    nonvar(Encoded),
    !,
    (   sub_atom(Encoded, _, _, _, '%')
    ->  atom_codes(Encoded, Codes),
        unescape_precent(Codes, UTF8),
        phrase(utf8_codes(Unicodes), UTF8),
        atom_codes(Decoded, Unicodes)
    ;   Decoded = Encoded
    ).
url_iri(URL, IRI) :-
    atom_codes(IRI, IRICodes),
    atom_codes('/:?#&=', ExtraEscapes),
    phrase(www_encode(IRICodes, ExtraEscapes), UrlCodes),
    atom_codes(URL, UrlCodes).


unescape_precent([], []).
unescape_precent([0'%,C1,C2|T0], [H|T]) :-     %'
    !,
    code_type(C1, xdigit(D1)),
    code_type(C2, xdigit(D2)),
    H is D1*16 + D2,
    unescape_precent(T0, T).
unescape_precent([H|T0], [H|T]) :-
    unescape_precent(T0, T).


                 /*******************************
                 *           FORM DATA          *
                 *******************************/

%!  parse_url_search(?Spec, ?Fields:list(Name=Value)) is det.
%
%   Construct or analyze an HTTP   search  specification. This deals
%   with       form       data       using       the       MIME-type
%   =application/x-www-form-urlencoded=  as  used   in    HTTP   GET
%   requests.

parse_url_search(Spec, Fields) :-
    atomic(Spec),
    !,
    atom_codes(Spec, Codes),
    phrase(search(Fields), Codes).
parse_url_search(Codes, Fields) :-
    is_list(Codes),
    !,
    phrase(search(Fields), Codes).
parse_url_search(Codes, Fields) :-
    must_be(list, Fields),
    phrase(csearch(Fields, []), Codes).


                 /*******************************
                 *          FILE URLs           *
                 *******************************/

%!  file_name_to_url(+File, -URL) is det.
%!  file_name_to_url(-File, +URL) is semidet.
%
%   Translate between a filename and a file:// URL.
%
%   @tbd    Current implementation does not deal with paths that
%           need special encoding.

file_name_to_url(File, FileURL) :-
    nonvar(File),
    !,
    absolute_file_name(File, Path),
    atom_concat('file://', Path, FileURL),
    !.
file_name_to_url(File, FileURL) :-
    atom_concat('file://', File, FileURL),
    !.