View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2017, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(pcre,
   36          [ re_match/2,           % +Regex, +String
   37            re_match/3,           % +Regex, +String, +Options
   38            re_matchsub/4,        % +Regex, +String, -Subs, +Options
   39            re_foldl/6,           % :Goal, +Regex, +String, ?V0, ?V, +Options
   40            re_split/3,		  % +Pattern, +String, -Split:list
   41            re_split/4,		  % +Pattern, +String, -Split:list, +Options
   42            re_replace/4,	  % +Pattern, +With, +String, -NewString
   43
   44            re_compile/3,         % +Pattern, -Regex, +Options
   45            re_flush/0,
   46            re_config/1           % ?Config
   47          ]).   48:- use_module(library(error)).   49:- use_module(library(apply)).   50:- use_module(library(dcg/basics)).   51:- use_foreign_library(foreign(pcre4pl)).   52
   53:- meta_predicate
   54    re_foldl(3, +, +, ?, ?, +).

Perl compatible regular expression matching for SWI-Prolog

This module provides an interface to the PCRE (Perl Compatible Regular Expression) library. This Prolog interface provides an almost comprehensive wrapper around PCRE.

Regular expressions are created from a pattern and options and represented as a SWI-Prolog blob. This implies they are subject to (atom) garbage collection. Compiled regular expressions can safely be used in multiple threads. Most predicates accept an explicitly compiled regular expression, a pattern, or a term Pattern/Flags. In the latter two cases a regular expression blob is created and stored in a cache. The cache can be cleared using re_flush/0.

See also: `man pcre` for details.

   % Option declarations for the option-list argument (argument 3) of
   % re_match/3: each entry names an accepted option and its value type.
   :- predicate_options(re_match/3, 3,
                        [ anchored(boolean),
                          bol(boolean),
                          bsr(oneof([anycrlf,unicode])),
                          empty(boolean),
                          empty_atstart(boolean),
                          eol(boolean),
                          newline(oneof([any,anycrlf,cr,lf,crlf])),
                          start(integer)
                        ]).

   % Option declarations for the option-list argument of re_compile/3.
   % Besides the options that map directly onto PCRE flags, the
   % documentation of re_compile/3 also describes optimize(Bool) and
   % capture_type(Type); they are declared here as well so that option
   % checking does not report these documented options as unknown.
   :- predicate_options(re_compile/3, 3,
                        [ anchored(boolean),
                          bsr(oneof([anycrlf,unicode])),
                          caseless(boolean),
                          dollar_endonly(boolean),
                          dotall(boolean),
                          dupnames(boolean),
                          extended(boolean),
                          extra(boolean),
                          firstline(boolean),
                          compat(oneof([javascript])),
                          multiline(boolean),
                          newline(oneof([any,anycrlf,cr,lf,crlf])),
                          ucp(boolean),
                          ungreedy(boolean),
                          optimize(boolean),
                          capture_type(oneof([string,atom,range,term]))
                        ]).

re_match(+Regex, +String) is semidet

re_match(+Regex, +String, +Options) is semidet

Succeeds if String matches Regex. For example:

?- re_match("^needle"/i, "Needle in a haystack").
true.

Options:

anchored(Bool): If true, match only at the first position
bol(Bool): Subject string is the beginning of a line (default false)
bsr(Mode): If anycrlf, \R only matches CR, LF or CRLF. If unicode, \R matches all Unicode line endings.
empty(Bool): An empty string is a valid match (default true)
empty_atstart(Bool): An empty string at the start of the subject is a valid match (default true)
eol(Bool): Subject string is the end of a line (default false)
newline(Mode): If any, recognize any Unicode newline sequence, if anycrlf, recognize CR, LF, and CRLF as newline sequences, if cr, recognize CR, if lf, recognize LF and finally if crlf recognize CRLF as newline.
start(+From): Start at the given character index

Arguments:

Regex

- is the output of re_compile/3, a pattern or a term Pattern/Flags, where Pattern is an atom or string. The defined flags and their related options for re_compile/3 are listed below.

x: extended(true)
i: caseless(true)
m: multiline(true)
s: dotall(true)
a: capture_type(atom)
r: capture_type(range)
t: capture_type(term)

  % re_match(+Regex, +String): match with an empty option list.
  re_match(Regex, String) :-
      re_match(Regex, String, []).

  % re_match(+Regex, +String, +Options): turn Regex (a compiled blob,
  % a pattern or Pattern/Flags) into a compiled regex via re_compiled/2
  % and hand it to re_match_/3.
  % NOTE(review): re_match_/3 is not defined in this file; presumably it
  % is provided by the pcre4pl foreign library -- confirm.
  re_match(Regex, Subject, Opts) :-
      re_compiled(Regex, Blob),
      re_match_(Blob, Subject, Opts).

re_matchsub(+Regex, +String, -Sub:dict, +Options) is semidet

Match String against Regex. On success, Sub is a dict containing integer keys for the numbered capture groups and atom keys for the named capture groups. The type of the associated value is determined by the capture_type(Type) option passed to re_compile/3. It may also be specified using flags if Regex is of the form Pattern/Flags, and at the level of individual captures using a naming convention for the capture name. See re_compile/3 for details.

The example below exploits the typed groups to parse a date specification:

?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
                (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x,
               "2017-04-20", Sub, []).
Sub = re_match{0:"2017-04-20", date:"2017-04-20",
               day:20, month:4, year:2017}.

  % re_matchsub(+Regex, +String, -Subs, +Options): match String and
  % return the captures as a dict tagged re_match.  The Key-Value pairs
  % delivered by re_matchsub_/4 are converted with dict_pairs/3.
  % NOTE(review): re_matchsub_/4 is not defined in this file; presumably
  % it is provided by the pcre4pl foreign library -- confirm.
  re_matchsub(Regex, Subject, Subs, Opts) :-
      re_compiled(Regex, Blob),
      re_matchsub_(Blob, Subject, CapturePairs, Opts),
      dict_pairs(Subs, re_match, CapturePairs).

re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options) is semidet

Fold all matches of Regex on String. Each match is represented by a dict as specified for re_matchsub/4. V0 and V are related using a sequence of invocations of Goal as illustrated below.

call(Goal, Dict1, V0, V1),
call(Goal, Dict2, V1, V2),
...
call(Goal, Dictn, Vn, V).

This predicate is used to implement re_split/4 and re_replace/4. For example, we can count all matches of a Regex on String using this code:

re_match_count(Regex, String, Count) :-
    re_foldl(increment, Regex, String, 0, Count, []).

increment(_Match, V0, V1) :-
    V1 is V0+1.

After which we can query

?- re_match_count("a", "aap", X).
X = 2.

  % re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options): compile Regex
  % if needed and delegate the fold over all matches to re_foldl_/6.
  % Note the argument order: re_foldl_/6 takes the compiled regex and
  % the subject first, then the goal.
  % NOTE(review): re_foldl_/6 is not defined in this file; presumably it
  % is provided by the pcre4pl foreign library -- confirm.
  re_foldl(Goal, Regex, Subject, V0, V, Opts) :-
      re_compiled(Regex, Blob),
      re_foldl_(Blob, Subject, Goal, V0, V, Opts).
  216
  :- public re_call_folder/4.

  % re_call_folder(:Goal, +Pairs, ?V0, ?V1): build a re_match dict from
  % the capture Pairs and call Goal with it and the accumulator pair.
  % NOTE(review): nothing in this file calls this predicate; the public
  % declaration suggests it is called from outside (presumably by the
  % foreign re_foldl_/6) -- confirm.
  re_call_folder(Goal, Pairs, V0, V1) :-
      dict_pairs(MatchDict, re_match, Pairs),
      call(Goal, MatchDict, V0, V1).

re_split(+Pattern, +String, -Split:list) is det

re_split(+Pattern, +String, -Split:list, +Options) is det

Split String using the regular expression Pattern. Split is a list of strings holding alternating matches of Pattern and skipped parts of the String, starting with a skipped part. The Split list ends with a string of the content of String after the last match. If Pattern does not appear in String, Split is a list holding a copy of String. This implies the number of elements in Split is always odd. For example:

?- re_split("a+", "abaac", Split, []).
Split = ["","a","b","aa","c"].
?- re_split(":\\s*"/n, "Age: 33", Split, []).
Split = ['Age', ': ', 33].

Arguments:

Pattern - is the pattern text, optionally followed by /Flags. Similar to re_matchsub/4, the final output type can be controlled by a flag a (atom), s (string, default) or n (number if possible, atom otherwise).

  % re_split(+Pattern, +String, -Split): split with default options.
  re_split(Pattern, String, Split) :-
      re_split(Pattern, String, Split, []).

  % re_split(+Pattern, +String, -Split, +Options): fold split/4 over all
  % matches.  Each match contributes a skipped part and a separator to
  % the open list Split; the list is closed with the text that follows
  % the last match.  State = state(String, Offset, Type) carries the
  % offset where the next skipped part starts; it is updated
  % destructively while folding (see skipped/3), so it is read back with
  % arg/3 once the fold is done.
  re_split(Pattern, String, Split, Options) :-
      range_regex(Pattern, Regex, Type),
      State = state(String, 0, Type),
      re_foldl(split(State), Regex, String, Split, [Tail], Options),
      arg(2, State, TailStart),
      typed_sub(Type, String, TailStart, _, 0, Tail).
  255
  % range_regex(+Spec, -Compiled, -Type): compile the pattern of a
  % split specification with the r flag prepended, so matches are
  % reported as Start-Length ranges.  The result-type flags are removed
  % from the flag list by replace_flags/3 and reported as Type; a
  % pattern without flags yields Type = string.
  range_regex(Pattern/Flags, Compiled, Type) :-
      !,
      atom_chars(Flags, FlagChars),
      replace_flags(FlagChars, RegexChars, Type),
      atom_chars(RangeFlags, [r|RegexChars]),
      re_compiled(Pattern/RangeFlags, Compiled).
  range_regex(Pattern, Compiled, string) :-
      re_compiled(Pattern/r, Compiled).
  263
  % replace_flags(+Flags, -RegexFlags, ?Type): walk the flag characters.
  % A flag that split_type/2 accepts for Type is dropped and fixes the
  % result type; every other flag is kept for the regex.  At the end of
  % the list Type defaults to string if no flag has bound it.
  replace_flags([], [], Type) :-
      default(Type, string).
  replace_flags([Flag|Flags], Kept, Type) :-
      (   split_type(Flag, Type)
      ->  Kept = Kept1
      ;   Kept = [Flag|Kept1]
      ),
      replace_flags(Flags, Kept1, Type).

  % split_type(?Flag, ?Type): flag characters selecting the result type.
  split_type(a, atom).
  split_type(s, string).
  split_type(n, name).
  276
  % split(+State, +Dict, -Parts, ?Tail): fold step of re_split/4.  Key 0
  % of the match dict holds the Start-Length range of the whole match.
  % The step emits the text skipped before the match followed by the
  % matched separator.
  split(State, Dict, [Skipped,Sep|Tail], Tail) :-
      Range = Dict.0,
      matched(State, Range, Sep),
      skipped(State, Range, Skipped).

  % matched(+State, +Range, -Matched): text of the match itself.
  matched(State, Start-Len, Matched) :-
      State = state(Subject, _, Type),
      typed_sub(Type, Subject, Start, Len, _, Matched).

  % skipped(+State, +Range, -Skipped): text between the offset stored in
  % State and the start of the match.  Afterwards the stored offset is
  % moved to the end of the match with nb_setarg/3, a destructive update
  % of argument 2 of State.
  skipped(State, Start-Len, Skipped) :-
      State = state(Subject, From, Type),
      SkipLen is Start-From,
      typed_sub(Type, Subject, From, SkipLen, _, Skipped),
      Next is Start+Len,
      nb_setarg(2, State, Next).
  290
  % typed_sub(+Type, +Haystack, ?Before, ?Length, ?After, -Sub): extract
  % a part of Haystack, with sub_string/5 or sub_atom/5 argument
  % conventions, and return it in the representation selected by Type:
  %   string - a string
  %   atom   - an atom
  %   name   - a number if the text reads as one, otherwise an atom
  typed_sub(string, Haystack, B, L, A, Sub) :-
      sub_string(Haystack, B, L, A, Sub).
  typed_sub(atom, Haystack, B, L, A, Sub) :-
      sub_atom(Haystack, B, L, A, Sub).
  typed_sub(name, Haystack, B, L, A, Value) :-
      sub_string(Haystack, B, L, A, Text),
      text_to_name(Text, Value).

  % text_to_name(+Text, -Value): Value is the number denoted by Text if
  % number_string/2 accepts it, and the atom with the same text
  % otherwise.
  text_to_name(Text, Value) :-
      number_string(Number, Text),
      !,
      Value = Number.
  text_to_name(Text, Value) :-
      atom_string(Value, Text).

re_replace(+Pattern, +With, +String, -NewString)

Replace matches of the regular expression Pattern in String with With. With may reference captured substrings using \N or $Name. Both N and Name may be written as {N} and {Name} to avoid ambiguities.

Arguments:

Pattern - is the pattern text, optionally followed by /Flags. Flags may include g, replacing all occurrences of Pattern. In addition, similar to re_matchsub/4, the final output type can be controlled by a flag a (atom) or s (string, default).

  % re_replace(+Pattern, +With, +String, -NewString): replace the first
  % match of Pattern in String by With, or all matches if the flags
  % contain g.  The output is collected as an open list of parts that is
  % closed with the text following the last replaced match.  State =
  % state(String, Offset, Type) tracks where that text starts; it is
  % updated destructively by skipped/3 (called from replace/5).
  % If nothing matched in "first match" mode, String itself is the only
  % part.
  re_replace(Pattern, With, String, NewString) :-
      range_regex(Pattern, Regex, All, Type),
      compile_replacement(With, Replacement),
      State = state(String, 0, Type),
      (   All == all
      ->  re_foldl(replace(State, Replacement), Regex, String,
                   Parts, [Tail], []),
          Replaced = true
      ;   re_matchsub(Regex, String, Match, [])
      ->  replace(State, Replacement, Match, Parts, [Tail]),
          Replaced = true
      ;   Replaced = false
      ),
      (   Replaced == true
      ->  arg(2, State, TailStart),
          sub_string(String, TailStart, _, 0, Tail),
          parts_to_output(Type, Parts, NewString)
      ;   parts_to_output(Type, [String], NewString)
      ).
  330
  % range_regex(+Spec, -Compiled, -All, -Type): as range_regex/3, for
  % re_replace/4.  The g flag and the result-type flags are removed from
  % the flag list by replace_flags/4 and reported as All (all or first)
  % and Type (atom or string).  A pattern without flags replaces the
  % first match only and produces a string.
  range_regex(Pattern/Flags, Compiled, All, Type) :-
      !,
      atom_chars(Flags, FlagChars),
      replace_flags(FlagChars, RegexChars, All, Type),
      atom_chars(RangeFlags, [r|RegexChars]),
      re_compiled(Pattern/RangeFlags, Compiled).
  range_regex(Pattern, Compiled, first, string) :-
      re_compiled(Pattern/r, Compiled).
  338
  % replace_flags(+Flags, -RegexFlags, ?All, ?Type): walk the flag
  % characters.  Flags accepted by all/2 or type/2 are dropped and bind
  % All or Type; every other flag is kept for the regex.  At the end of
  % the list, unbound results get their defaults (first, string).
  replace_flags([], [], All, Type) :-
      default(All, first),
      default(Type, string).
  replace_flags([Flag|Flags], Kept, All, Type) :-
      (   all(Flag, All)
      ->  Kept = Kept1
      ;   type(Flag, Type)
      ->  Kept = Kept1
      ;   Kept = [Flag|Kept1]
      ),
      replace_flags(Flags, Kept1, All, Type).

  % all(?Flag, ?Mode): flag requesting replacement of all matches.
  all(g, all).

  % type(?Flag, ?Type): flags selecting the output type.
  type(a, atom).
  type(s, string).

  % default(?Value, +Default): bind Value to Default if they unify;
  % succeed unchanged otherwise.
  default(Value, Default) :-
      ignore(Value = Default).
  358
  % replace(+State, +With, +Dict, -Parts, ?Tail): fold step of
  % re_replace/4.  With is a compiled replacement r(Parts, Skeleton);
  % it is copied so that its variables are fresh for this match.
  % Selecting the skeleton from the match dict with :</2 binds each
  % referenced capture to its Start-Length range.  The step emits the
  % text skipped before the match followed by the instantiated
  % replacement parts.
  replace(State, With, Dict, [Skipped|Parts], Tail) :-
      State = state(Subject, _, _Type),
      copy_term(With, r(RangeParts, Skeleton)),
      Skeleton :< Dict,
      range_strings(RangeParts, Subject, Parts, Tail),
      Range = Dict.0,
      skipped(State, Range, Skipped).

  % range_strings(+Parts0, +String, -Parts, ?Tail): copy Parts0 to the
  % difference list Parts-Tail, replacing every Start-Length element by
  % the corresponding substring of String.
  range_strings([], _, Tail, Tail).
  range_strings([Part|Parts0], String, [Text|Parts], Tail) :-
      (   Part = Start-Len
      ->  sub_string(String, Start, Len, _, Text)
      ;   Text = Part
      ),
      range_strings(Parts0, String, Parts, Tail).
  373
  % parts_to_output(+Type, +Parts, -Output): concatenate the atomic
  % values in Parts into a string or an atom, depending on Type.
  parts_to_output(string, Parts, Output) :-
      atomics_to_string(Parts, Output).
  parts_to_output(atom, Parts, Output) :-
      atomic_list_concat(Parts, Output).

compile_replacement(+With, -Compiled): Compile the replacement specification into a specification that can be processed quickly. The compiled expressions are cached and may be reclaimed using re_flush/0.

  % replacement_cache(With, Compiled): cache of compiled replacement
  % specifications, filled by compile_replacement/2.
  :- dynamic replacement_cache/2.
  :- volatile replacement_cache/2.

  % compile_replacement(+With, -Compiled): return the cached compiled
  % form of With if there is one; otherwise compile it and add it to the
  % cache.
  compile_replacement(With, Compiled) :-
      (   replacement_cache(With, Cached)
      ->  Compiled = Cached
      ;   compile_replacement_nocache(With, Compiled),
          assertz(replacement_cache(With, Compiled))
      ).
  394
  % compile_replacement_nocache(+With, -Compiled): parse the replacement
  % text With into Compiled = r(Parts, Extract).  Parts is a list of
  % literal strings and variables; Extract is a dict that maps every
  % referenced capture (integer or atom key) to the variable that stands
  % for it in Parts.
  %
  % The same capture may be referenced more than once (for example
  % "\\1-\\1").  The parser creates a separate Key-Var pair for each
  % reference and dict_pairs/3 raises a duplicate_key error on repeated
  % keys, so pairs with the same key are first merged by unifying their
  % variables.  All references to one capture then share one variable.
  compile_replacement_nocache(With, r(Parts, Extract)) :-
      string_codes(With, Codes),
      phrase(replacement_parts(Parts, Pairs0), Codes),
      keysort(Pairs0, Sorted),            % brings equal keys together
      unify_duplicate_captures(Sorted, Pairs),
      dict_pairs(Extract, _, Pairs).

  % unify_duplicate_captures(+SortedPairs, -Pairs): keep one pair per
  % key, unifying the values of all pairs that share that key.
  unify_duplicate_captures([], []).
  unify_duplicate_captures([Name-Var|T0], [Name-Var|T]) :-
      share_capture_var(T0, Name, Var, T1),
      unify_duplicate_captures(T1, T).

  % share_capture_var(+Pairs0, +Name, ?Var, -Pairs): strip the leading
  % pairs whose key is identical to Name, unifying their value with Var.
  share_capture_var([Name1-Var1|T0], Name, Var, T) :-
      Name1 == Name,
      !,
      Var1 = Var,
      share_capture_var(T0, Name, Var, T).
  share_capture_var(T, _, _, T).
  399
  % replacement_parts(-Parts, -Extract)//: parse a replacement text.
  % string//1 first tries the shortest literal prefix and is extended on
  % backtracking until it is followed either by a capture reference
  % (\ or $ plus a capture name) or by the end of the input.  Each
  % reference puts a fresh variable into Parts and a Name-Var pair into
  % Extract.  The cuts commit to the first prefix for which one of the
  % two cases applies.
  replacement_parts(Parts, Extract) -->
      string(Literal),
      (   ("\\" ; "$"),
          capture_name(Name)
      ->  !,
          { add_part(Literal, Parts, [Ref|Parts1]),
            Extract = [Name-Ref|Extract1]
          },
          replacement_parts(Parts1, Extract1)
      ;   eos
      ->  !,
          { add_part(Literal, Parts, []),
            Extract = []
          }
      ).

  % add_part(+Codes, -Parts, ?Tail): Parts is Tail preceded by the
  % string for Codes; an empty code list adds nothing.
  add_part(Codes, Parts, Tail) :-
      (   Codes == []
      ->  Parts = Tail
      ;   string_codes(Text, Codes),
          Parts = [Text|Tail]
      ).
  421
  % capture_name(-Name)//: parse the name of a referenced capture.
  % Name is an integer for a run of digits and an atom for an identifier
  % (a letter or underscore followed by letters, digits or underscores).
  % The name may be enclosed in {...}; the closing brace is then
  % required.
  capture_name(Name) -->
      "{",
      (   digit(D0)
      ->  digits(Ds),
          "}",
          { number_codes(Name, [D0|Ds]) }
      ;   letter(C0),
          alnums(Cs),
          "}",
          { atom_codes(Name, [C0|Cs]) }
      ).
  capture_name(Name) -->
      (   digit(D0)
      ->  digits(Ds),
          { number_codes(Name, [D0|Ds]) }
      ;   letter(C0),
          alnums(Cs),
          { atom_codes(Name, [C0|Cs]) }
      ).
  443
  % letter(-Code)//: one ASCII letter or underscore.
  letter(Code) -->
      [Code],
      { name_start_code(Code) }.

  % alnums(-Codes)//: the longest (possibly empty) run of alnum//1
  % codes.
  alnums(Codes) -->
      (   alnum(Code)
      ->  { Codes = [Code|More] },
          alnums(More)
      ;   { Codes = [] }
      ).

  % alnum(-Code)//: one ASCII letter, ASCII digit or underscore.
  alnum(Code) -->
      [Code],
      { name_code(Code) }.

  % name_start_code(+Code): Code is a-z, A-Z or underscore.
  name_start_code(Code) :-
      (   between(0'a,0'z,Code)
      ->  true
      ;   between(0'A,0'Z,Code)
      ->  true
      ;   Code == 0'_
      ).

  % name_code(+Code): Code is a-z, A-Z, 0-9 or underscore.
  name_code(Code) :-
      (   between(0'a,0'z,Code)
      ->  true
      ;   between(0'A,0'Z,Code)
      ->  true
      ;   between(0'0,0'9,Code)
      ->  true
      ;   Code == 0'_
      ).

re_compile(+Pattern, -Regex, +Options) is det

Compiles Pattern to a Regex blob of type regex (see blob/2). The supported Options are listed below. Please consult the PCRE documentation for details.

anchored(Bool): Force pattern anchoring
bsr(Mode): If anycrlf, \R only matches CR, LF or CRLF. If unicode, \R matches all Unicode line endings.
caseless(Bool): If true, do caseless matching.
dollar_endonly(Bool): If true, $ not to match newline at end
dotall(Bool): If true, . matches anything including NL
dupnames(Bool): If true, allow duplicate names for subpatterns
extended(Bool): If true, ignore white space and # comments
extra(Bool): If true, PCRE extra features (not much use currently)
firstline(Bool): If true, force matching to be before newline
compat(With): If javascript, JavaScript compatibility
multiline(Bool): If true, ^ and $ match newlines within data
newline(Mode): If any, recognize any Unicode newline sequence, if anycrlf (default), recognize CR, LF, and CRLF as newline sequences, if cr, recognize CR, if lf, recognize LF and finally if crlf recognize CRLF as newline.
ucp(Bool): If true, use Unicode properties for \d, \w, etc.
ungreedy(Bool): If true, invert greediness of quantifiers

In addition to the options above that directly map to pcre flags the following options are processed:

optimize(Bool)

If true, study the regular expression.

capture_type(+Type)

How to return the matched part of the input and possibly captured groups in there. Possible values are:

string: Return the captured string as a string (default).
atom: Return the captured string as an atom.
range: Return the captured string as a pair Start-Length. Note that we use Start-Length rather than the more conventional Start-End to allow for immediate use with sub_atom/5 and sub_string/5.
term: Parse the captured string as a Prolog term. This is notably practical if you capture a number.

The capture_type specifies the default for this pattern. The interface supports a different type for each named group using the syntax (?<name_T>...), where T is one of S (string), A (atom), I (integer), F (float), N (number), T (term) and R (range). In the current implementation I, F and N are synonyms for T. Future versions may act differently if the parsed value is not of the requested numeric type.

re_compiled(+Spec, --Regex) is det: Create a compiled regex from a specification. Cached compiled regular expressions can be reclaimed using re_flush/0.

  % re_pool(Text, Flags, Regex): cache of compiled regular expressions,
  % keyed on the pattern text and the flags atom.
  :- dynamic re_pool/3.
  :- volatile re_pool/3.

  % re_compiled(+Spec, --Regex): Regex is the compiled regular
  % expression for Spec.
  %   - A regex blob is returned as is.
  %   - Text/Flags is looked up in re_pool/3; on a miss it is compiled
  %     with the options derived from Flags and added to the pool.
  %   - Plain Text is looked up with the empty flags atom and otherwise
  %     handled as Text/''.
  % must_be/2 raises an error if the pattern is not text or the flags
  % are not an atom.
  re_compiled(Spec, Regex) :-
      (   blob(Spec, regex)
      ->  Regex = Spec
      ;   Spec = Text/Flags
      ->  must_be(text, Text),
          must_be(atom, Flags),
          (   re_pool(Text, Flags, Cached)
          ->  Regex = Cached
          ;   re_flags_options(Flags, Options),
              re_compile(Text, Regex, Options),
              assertz(re_pool(Text, Flags, Regex))
          )
      ;   must_be(text, Spec),
          (   re_pool(Spec, '', Cached)
          ->  Regex = Cached
          ;   re_compiled(Spec/'', Regex)
          )
      ).
  561
  % re_flags_options(+Flags, -Options): translate the characters of the
  % atom Flags, one by one, into re_compile/3 options.
  re_flags_options(Flags, Options) :-
      atom_chars(Flags, FlagChars),
      maplist(re_flag_option, FlagChars, Options).

  % re_flag_option(+Flag, -Option): option for a single flag character.
  % Raises existence_error(re_flag, Flag) for an unknown flag.
  re_flag_option(Flag, Option) :-
      (   re_flag_option_(Flag, Known)
      ->  Option = Known
      ;   existence_error(re_flag, Flag)
      ).

  % re_flag_option_(?Flag, ?Option): table of the defined flags.
  re_flag_option_(i, caseless(true)).
  re_flag_option_(m, multiline(true)).
  re_flag_option_(x, extended(true)).
  re_flag_option_(s, dotall(true)).
  re_flag_option_(a, capture_type(atom)).
  re_flag_option_(r, capture_type(range)).
  re_flag_option_(t, capture_type(term)).

re_flush

Clean pattern and replacement caches.

To be done: - Flush automatically if the cache becomes too large.

  % re_flush: empty both caches, the compiled replacement
  % specifications (replacement_cache/2) and the compiled regular
  % expressions (re_pool/3).
  re_flush :-
      retractall(replacement_cache(_, _)),
      retractall(re_pool(_, _, _)).

re_config(+Term)

Extract configuration information from the pcre library. Term is of the form Name(Value). Name is derived from the PCRE_CONFIG_* constant by removing the PCRE_CONFIG_ prefix and mapping the name to lower case, e.g. utf8, unicode_properties, etc. Value is either a Prolog boolean, integer or atom.

Finally, the functionality of pcre_version() is available using the configuration name version.

See also: - `man pcreapi` for details