pcre.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2017, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(pcre,
   36          [ re_match/2,           % +Regex, +String
   37            re_match/3,           % +Regex, +String, +Options
   38            re_matchsub/4,        % +Regex, +String, -Subs, +Options
   39            re_foldl/6,           % :Goal, +Regex, +String, ?V0, ?V, +Options
   40            re_split/3,		  % +Pattern, +String, -Split:list
   41            re_split/4,		  % +Pattern, +String, -Split:list, +Options
   42            re_replace/4,	  % +Pattern, +With, +String, -NewString
   43
   44            re_compile/3,         % +Pattern, -Regex, +Options
   45            re_flush/0,
   46            re_config/1           % ?Config
   47          ]).   48:- use_module(library(error)).   49:- use_module(library(apply)).   50:- use_module(library(dcg/basics)).   51:- use_foreign_library(foreign(pcre4pl)).   52
   53:- meta_predicate
   54    re_foldl(3, +, +, ?, ?, +).   55
   56/** <module> Perl compatible regular expression matching for SWI-Prolog
   57
   58This module provides an interface   to  the [PCRE](http://www.pcre.org/)
   59(Perl Compatible Regular Expression)  library.   This  Prolog  interface
   60provides an almost comprehensive wrapper around PCRE.
   61
   62Regular  expressions  are  created  from  a   pattern  and  options  and
   63represented as a SWI-Prolog _blob_.  This   implies  they are subject to
   64(atom) garbage collection. Compiled regular   expressions  can safely be
   65used in multiple threads. Most  predicates   accept  both  an explicitly
   66compiled regular expression, a pattern or   a term Pattern/Flags. In the
   67latter two cases a regular expression _blob_  is created and stored in a
   68cache. The cache can be cleared using re_flush/0.
   69
   70@see `man pcre` for details.
   71*/
   72
   73:- predicate_options(re_match/3, 3,
   74                     [ anchored(boolean),
   75                       bol(boolean),
   76                       bsr(oneof([anycrlf,unicode])),
   77                       empty(boolean),
   78                       empty_atstart(boolean),
   79                       eol(boolean),
   80                       newline(oneof([any,anycrlf,cr,lf,crlf])),
   81                       start(integer)
   82                     ]).   83:- predicate_options(re_compile/3, 3,
   84                     [ anchored(boolean),
   85                       bsr(oneof([anycrlf,unicode])),
   86                       caseless(boolean),
   87                       dollar_endonly(boolean),
   88                       dotall(boolean),
   89                       dupnames(boolean),
   90                       extended(boolean),
   91                       extra(boolean),
   92                       firstline(boolean),
   93                       compat(oneof([javascript])),
   94                       multiline(boolean),
   95                       newline(oneof([any,anycrlf,cr,lf,crlf])),
   96                       ucp(boolean),
   97                       ungreedy(boolean)
   98                     ]).   99
  100
  101%!  re_match(+Regex, +String) is semidet.
  102%!  re_match(+Regex, +String, +Options) is semidet.
  103%
  104%   Succeeds if String matches Regex.  For example:
  105%
  106%     ```
  107%     ?- re_match("^needle"/i, "Needle in a haystack").
  108%     true.
  109%     ```
  110%
  111%   Options:
  112%
  113%     * anchored(Bool)
  114%     If =true=, match only at the first position
  115%     * bol(Bool)
  116%     Subject string is the beginning of a line (default =false=)
  117%     * bsr(Mode)
  118%     If =anycrlf=, \R only matches CR, LF or CRLF.  If =unicode=,
  119%     \R matches all Unicode line endings.
  120%     Subject string is the end of a line (default =false=)
  121%     * empty(Bool)
  122%     An empty string is a valid match (default =true=)
  123%     * empty_atstart(Bool)
  124%     An empty string at the start of the subject is a valid match
  125%     (default =true=)
  126%     * eol(Bool)
  127%     Subject string is the end of a line (default =false=)
  128%     * newline(Mode)
  129%     If =any=, recognize any Unicode newline sequence,
  130%     if =anycrlf=, recognize CR, LF, and CRLF as newline
  131%     sequences, if =cr=, recognize CR, if =lf=, recognize
  132%     LF and finally if =crlf= recognize CRLF as newline.
  133%     * start(+From)
  134%     Start at the given character index
  135%
  136%   @arg Regex is the output  of  re_compile/3,   a  pattern  or  a term
  137%   Pattern/Flags, where Pattern is an atom or string. The defined flags
  138%   and there related option for re_compile/3 are below.
  139%
  140%     - *x*: extended(true)
  141%     - *i*: caseless(true)
  142%     - *m*: multiline(true)
  143%     - *s*: dotall(true)
  144%     - *a*: capture_type(atom)
  145%     - *r*: capture_type(range)
  146%     - *t*: capture_type(term)
  147
  148re_match(Regex, String) :-
  149    re_match(Regex, String, []).
  150re_match(Regex, String, Options) :-
  151    re_compiled(Regex, Compiled),
  152    re_match_(Compiled, String, Options).
  153
  154%!  re_matchsub(+Regex, +String, -Sub:dict, +Options) is semidet.
  155%
  156%   Match String against Regex. On  success,   Sub  is a dict containing
  157%   integer keys for the numbered capture group   and  atom keys for the
  158%   named capture groups. The associated  value   is  determined  by the
  159%   capture_type(Type) option passed to re_compile/3,   may be specified
  160%   using flags if Regex  is  of  the   form  Pattern/Flags  and  may be
  161%   specified at the  level  of  individual   captures  using  a  naming
  162%   convention for the caption name. See re_compile/3 for details.
  163%
  164%   The example below  exploits  the  typed   groups  to  parse  a  date
  165%   specification:
  166%
  167%     ```
  168%     ?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) -
  169%                     (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/e,
  170%                    "2017-04-20", Sub, []).
  171%     Sub = re_match{0:"2017-04-20", date:"2017-04-20",
  172%                    day:20, month:4, year:2017}.
  173%
  174%     ```
  175
  176re_matchsub(Regex, String, Subs, Options) :-
  177    re_compiled(Regex, Compiled),
  178    re_matchsub_(Compiled, String, Pairs, Options),
  179    dict_pairs(Subs, re_match, Pairs).
  180
  181%!  re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options) is semidet.
  182%
  183%   _Fold_ all matches of Regex on String.  Each match is represented by
  184%   a dict as specified for re_matchsub/4. V0  and V are related using a
  185%   sequence of invocations of Goal as illustrated below.
  186%
  187%	```
  188%       call(Goal, Dict1, V0, V1),
  189%       call(Goal, Dict2, V1, V2),
  190%       ...
  191%       call(Goal, Dictn, Vn, V).
  192%       ```
  193%
  194%   This predicate is used to implement re_split/4 and re_replace/4. For
  195%   example, we can count all matches of   a  Regex on String using this
  196%   code:
  197%
  198%     ```
  199%     re_match_count(Regex, String, Count) :-
  200%         re_foldl(increment, Regex, String, 0, Count, []).
  201%
  202%     increment(_Match, V0, V1) :-
  203%	  V1 is V0+1.
  204%     ```
  205%
  206%   After which we can query
  207%
  208%     ```
  209%     ?- re_match_count("a", "aap", X).
  210%     X = 2.
  211%     ```
  212
  213re_foldl(Goal, Regex, String, V0, V, Options) :-
  214    re_compiled(Regex, Compiled),
  215    re_foldl_(Compiled, String, Goal, V0, V, Options).
  216
  217:- public re_call_folder/4.  218
  219re_call_folder(Goal, Pairs, V0, V1) :-
  220    dict_pairs(Dict, re_match, Pairs),
  221    call(Goal, Dict, V0, V1).
  222
  223
  224%!  re_split(+Pattern, +String, -Split:list) is det.
  225%!  re_split(+Pattern, +String, -Split:list, +Options) is det.
  226%
  227%   Split String using the regular expression   Pattern. Split is a list
  228%   of strings holding alternating matches of  Pattern and skipped parts
  229%   of the String, starting with a skipped   part.  The Split lists ends
  230%   with a string of the content  of   String  after  the last match. If
  231%   Pattern does not appear in String, Split is a list holding a copy of
  232%   String. This implies the number  of   elements  in Split is _always_
  233%   odd.  For example:
  234%
  235%     ```
  236%     ?- re_split("a+", "abaac", Split, []).
  237%     Split = ["","a","b","aa","c"].
  238%     ?- re_split(":\\s*"/n, "Age: 33", Split, []).
  239%     Split = ['Age', ': ', 33].
  240%     ```
  241%
  242%   @arg Pattern is the pattern  text,   optionally  follows  by /Flags.
  243%   Similar to re_matchsub/4, the final output type can be controlled by
  244%   a flag =a= (atom), =s= (string, default) or =n= (number if possible,
  245%   atom otherwise).
  246
  247re_split(Pattern, String, Split) :-
  248    re_split(Pattern, String, Split, []).
  249re_split(Pattern, String, Split, Options) :-
  250    range_regex(Pattern, Compiled, Type),
  251    State = state(String, 0, Type),
  252    re_foldl(split(State), Compiled, String, Split, [Last], Options),
  253    arg(2, State, LastSkipStart),
  254    typed_sub(Type, String, LastSkipStart, _, 0, Last).
  255
  256range_regex(Pattern/Flags, Compiled, Type) :- !,
  257    atom_chars(Flags, Chars),
  258    replace_flags(Chars, Chars1, Type),
  259    atom_chars(RFlags, [r|Chars1]),
  260    re_compiled(Pattern/RFlags, Compiled).
  261range_regex(Pattern, Compiled, string) :-
  262    re_compiled(Pattern/r, Compiled).
  263
  264replace_flags([], [], Type) :-
  265    default(Type, string).
  266replace_flags([H|T0], T, Type) :-
  267    split_type(H, Type),
  268    !,
  269    replace_flags(T0, T, Type).
  270replace_flags([H|T0], [H|T], Type) :-
  271    replace_flags(T0, T, Type).
  272
  273split_type(a, atom).
  274split_type(s, string).
  275split_type(n, name).
  276
  277split(State, Dict, [Skipped,Sep|T], T) :-
  278    matched(State, Dict.0, Sep),
  279    skipped(State, Dict.0, Skipped).
  280
  281matched(state(String, _, Type), Start-Len, Matched) :-
  282    typed_sub(Type, String, Start, Len, _, Matched).
  283
  284skipped(State, Start-Len, Skipped) :-
  285    State = state(String, Here, Type),
  286    SkipLen is Start-Here,
  287    typed_sub(Type, String, Here, SkipLen, _, Skipped),
  288    NextSkipStart is Start+Len,
  289    nb_setarg(2, State, NextSkipStart).
  290
  291typed_sub(string, Haystack, B, L, A, String) :-
  292    sub_string(Haystack, B, L, A, String).
  293typed_sub(atom, Haystack, B, L, A, String) :-
  294    sub_atom(Haystack, B, L, A, String).
  295typed_sub(name, Haystack, B, L, A, Value) :-
  296    sub_string(Haystack, B, L, A, String),
  297    (   number_string(Number, String)
  298    ->  Value = Number
  299    ;   atom_string(Value, String)
  300    ).
  301
  302%!  re_replace(+Pattern, +With, +String, -NewString)
  303%
  304%   Replace matches of the regular  expression   Pattern  in String with
  305%   With. With may reference captured substrings using \N or $Name. Both
  306%   N and Name may be written as {N} and {Name} to avoid ambiguities.
  307%
  308%   @arg Pattern is the pattern  text,   optionally  follows  by /Flags.
  309%   Flags may include `g`,  replacing  all   occurences  of  Pattern. In
  310%   addition, similar to re_matchsub/4, the  final   output  type can be
  311%   controlled by a flag =a= (atom) or =s= (string, default).
  312
  313re_replace(Pattern, With, String, NewString) :-
  314    range_regex(Pattern, Compiled, All, Type),
  315    compile_replacement(With, RCompiled),
  316    State = state(String, 0, Type),
  317    (   All == all
  318    ->  re_foldl(replace(State, RCompiled), Compiled, String, Parts, [Last], [])
  319    ;   (   re_matchsub(Compiled, String, Match, [])
  320        ->  replace(State, RCompiled, Match, Parts, [Last])
  321        ;   Repl = false
  322        )
  323    ),
  324    (   Repl == false
  325    ->  parts_to_output(Type, [String], NewString)
  326    ;   arg(2, State, LastSkipStart),
  327        sub_string(String, LastSkipStart, _, 0, Last),
  328        parts_to_output(Type, Parts, NewString)
  329    ).
  330
  331range_regex(Pattern/Flags, Compiled, All, Type) :- !,
  332    atom_chars(Flags, Chars),
  333    replace_flags(Chars, Chars1, All, Type),
  334    atom_chars(RFlags, [r|Chars1]),
  335    re_compiled(Pattern/RFlags, Compiled).
  336range_regex(Pattern, Compiled, first, string) :-
  337    re_compiled(Pattern/r, Compiled).
  338
  339replace_flags([], [], All, Type) :-
  340    default(All, first),
  341    default(Type, string).
  342replace_flags([H|T0], T, All, Type) :-
  343    (   all(H, All)
  344    ->  true
  345    ;   type(H, Type)
  346    ),
  347    !,
  348    replace_flags(T0, T, All, Type).
  349replace_flags([H|T0], [H|T], All, Type) :-
  350    replace_flags(T0, T, All, Type).
  351
  352all(g, all).
  353type(a, atom).
  354type(s, string).
  355
  356default(Val, Val) :- !.
  357default(_, _).
  358
  359replace(State, With, Dict, [Skipped|Parts], T) :-
  360    State = state(String, _, _Type),
  361    copy_term(With, r(PartsR, Skel)),
  362    Skel :< Dict,
  363    range_strings(PartsR, String, Parts, T),
  364    skipped(State, Dict.0, Skipped).
  365
  366range_strings([], _, T, T).
  367range_strings([Start-Len|T0], String, [S|T1], T) :-
  368    !,
  369    sub_string(String, Start, Len, _, S),
  370    range_strings(T0, String, T1, T).
  371range_strings([S|T0], String, [S|T1], T) :-
  372    range_strings(T0, String, T1, T).
  373
  374parts_to_output(string, Parts, String) :-
  375    atomics_to_string(Parts, String).
  376parts_to_output(atom, Parts, String) :-
  377    atomic_list_concat(Parts, String).
  378
  379%!  compile_replacement(+With, -Compiled)
  380%
  381%   Compile the replacement specification into  a specification that can
  382%   be processed quickly. The compiled expressions are cached and may be
  383%   reclaimed using re_flush/0.
  384
  385:- dynamic replacement_cache/2.  386:- volatile replacement_cache/2.  387
  388compile_replacement(With, Compiled) :-
  389    replacement_cache(With, Compiled),
  390    !.
  391compile_replacement(With, Compiled) :-
  392    compile_replacement_nocache(With, Compiled),
  393    assertz(replacement_cache(With, Compiled)).
  394
  395compile_replacement_nocache(With, r(Parts, Extract)) :-
  396    string_codes(With, Codes),
  397    phrase(replacement_parts(Parts, Pairs), Codes),
  398    dict_pairs(Extract, _, Pairs).
  399
  400replacement_parts(Parts, Extract) -->
  401    string(HCodes),
  402    (   ("\\" ; "$"),
  403        capture_name(Name)
  404    ->  !,
  405        { add_part(HCodes, Parts, T0),
  406          T0 = [Repl|T1],
  407          Extract = [Name-Repl|Extract1]
  408        },
  409        replacement_parts(T1, Extract1)
  410    ;   eos
  411    ->  !,
  412        { add_part(HCodes, Parts, []),
  413          Extract = []
  414        }
  415    ).
  416
  417add_part([], Parts, Parts) :-
  418    !.
  419add_part(Codes, [H|T], T) :-
  420    string_codes(H, Codes).
  421
  422capture_name(Name) -->
  423    "{",
  424    (   digit(D0)
  425    ->  digits(DL),
  426        "}",
  427        { number_codes(Name, [D0|DL]) }
  428    ;   letter(A0),
  429        alnums(AL),
  430        "}",
  431        { atom_codes(Name, [A0|AL]) }
  432    ).
  433capture_name(Name) -->
  434    digit(D0),
  435    !,
  436    digits(DL),
  437    { number_codes(Name, [D0|DL]) }.
  438capture_name(Name) -->
  439    letter(A0),
  440    !,
  441    alnums(AL),
  442    { atom_codes(Name, [A0|AL]) }.
  443
  444letter(L) -->
  445    [L],
  446    { between(0'a,0'z,L)
  447    ; between(0'A,0'Z,L)
  448    ; L == 0'_
  449    }, !.
  450
  451alnums([H|T]) -->
  452    alnum(H),
  453    !,
  454    alnums(T).
  455alnums([]) -->
  456    "".
  457
  458alnum(L) -->
  459    [L],
  460    { between(0'a,0'z,L)
  461    ; between(0'A,0'Z,L)
  462    ; between(0'0,0'9,L)
  463    ; L == 0'_
  464    }, !.
  465
  466%!  re_compile(+Pattern, -Regex, +Options) is det.
  467%
  468%   Compiles Pattern to a Regex _blob_ of type =regex= (see blob/2).
  469%   Defined Options are  defined  below.   Please  consult  the PCRE
  470%   documentation for details.
  471%
  472%     * anchored(Bool)
  473%     Force pattern anchoring
  474%     * bsr(Mode)
  475%     If =anycrlf=, \R only matches CR, LF or CRLF.  If =unicode=,
  476%     \R matches all Unicode line endings.
  477%     * caseless(Bool)
  478%     If =true=, do caseless matching.
  479%     * dollar_endonly(Bool)
  480%     If =true=, $ not to match newline at end
  481%     * dotall(Bool)
  482%     If =true=, . matches anything including NL
  483%     * dupnames(Bool)
  484%     If =true=, allow duplicate names for subpatterns
  485%     * extended(Bool)
  486%     If =true=, ignore white space and # comments
  487%     * extra(Bool)
  488%     If =true=, PCRE extra features (not much use currently)
  489%     * firstline(Bool)
  490%     If =true=, force matching to be before newline
  491%     * compat(With)
  492%     If =javascript=, JavaScript compatibility
  493%     * multiline(Bool)
  494%     If =true=, ^ and $ match newlines within data
  495%     * newline(Mode)
  496%     If =any=, recognize any Unicode newline sequence,
  497%     if =anycrlf= (default), recognize CR, LF, and CRLF as newline
  498%     sequences, if =cr=, recognize CR, if =lf=, recognize
  499%     LF and finally if =crlf= recognize CRLF as newline.
  500%     * ucp(Bool)
  501%     If =true=, use Unicode properties for \d, \w, etc.
  502%     * ungreedy(Bool)
  503%     If =true=, invert greediness of quantifiers
  504%
  505%   In addition to the options above that directly map to pcre flags the
  506%   following options are processed:
  507%
  508%     * optimize(Bool)
  509%     If `true`, _study_ the regular expression.
  510%     * capture_type(+Type)
  511%     How to return the matched part of the input and possibly captured
  512%     groups in there.  Possible values are:
  513%       - string
  514%       Return the captured string as a string (default).
  515%       - atom
  516%       Return the captured string as an atom.
  517%       - range
  518%       Return the captured string as a pair `Start-Length`.  Note
  519%       the we use ``Start-Length` rather than the more conventional
  520%       `Start-End` to allow for immediate use with sub_atom/5 and
  521%       sub_string/5.
  522%       - term
  523%       Parse the captured string as a Prolog term.  This is notably
  524%       practical if you capture a number.
  525%
  526%    The `capture_type` specifies the  default   for  this  pattern. The
  527%    interface supports a different type for   each  _named_ group using
  528%    the syntax =|(?<name_T>...)|=, where =T= is   one  of =S= (string),
  529%    =A= (atom), =I= (integer), =F= (float),   =N=  (number), =T= (term)
  530%    and =R= (range). In the current implementation =I=, =F= and =N= are
  531%    synonyms for =T=. Future versions may   act different if the parsed
  532%    value is not of the requested numeric type.
  533
  534%!  re_compiled(+Spec, --Regex) is det.
  535%
  536%   Create a compiled regex from a specification.  Cached compiled
  537%   regular expressions can be reclaimed using re_flush/0.
  538
  539:- dynamic re_pool/3.  540:- volatile re_pool/3.  541
  542re_compiled(Regex, Regex) :-
  543    blob(Regex, regex),
  544    !.
  545re_compiled(Text/Flags, Regex) :-
  546    must_be(text, Text),
  547    must_be(atom, Flags),
  548    re_pool(Text, Flags, Regex),
  549    !.
  550re_compiled(Text/Flags, Regex) :-
  551    !,
  552    re_flags_options(Flags, Options),
  553    re_compile(Text, Regex, Options),
  554    assertz(re_pool(Text, Flags, Regex)).
  555re_compiled(Text, Regex) :-
  556    must_be(text, Text),
  557    re_pool(Text, '', Regex),
  558    !.
  559re_compiled(Text, Regex) :-
  560    re_compiled(Text/'', Regex).
  561
  562re_flags_options(Flags, Options) :-
  563    atom_chars(Flags, Chars),
  564    maplist(re_flag_option, Chars, Options).
  565
  566re_flag_option(Flag, Option) :-
  567    re_flag_option_(Flag, Option),
  568    !.
  569re_flag_option(Flag, _) :-
  570    existence_error(re_flag, Flag).
  571
  572re_flag_option_(i, caseless(true)).
  573re_flag_option_(m, multiline(true)).
  574re_flag_option_(x, extended(true)).
  575re_flag_option_(s, dotall(true)).
  576re_flag_option_(a, capture_type(atom)).
  577re_flag_option_(r, capture_type(range)).
  578re_flag_option_(t, capture_type(term)).
  579
  580%!  re_flush
  581%
  582%   Clean pattern and replacement caches.
  583%
  584%   @tbd Flush automatically if the cache becomes too large.
  585
  586re_flush :-
  587    retractall(replacement_cache(_,_)),
  588    retractall(re_pool(_,_,_)).
  589
  590%!  re_config(+Term)
  591%
  592%   Extract configuration information from the pcre  library. Term is of
  593%   the form Name(Value). Name  is   derived  from the =|PCRE_CONFIG_*|=
  594%   constant after removing =PCRE_CONFIG_= and mapping the name to lower
  595%   case, e.g. `utf8`, `unicode_properties`,  etc.   Value  is  either a
  596%   Prolog boolean, integer or atom.
  597%
  598%   Finally, the functionality of pcre_version()  is available using the
  599%   configuration name `version`.
  600%
  601%   @see `man pcreapi` for details