1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2017, VU University Amsterdam 7 All rights reserved. 8 9 Redistribution and use in source and binary forms, with or without 10 modification, are permitted provided that the following conditions 11 are met: 12 13 1. Redistributions of source code must retain the above copyright 14 notice, this list of conditions and the following disclaimer. 15 16 2. Redistributions in binary form must reproduce the above copyright 17 notice, this list of conditions and the following disclaimer in 18 the documentation and/or other materials provided with the 19 distribution. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 POSSIBILITY OF SUCH DAMAGE. 33*/ 34 35:- module(pcre, 36 [ re_match/2, % +Regex, +String 37 re_match/3, % +Regex, +String, +Options 38 re_matchsub/4, % +Regex, +String, -Subs, +Options 39 re_foldl/6, % :Goal, +Regex, +String, ?V0, ?V, +Options 40 re_split/3, % +Pattern, +String, -Split:list 41 re_split/4, % +Pattern, +String, -Split:list, +Options 42 re_replace/4, % +Pattern, +With, +String, -NewString 43 44 re_compile/3, % +Pattern, -Regex, +Options 45 re_flush/0, 46 re_config/1 % ?Config 47 ]). 48:- use_module(library(error)). 
:- use_module(library(apply)).
:- use_module(library(dcg/basics)).
:- use_foreign_library(foreign(pcre4pl)).

% re_foldl/6 calls its Goal with three extra arguments (the match dict
% and the two accumulator values), so the first argument is declared
% as a 3-closure. The remaining argument modes follow the export list:
% +Regex, +String, ?V0, ?V, +Options.
:- meta_predicate
    re_foldl(3, +, +, ?, ?, +).
% Option declarations used for static and dynamic option checking.
%
% re_match/3 takes the match-time options only.
:- predicate_options(re_match/3, 3,
                     [ anchored(boolean),
                       bol(boolean),
                       bsr(oneof([anycrlf,unicode])),
                       empty(boolean),
                       empty_atstart(boolean),
                       eol(boolean),
                       newline(oneof([any,anycrlf,cr,lf,crlf])),
                       start(integer)
                     ]).
% re_compile/3 takes the compile-time options.  capture_type/1 is
% declared as well: it is produced by the `a`, `r` and `t` pattern
% flags (see re_flag_option_/2) and passed on to re_compile/3, so it
% must not be reported as an unknown option.
:- predicate_options(re_compile/3, 3,
                     [ anchored(boolean),
                       bsr(oneof([anycrlf,unicode])),
                       capture_type(oneof([atom,string,range,term])),
                       caseless(boolean),
                       dollar_endonly(boolean),
                       dotall(boolean),
                       dupnames(boolean),
                       extended(boolean),
                       extra(boolean),
                       firstline(boolean),
                       compat(oneof([javascript])),
                       multiline(boolean),
                       newline(oneof([any,anycrlf,cr,lf,crlf])),
                       ucp(boolean),
                       ungreedy(boolean)
                     ]).
?- re_match("^needle"/i, "Needle in a haystack"). true.
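Options:

  * anchored(Bool): if `true`, match only at the first position.
  * bol(Bool): subject string is the beginning of a line (default `false`).
  * bsr(Mode): if `anycrlf`, \R only matches CR, LF or CRLF. If `unicode`,
    \R matches all Unicode line endings.
  * empty(Bool): an empty string is a valid match (default `true`).
  * empty_atstart(Bool): an empty string at the start of the subject is a
    valid match (default `true`).
  * eol(Bool): subject string is the end of a line (default `false`).
  * newline(Mode): if `any`, recognize any Unicode newline sequence,
    if `anycrlf`, recognize CR, LF, and CRLF as newline
    sequences, if `cr`, recognize CR, if `lf`, recognize
    LF and finally if `crlf`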
%   recognize CRLF as newline.

%!  re_match(+Regex, +String).
%!  re_match(+Regex, +String, +Options).
%
%   Succeed if String matches Regex.  Regex is first resolved to a
%   compiled regular expression by re_compiled/2, so it may be a
%   compiled regex blob, a pattern or a Pattern/Flags term.  The
%   two-argument version passes an empty option list.
%
%   The actual matching is delegated to re_match_/3, which is not
%   defined in this file (presumably provided by the pcre4pl foreign
%   library).

re_match(Pattern, Subject) :-
    re_match(Pattern, Subject, []).
re_match(Pattern, Subject, Opts) :-
    re_compiled(Pattern, Re),
    re_match_(Re, Subject, Opts).
capture_type(Type)
option passed to re_compile/3, may be specified
using flags if Regex is of the form Pattern/Flags and may be
specified at the level of individual captures using a naming
convention for the capture name. See re_compile/3 for details.
The example below exploits the typed groups to parse a date specification:
?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) - (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/x, "2017-04-20", Sub, []). Sub = re_match{0:"2017-04-20", date:"2017-04-20", day:20, month:4, year:2017}.
%!  re_matchsub(+Regex, +String, -Subs, +Options).
%
%   Match String against Regex and unify Subs with a dict with tag
%   `re_match` holding the matched parts.  Regex is resolved to a
%   compiled regular expression by re_compiled/2.  The Key-Value pairs
%   that make up the dict come from re_matchsub_/4, which is not
%   defined in this file (presumably provided by the pcre4pl foreign
%   library).

re_matchsub(Pattern, Subject, Subs, Opts) :-
    re_compiled(Pattern, Re),
    re_matchsub_(Re, Subject, KeyValues, Opts),
    dict_pairs(Subs, re_match, KeyValues).
call(Goal, Dict1, V0, V1), call(Goal, Dict2, V1, V2), ... call(Goal, Dictn, Vn, V).
This predicate is used to implement re_split/4 and re_replace/4. For example, we can count all matches of a Regex on String using this code:
re_match_count(Regex, String, Count) :- re_foldl(increment, Regex, String, 0, Count, []). increment(_Match, V0, V1) :- V1 is V0+1.
After which we can query
?- re_match_count("a", "aap", X). X = 2.
%!  re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options).
%
%   Fold Goal over the matches of Regex on String, threading an
%   accumulator from V0 to V.  Regex is resolved to a compiled regular
%   expression by re_compiled/2; the iteration itself is done by
%   re_foldl_/6, which is not defined in this file (presumably
%   provided by the pcre4pl foreign library).

re_foldl(Goal, Pattern, Subject, V0, V, Opts) :-
    re_compiled(Pattern, Re),
    re_foldl_(Re, Subject, Goal, V0, V, Opts).

% re_call_folder/4 is declared public because nothing in this file
% calls it; presumably re_foldl_/6 calls it back for each match.
:- public re_call_folder/4.

%   re_call_folder(:Goal, +Pairs, ?V0, ?V1)
%
%   Turn the Key-Value list Pairs into a dict with tag `re_match` and
%   call Goal with that dict and the accumulator pair V0, V1.

re_call_folder(Goal, KeyValues, Acc0, Acc) :-
    dict_pairs(Match, re_match, KeyValues),
    call(Goal, Match, Acc0, Acc).
?- re_split("a+", "abaac", Split, []). Split = ["","a","b","aa","c"]. ?- re_split(":\\s*"/n, "Age: 33", Split, []). Split = ['Age', ': ', 33].
%!  re_split(+Pattern, +String, -Split).
%!  re_split(+Pattern, +String, -Split, +Options).
%
%   Split String on the matches of Pattern.  Split alternates between
%   the text skipped before a match and the matched separator and
%   ends with the text after the last match.  The pattern flags `a`,
%   `s` and `n` select whether the elements are returned as atoms,
%   strings or "names" (a number if the text parses as one, otherwise
%   an atom).  The default is string.

re_split(Pattern, Subject, Split) :-
    re_split(Pattern, Subject, Split, []).
re_split(Pattern, Subject, Split, Opts) :-
    range_regex(Pattern, Re, Type),
    % Argument 2 of the state term is the offset just after the
    % previous match; it is updated destructively by skipped/3.
    State = state(Subject, 0, Type),
    re_foldl(split(State), Re, Subject, Split, [Tail], Opts),
    arg(2, State, TailStart),
    typed_sub(Type, Subject, TailStart, _, 0, Tail).

%   range_regex(+Pattern, -Compiled, -Type)
%
%   Compile Pattern with the `r` flag prepended, so matches are
%   reported as Start-Length ranges.  The result-type flags are
%   removed from the flag list and reported in Type.

range_regex(Pattern/Flags, Re, Type) :- !,
    atom_chars(Flags, FlagChars),
    replace_flags(FlagChars, RegexChars, Type),
    atom_chars(RangeFlags, [r|RegexChars]),
    re_compiled(Pattern/RangeFlags, Re).
range_regex(Pattern, Re, string) :-
    re_compiled(Pattern/r, Re).

%   replace_flags(+Flags0, -Flags, -Type)
%
%   Flags is Flags0 without the characters that select the result
%   type.  Type is bound by the first such character and defaults to
%   `string` (default/2 is defined further down in this file).

replace_flags([], [], Type) :-
    default(Type, string).
replace_flags([Flag|Flags0], Flags, Type) :-
    split_type(Flag, Type),
    !,
    replace_flags(Flags0, Flags, Type).
replace_flags([Flag|Flags0], [Flag|Flags], Type) :-
    replace_flags(Flags0, Flags, Type).

% Flag character -> result type of the split parts.
split_type(a, atom).
split_type(s, string).
split_type(n, name).

%   split(+State, +Match, -Parts, ?Tail)
%
%   Fold step for re_split/4: add the text skipped before the match
%   and the matched separator to the difference list Parts\Tail.

split(State, Match, [Before,Sep|Tail], Tail) :-
    Range = Match.0,
    matched(State, Range, Sep),
    skipped(State, Range, Before).

% Extract the matched range from the subject in the requested type.
matched(state(Subject, _, Type), Start-Len, Matched) :-
    typed_sub(Type, Subject, Start, Len, _, Matched).

%   skipped(+State, +Range, -Skipped)
%
%   Skipped is the text between the end of the previous match and the
%   start of this one.  Records the end of this match in State.

skipped(State, Start-Len, Skipped) :-
    State = state(Subject, Here, Type),
    SkipLen is Start-Here,
    NextHere is Start+Len,
    typed_sub(Type, Subject, Here, SkipLen, _, Skipped),
    % nb_setarg/3 so that the update is not undone on backtracking.
    nb_setarg(2, State, NextHere).

%   typed_sub(+Type, +Haystack, ?Before, ?Len, ?After, -Sub)
%
%   As sub_string/5 or sub_atom/5, returning Sub as Type.

typed_sub(string, Haystack, B, L, A, Sub) :-
    sub_string(Haystack, B, L, A, Sub).
typed_sub(atom, Haystack, B, L, A, Sub) :-
    sub_atom(Haystack, B, L, A, Sub).
typed_sub(name, Haystack, B, L, A, Value) :-
    sub_string(Haystack, B, L, A, Text),
    (   number_string(Number, Text)
    ->  Value = Number
    ;   atom_string(Value, Text)
    ).
%!  re_replace(+Pattern, +With, +String, -NewString).
%
%   Replace matches of Pattern in String by With.  With the pattern
%   flag `g` all matches are replaced, otherwise only the first.  The
%   flags `a` and `s` select whether NewString is an atom or a string
%   (default string).  With may refer to captured groups; see
%   compile_replacement/2 for how it is parsed.

re_replace(Pattern, With, Subject, NewString) :-
    range_regex(Pattern, Re, All, Type),
    compile_replacement(With, Replacement),
    % Argument 2 of the state term is the offset just after the
    % previous match; it is updated destructively by skipped/3.
    State = state(Subject, 0, Type),
    (   All == all
    ->  re_foldl(replace(State, Replacement), Re, Subject, Parts, [Tail], [])
    ;   re_matchsub(Re, Subject, Match, [])
    ->  replace(State, Replacement, Match, Parts, [Tail])
    ;   Repl = false                     % no match: copy input
    ),
    (   Repl == false
    ->  parts_to_output(Type, [Subject], NewString)
    ;   arg(2, State, TailStart),
        sub_string(Subject, TailStart, _, 0, Tail),
        parts_to_output(Type, Parts, NewString)
    ).

%   range_regex(+Pattern, -Compiled, -All, -Type)
%
%   Compile Pattern with the `r` flag prepended, so matches are
%   reported as Start-Length ranges.  The flags handled by
%   re_replace/4 itself are removed from the flag list and reported
%   in All (`all` or `first`) and Type (`atom` or `string`).

range_regex(Pattern/Flags, Re, All, Type) :- !,
    atom_chars(Flags, FlagChars),
    replace_flags(FlagChars, RegexChars, All, Type),
    atom_chars(RangeFlags, [r|RegexChars]),
    re_compiled(Pattern/RangeFlags, Re).
range_regex(Pattern, Re, first, string) :-
    re_compiled(Pattern/r, Re).

%   replace_flags(+Flags0, -Flags, -All, -Type)
%
%   Flags is Flags0 without the characters handled here.  All and
%   Type are bound by those characters and otherwise default to
%   `first` and `string`.

replace_flags([], [], All, Type) :-
    default(All, first),
    default(Type, string).
replace_flags([Flag|Flags0], Flags, All, Type) :-
    (   all(Flag, All)
    ->  true
    ;   type(Flag, Type)
    ),
    !,
    replace_flags(Flags0, Flags, All, Type).
replace_flags([Flag|Flags0], [Flag|Flags], All, Type) :-
    replace_flags(Flags0, Flags, All, Type).

% Flag character -> replace all matches.
all(g, all).
% Flag character -> type of the resulting text.
type(a, atom).
type(s, string).

%   default(?Value, +Default)
%
%   Bind Value to Default if it is still unbound (or already equal);
%   succeed without binding anything otherwise.

default(Value, Value) :- !.
default(_, _).

%   replace(+State, +With, +Match, -Parts, ?Tail)
%
%   Fold step for re_replace/4: add the text skipped before the match
%   followed by the instantiated replacement to the difference list
%   Parts\Tail.  With is copied so that its capture variables are
%   fresh for every match.

replace(State, With, Match, [Skipped|Parts], Tail) :-
    State = state(Subject, _, _Type),
    copy_term(With, r(Template, Skeleton)),
    Skeleton :< Match,                  % bind the referenced captures
    range_strings(Template, Subject, Parts, Tail),
    Range = Match.0,
    skipped(State, Range, Skipped).

%   range_strings(+Template, +String, -Parts, ?Tail)
%
%   Copy Template to the difference list Parts\Tail, replacing each
%   Start-Length range by the corresponding substring of String.

range_strings([], _, Tail, Tail).
range_strings([Start-Len|Template], Subject, [Sub|Parts], Tail) :-
    !,
    sub_string(Subject, Start, Len, _, Sub),
    range_strings(Template, Subject, Parts, Tail).
range_strings([Text|Template], Subject, [Text|Parts], Tail) :-
    range_strings(Template, Subject, Parts, Tail).

% Concatenate the collected parts into the requested output type.
parts_to_output(string, Parts, Text) :-
    atomics_to_string(Parts, Text).
parts_to_output(atom, Parts, Text) :-
    atomic_list_concat(Parts, Text).
% Cache of compiled replacement specifications, keyed on the original
% replacement text.  Cleared by re_flush/0.
:- dynamic   replacement_cache/2.
:- volatile  replacement_cache/2.

%   compile_replacement(+With, -Compiled)
%
%   Compiled is the compiled form of the replacement text With, taken
%   from the cache if available and added to the cache otherwise.

compile_replacement(With, Compiled) :-
    replacement_cache(With, Compiled),
    !.
compile_replacement(With, Compiled) :-
    compile_replacement_nocache(With, Compiled),
    assertz(replacement_cache(With, Compiled)).

%   compile_replacement_nocache(+With, -Compiled)
%
%   Compiled is a term r(Parts, Extract).  Parts is a list of literal
%   strings and unbound variables, one variable per capture reference.
%   Extract is a dict that maps the referenced capture names and
%   numbers to those variables.
%
%   The same capture may be referenced more than once (e.g. `"$1-$1"`).
%   dict_pairs/3 raises a duplicate_key error on repeated keys, so
%   repeated references are first made to share a single variable and
%   the key list is made unique.

compile_replacement_nocache(With, r(Parts, Extract)) :-
    string_codes(With, Codes),
    phrase(replacement_parts(Parts, Pairs0), Codes),
    unique_capture_pairs(Pairs0, Pairs),
    dict_pairs(Extract, _, Pairs).

% Unify the variables of references to the same capture and keep one
% Key-Var pair per capture.
unique_capture_pairs(Pairs0, Pairs) :-
    keysort(Pairs0, Sorted),            % equal keys become adjacent
    merge_equal_keys(Sorted, Pairs).

merge_equal_keys([], []).
merge_equal_keys([Key-Var|T0], [Key-Var|T]) :-
    unify_same_key(T0, Key, Var, T1),
    merge_equal_keys(T1, T).

% Consume the leading pairs whose key is Key, unifying their variable
% with Var.
unify_same_key([Key1-Var1|T0], Key, Var, T) :-
    Key1 == Key,
    !,
    Var1 = Var,
    unify_same_key(T0, Key, Var, T).
unify_same_key(T, _, _, T).

%   replacement_parts(-Parts, -Extract)//
%
%   Parse the replacement text.  A capture reference is `\` or `$`
%   followed by a capture name as accepted by capture_name//1.  A `\`
%   or `$` that is not followed by a valid capture name is literal
%   text.

replacement_parts(Parts, Extract) -->
    string(HCodes),
    (   ("\\" ; "$"),
        capture_name(Name)
    ->  !,
        { add_part(HCodes, Parts, T0),
          T0 = [Repl|T1],
          Extract = [Name-Repl|Extract1]
        },
        replacement_parts(T1, Extract1)
    ;   eos
    ->  !,
        { add_part(HCodes, Parts, []),
          Extract = []
        }
    ).

% Add the literal text Codes as a string to the difference list,
% unless it is empty.
add_part([], Parts, Parts) :-
    !.
add_part(Codes, [H|T], T) :-
    string_codes(H, Codes).

%   capture_name(-Name)//
%
%   Name is a capture number or name, written as `{N}`, `{name}`, `N`
%   or `name`.  Numbers yield an integer, names an atom.

capture_name(Name) -->
    "{",
    (   digit(D0)
    ->  digits(DL),
        "}",
        { number_codes(Name, [D0|DL]) }
    ;   letter(A0),
        alnums(AL),
        "}",
        { atom_codes(Name, [A0|AL]) }
    ).
capture_name(Name) -->
    digit(D0),
    !,
    digits(DL),
    { number_codes(Name, [D0|DL]) }.
capture_name(Name) -->
    letter(A0),
    !,
    alnums(AL),
    { atom_codes(Name, [A0|AL]) }.

% An ASCII letter or underscore.
letter(L) -->
    [L],
    {   between(0'a,0'z,L)
    ;   between(0'A,0'Z,L)
    ;   L == 0'_
    }, !.

% A possibly empty sequence of alnum//1 codes, as long as possible.
alnums([H|T]) -->
    alnum(H),
    !,
    alnums(T).
alnums([]) -->
    "".

% An ASCII letter, digit or underscore.
alnum(L) -->
    [L],
    {   between(0'a,0'z,L)
    ;   between(0'A,0'Z,L)
    ;   between(0'0,0'9,L)
    ;   L == 0'_
    }, !.
regex
(see blob/2).
The supported Options are listed below. Please consult the PCRE
documentation for details.
  * anchored(Bool): if `true`, match only at the first position.
  * bsr(Mode): if `anycrlf`, \R only matches CR, LF or CRLF. If `unicode`,
    \R matches all Unicode line endings.
  * caseless(Bool): if `true`, do caseless matching.
  * dollar_endonly(Bool): if `true`, $ not to match newline at end.
  * dotall(Bool): if `true`, . matches anything including NL.
  * dupnames(Bool): if `true`, allow duplicate names for subpatterns.
  * extended(Bool): if `true`, ignore white space and # comments.
  * extra(Bool): if `true`, PCRE extra features (not much use currently).
  * firstline(Bool): if `true`, force matching to be before newline.
  * compat(With): if `javascript`, JavaScript compatibility.
  * multiline(Bool): if `true`, ^ and $ match newlines within data.
  * newline(Mode): if `any`, recognize any Unicode newline sequence,
    if `anycrlf` (default), recognize CR, LF, and CRLF as newline
    sequences, if `cr`, recognize CR, if `lf`, recognize
    LF and finally if `crlf` recognize CRLF as newline.
  * ucp(Bool): if `true`, use Unicode properties for \d, \w, etc.
  * ungreedy(Bool): if `true`, invert greediness of quantifiers.

In addition to the options above that directly map to pcre flags the following options are processed:
If `true`, study the regular expression.
`Start-Length`. Note
that we use
`Start-Length` rather than the more conventional
`Start-End`
to allow for immediate use with sub_atom/5 and
sub_string/5.
The capture_type
specifies the default for this pattern. The
interface supports a different type for each named group using
the syntax (?<name_T>...)
, where T
is one of S
(string),
A
(atom), I
(integer), F
(float), N
(number), T
(term)
and R
(range). In the current implementation I
, F
and N
are
synonyms for T
. Future versions may act differently if the parsed
value is not of the requested numeric type.
% re_pool(Text, Flags, Regex): cache of compiled regular expressions,
% keyed on the pattern text and the flags atom.  Cleared by re_flush/0.
:- dynamic   re_pool/3.
:- volatile  re_pool/3.

%   re_compiled(+Spec, -Regex)
%
%   Regex is the compiled regular expression for Spec.  Spec is
%   either a compiled regex blob (returned as is), a term Text/Flags
%   or plain Text, which is handled as Text/''.  Compiled expressions
%   are looked up in, and added to, the re_pool/3 cache.
%
%   @error type_error if Text is not text or Flags is not an atom.

re_compiled(Compiled, Compiled) :-
    blob(Compiled, regex),
    !.
re_compiled(Pattern/Flags, Compiled) :-
    must_be(text, Pattern),
    must_be(atom, Flags),
    re_pool(Pattern, Flags, Compiled),
    !.
re_compiled(Pattern/Flags, Compiled) :-
    !,
    re_flags_options(Flags, CompileOptions),
    re_compile(Pattern, Compiled, CompileOptions),
    assertz(re_pool(Pattern, Flags, Compiled)).
re_compiled(Pattern, Compiled) :-
    must_be(text, Pattern),
    re_pool(Pattern, '', Compiled),
    !.
re_compiled(Pattern, Compiled) :-
    re_compiled(Pattern/'', Compiled).

%   re_flags_options(+Flags, -Options)
%
%   Translate the characters of the atom Flags into a list of
%   re_compile/3 options, one option per character.

re_flags_options(Flags, Options) :-
    atom_chars(Flags, FlagChars),
    maplist(re_flag_option, FlagChars, Options).

%   re_flag_option(+Flag, -Option)
%
%   @error existence_error(re_flag, Flag) if Flag is not a known flag
%   character.

re_flag_option(Flag, Option) :-
    (   re_flag_option_(Flag, Option)
    ->  true
    ;   existence_error(re_flag, Flag)
    ).

% Flag character -> re_compile/3 option.
re_flag_option_(i, caseless(true)).
re_flag_option_(m, multiline(true)).
re_flag_option_(x, extended(true)).
re_flag_option_(s, dotall(true)).
re_flag_option_(a, capture_type(atom)).
re_flag_option_(r, capture_type(range)).
re_flag_option_(t, capture_type(term)).
%!  re_flush.
%
%   Empty both caches kept by this module: the compiled regular
%   expressions in re_pool/3 and the compiled replacement
%   specifications in replacement_cache/2.

re_flush :-
    retractall(re_pool(_,_,_)),
    retractall(replacement_cache(_,_)).
PCRE_CONFIG_*
constant after removing `PCRE_CONFIG_` and mapping the name to lower
case, e.g. utf8
, unicode_properties
, etc. Value is either a
Prolog boolean, integer or atom.
Finally, the functionality of pcre_version()
is available using the
configuration name version
.
Perl compatible regular expression matching for SWI-Prolog
This module provides an interface to the PCRE (Perl Compatible Regular Expression) library. This Prolog interface provides an almost comprehensive wrapper around PCRE.
Regular expressions are created from a pattern and options and represented as a SWI-Prolog blob. This implies they are subject to (atom) garbage collection. Compiled regular expressions can safely be used in multiple threads. Most predicates accept either an explicitly compiled regular expression, a pattern or a term Pattern/Flags. In the latter two cases a regular expression blob is created and stored in a cache. The cache can be cleared using re_flush/0.