1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2017, VU University Amsterdam 7 All rights reserved. 8 9 Redistribution and use in source and binary forms, with or without 10 modification, are permitted provided that the following conditions 11 are met: 12 13 1. Redistributions of source code must retain the above copyright 14 notice, this list of conditions and the following disclaimer. 15 16 2. Redistributions in binary form must reproduce the above copyright 17 notice, this list of conditions and the following disclaimer in 18 the documentation and/or other materials provided with the 19 distribution. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 POSSIBILITY OF SUCH DAMAGE. 33*/ 34 35:- module(pcre, 36 [ re_match/2, % +Regex, +String 37 re_match/3, % +Regex, +String, +Options 38 re_matchsub/4, % +Regex, +String, -Subs, +Options 39 re_foldl/6, % :Goal, +Regex, +String, ?V0, ?V, +Options 40 re_split/3, % +Pattern, +String, -Split:list 41 re_split/4, % +Pattern, +String, -Split:list, +Options 42 re_replace/4, % +Pattern, +With, +String, -NewString 43 44 re_compile/3, % +Pattern, -Regex, +Options 45 re_flush/0, 46 re_config/1 % ?Config 47 ]). 48:- use_module(library(error)). 49:- use_module(library(apply)). 50:- use_module(library(dcg/basics)). 51:- use_foreign_library(foreign(pcre4pl)). 52 53:- meta_predicate 54 re_foldl( , , , , , ). 55 56/** <module> Perl compatible regular expression matching for SWI-Prolog 57 58This module provides an interface to the [PCRE](http://www.pcre.org/) 59(Perl Compatible Regular Expression) library. This Prolog interface 60provides an almost comprehensive wrapper around PCRE. 61 62Regular expressions are created from a pattern and options and 63represented as a SWI-Prolog _blob_. This implies they are subject to 64(atom) garbage collection. Compiled regular expressions can safely be 65used in multiple threads. Most predicates accept both an explicitly 66compiled regular expression, a pattern or a term Pattern/Flags. In the 67latter two cases a regular expression _blob_ is created and stored in a 68cache. The cache can be cleared using re_flush/0. 69 70@see `man pcre` for details. 71*/ 72 73:- predicate_options(re_match/3, 3, 74 [ anchored(boolean), 75 bol(boolean), 76 bsr(oneof([anycrlf,unicode])), 77 empty(boolean), 78 empty_atstart(boolean), 79 eol(boolean), 80 newline(oneof([any,anycrlf,cr,lf,crlf])), 81 start(integer) 82 ]). 83:- predicate_options(re_compile/3, 3, 84 [ anchored(boolean), 85 bsr(oneof([anycrlf,unicode])), 86 caseless(boolean), 87 dollar_endonly(boolean), 88 dotall(boolean), 89 dupnames(boolean), 90 extended(boolean), 91 extra(boolean), 92 firstline(boolean), 93 compat(oneof([javascript])), 94 multiline(boolean), 95 newline(oneof([any,anycrlf,cr,lf,crlf])), 96 ucp(boolean), 97 ungreedy(boolean) 98 ]). 99 100 101%! re_match(+Regex, +String) is semidet. 102%! re_match(+Regex, +String, +Options) is semidet. 103% 104% Succeeds if String matches Regex. For example: 105% 106% ``` 107% ?- re_match("^needle"/i, "Needle in a haystack"). 108% true. 109% ``` 110% 111% Options: 112% 113% * anchored(Bool) 114% If =true=, match only at the first position 115% * bol(Bool) 116% Subject string is the beginning of a line (default =false=) 117% * bsr(Mode) 118% If =anycrlf=, \R only matches CR, LF or CRLF. If =unicode=, 119% \R matches all Unicode line endings. 120% Subject string is the end of a line (default =false=) 121% * empty(Bool) 122% An empty string is a valid match (default =true=) 123% * empty_atstart(Bool) 124% An empty string at the start of the subject is a valid match 125% (default =true=) 126% * eol(Bool) 127% Subject string is the end of a line (default =false=) 128% * newline(Mode) 129% If =any=, recognize any Unicode newline sequence, 130% if =anycrlf=, recognize CR, LF, and CRLF as newline 131% sequences, if =cr=, recognize CR, if =lf=, recognize 132% LF and finally if =crlf= recognize CRLF as newline. 133% * start(+From) 134% Start at the given character index 135% 136% @arg Regex is the output of re_compile/3, a pattern or a term 137% Pattern/Flags, where Pattern is an atom or string. The defined flags 138% and there related option for re_compile/3 are below. 139% 140% - *x*: extended(true) 141% - *i*: caseless(true) 142% - *m*: multiline(true) 143% - *s*: dotall(true) 144% - *a*: capture_type(atom) 145% - *r*: capture_type(range) 146% - *t*: capture_type(term) 147 148re_match(Regex, String) :- 149 re_match(Regex, String, []). 150re_match(Regex, String, Options) :- 151 re_compiled(Regex, Compiled), 152 re_match_(Compiled, String, Options). 153 154%! re_matchsub(+Regex, +String, -Sub:dict, +Options) is semidet. 155% 156% Match String against Regex. On success, Sub is a dict containing 157% integer keys for the numbered capture group and atom keys for the 158% named capture groups. The associated value is determined by the 159% capture_type(Type) option passed to re_compile/3, may be specified 160% using flags if Regex is of the form Pattern/Flags and may be 161% specified at the level of individual captures using a naming 162% convention for the caption name. See re_compile/3 for details. 163% 164% The example below exploits the typed groups to parse a date 165% specification: 166% 167% ``` 168% ?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) - 169% (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/e, 170% "2017-04-20", Sub, []). 171% Sub = re_match{0:"2017-04-20", date:"2017-04-20", 172% day:20, month:4, year:2017}. 173% 174% ``` 175 176re_matchsub(Regex, String, Subs, Options) :- 177 re_compiled(Regex, Compiled), 178 re_matchsub_(Compiled, String, Pairs, Options), 179 dict_pairs(Subs, re_match, Pairs). 180 181%! re_foldl(:Goal, +Regex, +String, ?V0, ?V, +Options) is semidet. 182% 183% _Fold_ all matches of Regex on String. Each match is represented by 184% a dict as specified for re_matchsub/4. V0 and V are related using a 185% sequence of invocations of Goal as illustrated below. 186% 187% ``` 188% call(Goal, Dict1, V0, V1), 189% call(Goal, Dict2, V1, V2), 190% ... 191% call(Goal, Dictn, Vn, V). 192% ``` 193% 194% This predicate is used to implement re_split/4 and re_replace/4. For 195% example, we can count all matches of a Regex on String using this 196% code: 197% 198% ``` 199% re_match_count(Regex, String, Count) :- 200% re_foldl(increment, Regex, String, 0, Count, []). 201% 202% increment(_Match, V0, V1) :- 203% V1 is V0+1. 204% ``` 205% 206% After which we can query 207% 208% ``` 209% ?- re_match_count("a", "aap", X). 210% X = 2. 211% ``` 212 213re_foldl(Goal, Regex, String, V0, V, Options) :- 214 re_compiled(Regex, Compiled), 215 re_foldl_(Compiled, String, Goal, V0, V, Options). 216 217:- public re_call_folder/4. 218 219re_call_folder(Goal, Pairs, V0, V1) :- 220 dict_pairs(Dict, re_match, Pairs), 221 call(Goal, Dict, V0, V1). 222 223 224%! re_split(+Pattern, +String, -Split:list) is det. 225%! re_split(+Pattern, +String, -Split:list, +Options) is det. 226% 227% Split String using the regular expression Pattern. Split is a list 228% of strings holding alternating matches of Pattern and skipped parts 229% of the String, starting with a skipped part. The Split lists ends 230% with a string of the content of String after the last match. If 231% Pattern does not appear in String, Split is a list holding a copy of 232% String. This implies the number of elements in Split is _always_ 233% odd. For example: 234% 235% ``` 236% ?- re_split("a+", "abaac", Split, []). 237% Split = ["","a","b","aa","c"]. 238% ?- re_split(":\\s*"/n, "Age: 33", Split, []). 239% Split = ['Age', ': ', 33]. 240% ``` 241% 242% @arg Pattern is the pattern text, optionally follows by /Flags. 243% Similar to re_matchsub/4, the final output type can be controlled by 244% a flag =a= (atom), =s= (string, default) or =n= (number if possible, 245% atom otherwise). 246 247re_split(Pattern, String, Split) :- 248 re_split(Pattern, String, Split, []). 249re_split(Pattern, String, Split, Options) :- 250 range_regex(Pattern, Compiled, Type), 251 State = state(String, 0, Type), 252 re_foldl(split(State), Compiled, String, Split, [Last], Options), 253 arg(2, State, LastSkipStart), 254 typed_sub(Type, String, LastSkipStart, _, 0, Last). 255 256range_regex(Pattern/Flags, Compiled, Type) :- !, 257 atom_chars(Flags, Chars), 258 replace_flags(Chars, Chars1, Type), 259 atom_chars(RFlags, [r|Chars1]), 260 re_compiled(Pattern/RFlags, Compiled). 261range_regex(Pattern, Compiled, string) :- 262 re_compiled(Pattern/r, Compiled). 263 264replace_flags([], [], Type) :- 265 default(Type, string). 266replace_flags([H|T0], T, Type) :- 267 split_type(H, Type), 268 !, 269 replace_flags(T0, T, Type). 270replace_flags([H|T0], [H|T], Type) :- 271 replace_flags(T0, T, Type). 272 273split_type(a, atom). 274split_type(s, string). 275split_type(n, name). 276 277split(State, Dict, [Skipped,Sep|T], T) :- 278 matched(State, Dict.0, Sep), 279 skipped(State, Dict.0, Skipped). 280 281matched(state(String, _, Type), Start-Len, Matched) :- 282 typed_sub(Type, String, Start, Len, _, Matched). 283 284skipped(State, Start-Len, Skipped) :- 285 State = state(String, Here, Type), 286 SkipLen is Start-Here, 287 typed_sub(Type, String, Here, SkipLen, _, Skipped), 288 NextSkipStart is Start+Len, 289 nb_setarg(2, State, NextSkipStart). 290 291typed_sub(string, Haystack, B, L, A, String) :- 292 sub_string(Haystack, B, L, A, String). 293typed_sub(atom, Haystack, B, L, A, String) :- 294 sub_atom(Haystack, B, L, A, String). 295typed_sub(name, Haystack, B, L, A, Value) :- 296 sub_string(Haystack, B, L, A, String), 297 ( number_string(Number, String) 298 -> Value = Number 299 ; atom_string(Value, String) 300 ). 301 302%! re_replace(+Pattern, +With, +String, -NewString) 303% 304% Replace matches of the regular expression Pattern in String with 305% With. With may reference captured substrings using \N or $Name. Both 306% N and Name may be written as {N} and {Name} to avoid ambiguities. 307% 308% @arg Pattern is the pattern text, optionally follows by /Flags. 309% Flags may include `g`, replacing all occurences of Pattern. In 310% addition, similar to re_matchsub/4, the final output type can be 311% controlled by a flag =a= (atom) or =s= (string, default). 312 313re_replace(Pattern, With, String, NewString) :- 314 range_regex(Pattern, Compiled, All, Type), 315 compile_replacement(With, RCompiled), 316 State = state(String, 0, Type), 317 ( All == all 318 -> re_foldl(replace(State, RCompiled), Compiled, String, Parts, [Last], []) 319 ; ( re_matchsub(Compiled, String, Match, []) 320 -> replace(State, RCompiled, Match, Parts, [Last]) 321 ; Repl = false 322 ) 323 ), 324 ( Repl == false 325 -> parts_to_output(Type, [String], NewString) 326 ; arg(2, State, LastSkipStart), 327 sub_string(String, LastSkipStart, _, 0, Last), 328 parts_to_output(Type, Parts, NewString) 329 ). 330 331range_regex(Pattern/Flags, Compiled, All, Type) :- !, 332 atom_chars(Flags, Chars), 333 replace_flags(Chars, Chars1, All, Type), 334 atom_chars(RFlags, [r|Chars1]), 335 re_compiled(Pattern/RFlags, Compiled). 336range_regex(Pattern, Compiled, first, string) :- 337 re_compiled(Pattern/r, Compiled). 338 339replace_flags([], [], All, Type) :- 340 default(All, first), 341 default(Type, string). 342replace_flags([H|T0], T, All, Type) :- 343 ( all(H, All) 344 -> true 345 ; type(H, Type) 346 ), 347 !, 348 replace_flags(T0, T, All, Type). 349replace_flags([H|T0], [H|T], All, Type) :- 350 replace_flags(T0, T, All, Type). 351 352all(g, all). 353type(a, atom). 354type(s, string). 355 356default(Val, Val) :- !. 357default(_, _). 358 359replace(State, With, Dict, [Skipped|Parts], T) :- 360 State = state(String, _, _Type), 361 copy_term(With, r(PartsR, Skel)), 362 Skel :< Dict, 363 range_strings(PartsR, String, Parts, T), 364 skipped(State, Dict.0, Skipped). 365 366range_strings([], _, T, T). 367range_strings([Start-Len|T0], String, [S|T1], T) :- 368 !, 369 sub_string(String, Start, Len, _, S), 370 range_strings(T0, String, T1, T). 371range_strings([S|T0], String, [S|T1], T) :- 372 range_strings(T0, String, T1, T). 373 374parts_to_output(string, Parts, String) :- 375 atomics_to_string(Parts, String). 376parts_to_output(atom, Parts, String) :- 377 atomic_list_concat(Parts, String). 378 379%! compile_replacement(+With, -Compiled) 380% 381% Compile the replacement specification into a specification that can 382% be processed quickly. The compiled expressions are cached and may be 383% reclaimed using re_flush/0. 384 385:- dynamic replacement_cache/2. 386:- volatile replacement_cache/2. 387 388compile_replacement(With, Compiled) :- 389 replacement_cache(With, Compiled), 390 !. 391compile_replacement(With, Compiled) :- 392 compile_replacement_nocache(With, Compiled), 393 assertz(replacement_cache(With, Compiled)). 394 395compile_replacement_nocache(With, r(Parts, Extract)) :- 396 string_codes(With, Codes), 397 phrase(replacement_parts(Parts, Pairs), Codes), 398 dict_pairs(Extract, _, Pairs). 399 400replacement_parts(Parts, Extract) --> 401 string(HCodes), 402 ( ("\\" ; "$"), 403 capture_name(Name) 404 -> !, 405 { add_part(HCodes, Parts, T0), 406 T0 = [Repl|T1], 407 Extract = [Name-Repl|Extract1] 408 }, 409 replacement_parts(T1, Extract1) 410 ; eos 411 -> !, 412 { add_part(HCodes, Parts, []), 413 Extract = [] 414 } 415 ). 416 417add_part([], Parts, Parts) :- 418 !. 419add_part(Codes, [H|T], T) :- 420 string_codes(H, Codes). 421 422capture_name(Name) --> 423 "{", 424 ( digit(D0) 425 -> digits(DL), 426 "}", 427 { number_codes(Name, [D0|DL]) } 428 ; letter(A0), 429 alnums(AL), 430 "}", 431 { atom_codes(Name, [A0|AL]) } 432 ). 433capture_name(Name) --> 434 digit(D0), 435 !, 436 digits(DL), 437 { number_codes(Name, [D0|DL]) }. 438capture_name(Name) --> 439 letter(A0), 440 !, 441 alnums(AL), 442 { atom_codes(Name, [A0|AL]) }. 443 444letter(L) --> 445 [L], 446 { between(0'a,0'z,L) 447 ; between(0'A,0'Z,L) 448 ; L == 0'_ 449 }, !. 450 451alnums([H|T]) --> 452 alnum(H), 453 !, 454 alnums(T). 455alnums([]) --> 456 "". 457 458alnum(L) --> 459 [L], 460 { between(0'a,0'z,L) 461 ; between(0'A,0'Z,L) 462 ; between(0'0,0'9,L) 463 ; L == 0'_ 464 }, !. 465 466%! re_compile(+Pattern, -Regex, +Options) is det. 467% 468% Compiles Pattern to a Regex _blob_ of type =regex= (see blob/2). 469% Defined Options are defined below. Please consult the PCRE 470% documentation for details. 471% 472% * anchored(Bool) 473% Force pattern anchoring 474% * bsr(Mode) 475% If =anycrlf=, \R only matches CR, LF or CRLF. If =unicode=, 476% \R matches all Unicode line endings. 477% * caseless(Bool) 478% If =true=, do caseless matching. 479% * dollar_endonly(Bool) 480% If =true=, $ not to match newline at end 481% * dotall(Bool) 482% If =true=, . matches anything including NL 483% * dupnames(Bool) 484% If =true=, allow duplicate names for subpatterns 485% * extended(Bool) 486% If =true=, ignore white space and # comments 487% * extra(Bool) 488% If =true=, PCRE extra features (not much use currently) 489% * firstline(Bool) 490% If =true=, force matching to be before newline 491% * compat(With) 492% If =javascript=, JavaScript compatibility 493% * multiline(Bool) 494% If =true=, ^ and $ match newlines within data 495% * newline(Mode) 496% If =any=, recognize any Unicode newline sequence, 497% if =anycrlf= (default), recognize CR, LF, and CRLF as newline 498% sequences, if =cr=, recognize CR, if =lf=, recognize 499% LF and finally if =crlf= recognize CRLF as newline. 500% * ucp(Bool) 501% If =true=, use Unicode properties for \d, \w, etc. 502% * ungreedy(Bool) 503% If =true=, invert greediness of quantifiers 504% 505% In addition to the options above that directly map to pcre flags the 506% following options are processed: 507% 508% * optimize(Bool) 509% If `true`, _study_ the regular expression. 510% * capture_type(+Type) 511% How to return the matched part of the input and possibly captured 512% groups in there. Possible values are: 513% - string 514% Return the captured string as a string (default). 515% - atom 516% Return the captured string as an atom. 517% - range 518% Return the captured string as a pair `Start-Length`. Note 519% the we use ``Start-Length` rather than the more conventional 520% `Start-End` to allow for immediate use with sub_atom/5 and 521% sub_string/5. 522% - term 523% Parse the captured string as a Prolog term. This is notably 524% practical if you capture a number. 525% 526% The `capture_type` specifies the default for this pattern. The 527% interface supports a different type for each _named_ group using 528% the syntax =|(?<name_T>...)|=, where =T= is one of =S= (string), 529% =A= (atom), =I= (integer), =F= (float), =N= (number), =T= (term) 530% and =R= (range). In the current implementation =I=, =F= and =N= are 531% synonyms for =T=. Future versions may act different if the parsed 532% value is not of the requested numeric type. 533 534%! re_compiled(+Spec, --Regex) is det. 535% 536% Create a compiled regex from a specification. Cached compiled 537% regular expressions can be reclaimed using re_flush/0. 538 539:- dynamic re_pool/3. 540:- volatile re_pool/3. 541 542re_compiled(Regex, Regex) :- 543 blob(Regex, regex), 544 !. 545re_compiled(Text/Flags, Regex) :- 546 must_be(text, Text), 547 must_be(atom, Flags), 548 re_pool(Text, Flags, Regex), 549 !. 550re_compiled(Text/Flags, Regex) :- 551 !, 552 re_flags_options(Flags, Options), 553 re_compile(Text, Regex, Options), 554 assertz(re_pool(Text, Flags, Regex)). 555re_compiled(Text, Regex) :- 556 must_be(text, Text), 557 re_pool(Text, '', Regex), 558 !. 559re_compiled(Text, Regex) :- 560 re_compiled(Text/'', Regex). 561 562re_flags_options(Flags, Options) :- 563 atom_chars(Flags, Chars), 564 maplist(re_flag_option, Chars, Options). 565 566re_flag_option(Flag, Option) :- 567 re_flag_option_(Flag, Option), 568 !. 569re_flag_option(Flag, _) :- 570 existence_error(re_flag, Flag). 571 572re_flag_option_(i, caseless(true)). 573re_flag_option_(m, multiline(true)). 574re_flag_option_(x, extended(true)). 575re_flag_option_(s, dotall(true)). 576re_flag_option_(a, capture_type(atom)). 577re_flag_option_(r, capture_type(range)). 578re_flag_option_(t, capture_type(term)). 579 580%! re_flush 581% 582% Clean pattern and replacement caches. 583% 584% @tbd Flush automatically if the cache becomes too large. 585 586re_flush :- 587 retractall(replacement_cache(_,_)), 588 retractall(re_pool(_,_,_)). 589 590%! re_config(+Term) 591% 592% Extract configuration information from the pcre library. Term is of 593% the form Name(Value). Name is derived from the =|PCRE_CONFIG_*|= 594% constant after removing =PCRE_CONFIG_= and mapping the name to lower 595% case, e.g. `utf8`, `unicode_properties`, etc. Value is either a 596% Prolog boolean, integer or atom. 597% 598% Finally, the functionality of pcre_version() is available using the 599% configuration name `version`. 600% 601% @see `man pcreapi` for details