1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2016, University of Amsterdam 7 VU University Amsterdam 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in 19 the documentation and/or other materials provided with the 20 distribution. 21 22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 POSSIBILITY OF SUCH DAMAGE. 34*/ 35 36:- module(sgml, 37 [ load_html/3, % +Input, -DOM, +Options 38 load_xml/3, % +Input, -DOM, +Options 39 load_sgml/3, % +Input, -DOM, +Options 40 41 load_sgml_file/2, % +File, -ListOfContent 42 load_xml_file/2, % +File, -ListOfContent 43 load_html_file/2, % +File, -Document 44 45 load_structure/3, % +File, -Term, +Options 46 47 load_dtd/2, % +DTD, +File 48 load_dtd/3, % +DTD, +File, +Options 49 dtd/2, % +Type, -DTD 50 dtd_property/2, % +DTD, ?Property 51 52 new_dtd/2, % +Doctype, -DTD 53 free_dtd/1, % +DTD 54 open_dtd/3, % +DTD, +Options, -Stream 55 56 new_sgml_parser/2, % -Parser, +Options 57 free_sgml_parser/1, % +Parser 58 set_sgml_parser/2, % +Parser, +Options 59 get_sgml_parser/2, % +Parser, +Options 60 sgml_parse/2, % +Parser, +Options 61 62 sgml_register_catalog_file/2, % +File, +StartOrEnd 63 64 xml_quote_attribute/3, % +In, -Quoted, +Encoding 65 xml_quote_cdata/3, % +In, -Quoted, +Encoding 66 xml_quote_attribute/2, % +In, -Quoted 67 xml_quote_cdata/2, % +In, -Quoted 68 xml_name/1, % +In 69 xml_name/2, % +In, +Encoding 70 71 xsd_number_string/2, % ?Number, ?String 72 xsd_time_string/3, % ?Term, ?Type, ?String 73 74 xml_basechar/1, % +Code 75 xml_ideographic/1, % +Code 76 xml_combining_char/1, % +Code 77 xml_digit/1, % +Code 78 xml_extender/1, % +Code 79 80 iri_xml_namespace/2, % +IRI, -Namespace 81 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 82 xml_is_dom/1 % +Term 83 ]). 84:- use_module(library(lists)). 85:- use_module(library(option)). 86:- use_module(library(error)). 87:- use_module(library(iostream)). 88 89:- meta_predicate 90 load_structure( , , ), 91 load_html( , , ), 92 load_xml( , , ), 93 load_sgml( , , ). 94 95:- predicate_options(load_structure/3, 3, 96 [ charpos(integer), 97 defaults(boolean), 98 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 99 doctype(atom), 100 dtd(any), 101 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 102 entity(atom,atom), 103 keep_prefix(boolean), 104 file(atom), 105 line(integer), 106 offset(integer), 107 number(oneof([token,integer])), 108 qualify_attributes(boolean), 109 shorttag(boolean), 110 case_sensitive_attributes(boolean), 111 case_preserving_attributes(boolean), 112 system_entities(boolean), 113 max_memory(integer), 114 space(oneof([sgml,preserve,default,remove])), 115 xmlns(atom), 116 xmlns(atom,atom), 117 pass_to(sgml_parse/2, 2) 118 ]). 119:- predicate_options(load_html/3, 3, 120 [ pass_to(load_structure/3, 3) 121 ]). 122:- predicate_options(load_xml/3, 3, 123 [ pass_to(load_structure/3, 3) 124 ]). 125:- predicate_options(load_sgml/3, 3, 126 [ pass_to(load_structure/3, 3) 127 ]). 128:- predicate_options(load_dtd/3, 3, 129 [ dialect(oneof([sgml,xml,xmlns])), 130 pass_to(open/4, 4) 131 ]). 132:- predicate_options(sgml_parse/2, 2, 133 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 134 callable), 135 content_length(integer), 136 document(-any), 137 max_errors(integer), 138 parse(oneof([file,element,content,declaration,input])), 139 source(any), 140 syntax_errors(oneof([quiet,print,style])), 141 xml_no_ns(oneof([error,quiet])) 142 ]). 143:- predicate_options(new_sgml_parser/2, 2, 144 [ dtd(any) 145 ]).
175:- multifile user:file_search_path/2. 176:- dynamic user:file_search_path/2. 177 178user:file_search_path(dtd, '.'). 179user:file_search_path(dtd, swi('library/DTD')). 180 181sgml_register_catalog_file(File, Location) :- 182 prolog_to_os_filename(File, OsFile), 183 '_sgml_register_catalog_file'(OsFile, Location). 184 185:- use_foreign_library(foreign(sgml2pl)). 186 187register_catalog(Base) :- 188 absolute_file_name(dtd(Base), 189 [ extensions([soc]), 190 access(read), 191 file_errors(fail) 192 ], 193 SocFile), 194 sgml_register_catalog_file(SocFile, end). 195 196:- initialization 197 ignore(register_catalog('HTML4')). 198 199 200 /******************************* 201 * DTD HANDLING * 202 *******************************/ 203 204/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 205Note that concurrent access to DTD objects is not allowed, and hence we 206will allocate and destroy them in each thread. Possibibly it would be 207nicer to find out why concurrent access to DTD's is flawed. It is 208diagnosed to mess with the entity resolution by Fabien Todescato. 209- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 210 211:- thread_local 212 current_dtd/2. 213:- volatile 214 current_dtd/2. 215:- thread_local 216 registered_cleanup/0. 217:- volatile 218 registered_cleanup/0. 219 220:- multifile 221 dtd_alias/2. 222 223:- create_prolog_flag(html_dialect, html5, [type(atom)]). 224 225dtd_alias(html4, 'HTML4'). 226dtd_alias(html5, 'HTML5'). 227dtd_alias(html, DTD) :- 228 current_prolog_flag(html_dialect, Dialect), 229 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
241dtd(Type, DTD) :- 242 current_dtd(Type, DTD), 243 !. 244dtd(Type, DTD) :- 245 new_dtd(Type, DTD), 246 ( dtd_alias(Type, Base) 247 -> true 248 ; Base = Type 249 ), 250 absolute_file_name(dtd(Base), 251 [ extensions([dtd]), 252 access(read) 253 ], DtdFile), 254 load_dtd(DTD, DtdFile), 255 register_cleanup, 256 asserta(current_dtd(Type, DTD)).
271load_dtd(DTD, DtdFile) :- 272 load_dtd(DTD, DtdFile, []). 273load_dtd(DTD, DtdFile, Options) :- 274 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 275 setup_call_cleanup( 276 open_dtd(DTD, DTDOptions, DtdOut), 277 setup_call_cleanup( 278 open(DtdFile, read, DtdIn, OpenOptions), 279 copy_stream_data(DtdIn, DtdOut), 280 close(DtdIn)), 281 close(DtdOut)). 282 283split_dtd_options([], [], []). 284split_dtd_options([H|T], [H|TD], S) :- 285 dtd_option(H), 286 !, 287 split_dtd_options(T, TD, S). 288split_dtd_options([H|T], TD, [H|S]) :- 289 split_dtd_options(T, TD, S). 290 291dtd_option(dialect(_)).
299destroy_dtds :-
300 ( current_dtd(_Type, DTD),
301 free_dtd(DTD),
302 fail
303 ; true
304 ).
310register_cleanup :- 311 registered_cleanup, 312 !. 313register_cleanup :- 314 catch(thread_at_exit(destroy_dtds), _, true), 315 assert(registered_cleanup). 316 317 318 /******************************* 319 * EXAMINE DTD * 320 *******************************/ 321 322prop(doctype(_), _). 323prop(elements(_), _). 324prop(entities(_), _). 325prop(notations(_), _). 326prop(entity(E, _), DTD) :- 327 ( nonvar(E) 328 -> true 329 ; '$dtd_property'(DTD, entities(EL)), 330 member(E, EL) 331 ). 332prop(element(E, _, _), DTD) :- 333 ( nonvar(E) 334 -> true 335 ; '$dtd_property'(DTD, elements(EL)), 336 member(E, EL) 337 ). 338prop(attributes(E, _), DTD) :- 339 ( nonvar(E) 340 -> true 341 ; '$dtd_property'(DTD, elements(EL)), 342 member(E, EL) 343 ). 344prop(attribute(E, A, _, _), DTD) :- 345 ( nonvar(E) 346 -> true 347 ; '$dtd_property'(DTD, elements(EL)), 348 member(E, EL) 349 ), 350 ( nonvar(A) 351 -> true 352 ; '$dtd_property'(DTD, attributes(E, AL)), 353 member(A, AL) 354 ). 355prop(notation(N, _), DTD) :- 356 ( nonvar(N) 357 -> true 358 ; '$dtd_property'(DTD, notations(NL)), 359 member(N, NL) 360 ). 361 362dtd_property(DTD, Prop) :- 363 prop(Prop, DTD), 364 '$dtd_property'(DTD, Prop). 365 366 367 /******************************* 368 * SGML * 369 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.393load_structure(Spec, DOM, Options) :- 394 sgml_open_options(Options, OpenOptions, SGMLOptions), 395 setup_call_cleanup( 396 open_any(Spec, read, In, Close, OpenOptions), 397 load_structure_from_stream(In, DOM, SGMLOptions), 398 close_any(Close)). 399 400sgml_open_options(Options, OpenOptions, SGMLOptions) :- 401 Options = M:Plain, 402 ( select_option(encoding(Encoding), Plain, NoEnc) 403 -> ( sgml_encoding(Encoding) 404 -> merge_options(NoEnc, [type(binary)], OpenOptions), 405 SGMLOptions = Options 406 ; OpenOptions = Plain, 407 SGMLOptions = M:NoEnc 408 ) 409 ; merge_options(Plain, [type(binary)], OpenOptions), 410 SGMLOptions = Options 411 ). 412 413sgml_encoding(Enc) :- 414 downcase_atom(Enc, Enc1), 415 sgml_encoding_l(Enc1). 416 417sgml_encoding_l('iso-8859-1'). 418sgml_encoding_l('us-ascii'). 419sgml_encoding_l('utf-8'). 420sgml_encoding_l('utf8'). 421sgml_encoding_l('iso_latin_1'). 422sgml_encoding_l('ascii'). 423 424load_structure_from_stream(In, Term, M:Options) :- 425 !, 426 ( select_option(dtd(DTD), Options, Options1) 427 -> ExplicitDTD = true 428 ; ExplicitDTD = false, 429 Options1 = Options 430 ), 431 move_front(Options1, dialect(_), Options2), % dialect sets defaults 432 setup_call_cleanup( 433 new_sgml_parser(Parser, 434 [ dtd(DTD) 435 ]), 436 parse(Parser, M:Options2, TermRead, In), 437 free_sgml_parser(Parser)), 438 ( ExplicitDTD == true 439 -> ( DTD = dtd(_, DocType), 440 dtd_property(DTD, doctype(DocType)) 441 -> true 442 ; true 443 ) 444 ; free_dtd(DTD) 445 ), 446 Term = TermRead. 447 448move_front(Options0, Opt, Options) :- 449 selectchk(Opt, Options0, Options1), 450 !, 451 Options = [Opt|Options1]. 452move_front(Options, _, Options). 453 454 455parse(Parser, M:Options, Document, In) :- 456 set_parser_options(Options, Parser, In, Options1), 457 parser_meta_options(Options1, M, Options2), 458 set_input_location(Parser, In), 459 sgml_parse(Parser, 460 [ document(Document), 461 source(In) 462 | Options2 463 ]). 464 465set_parser_options([], _, _, []). 466set_parser_options([H|T], Parser, In, Rest) :- 467 ( set_parser_option(H, Parser, In) 468 -> set_parser_options(T, Parser, In, Rest) 469 ; Rest = [H|R2], 470 set_parser_options(T, Parser, In, R2) 471 ). 472 473set_parser_option(Var, _Parser, _In) :- 474 var(Var), 475 !, 476 instantiation_error(Var). 477set_parser_option(Option, Parser, _) :- 478 def_entity(Option, Parser), 479 !. 480set_parser_option(offset(Offset), _Parser, In) :- 481 !, 482 seek(In, Offset, bof, _). 483set_parser_option(Option, Parser, _In) :- 484 parser_option(Option), 485 !, 486 set_sgml_parser(Parser, Option). 487set_parser_option(Name=Value, Parser, In) :- 488 Option =.. [Name,Value], 489 set_parser_option(Option, Parser, In). 490 491 492parser_option(dialect(_)). 493parser_option(shorttag(_)). 494parser_option(case_sensitive_attributes(_)). 495parser_option(case_preserving_attributes(_)). 496parser_option(system_entities(_)). 497parser_option(max_memory(_)). 498parser_option(file(_)). 499parser_option(line(_)). 500parser_option(space(_)). 501parser_option(number(_)). 502parser_option(defaults(_)). 503parser_option(doctype(_)). 504parser_option(qualify_attributes(_)). 505parser_option(encoding(_)). 506parser_option(keep_prefix(_)). 507 508 509def_entity(entity(Name, Value), Parser) :- 510 get_sgml_parser(Parser, dtd(DTD)), 511 xml_quote_attribute(Value, QValue), 512 setup_call_cleanup(open_dtd(DTD, [], Stream), 513 format(Stream, '<!ENTITY ~w "~w">~n', 514 [Name, QValue]), 515 close(Stream)). 516def_entity(xmlns(URI), Parser) :- 517 set_sgml_parser(Parser, xmlns(URI)). 518def_entity(xmlns(NS, URI), Parser) :- 519 set_sgml_parser(Parser, xmlns(NS, URI)).
525parser_meta_options([], _, []). 526parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 527 !, 528 parser_meta_options(T0, M, T). 529parser_meta_options([H|T0], M, [H|T]) :- 530 parser_meta_options(T0, M, T).
537set_input_location(Parser, _In) :- 538 get_sgml_parser(Parser, file(_)), 539 !. 540set_input_location(Parser, In) :- 541 stream_property(In, file_name(File)), 542 !, 543 set_sgml_parser(Parser, file(File)), 544 stream_property(In, position(Pos)), 545 set_sgml_parser(Parser, position(Pos)). 546set_input_location(_, _). 547 548 /******************************* 549 * UTILITIES * 550 *******************************/
559load_sgml_file(File, Term) :-
560 load_sgml(File, Term, []).
569load_xml_file(File, Term) :-
570 load_xml(File, Term, []).
579load_html_file(File, DOM) :-
580 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open)
to support
loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
609load_html(File, Term, M:Options) :-
610 current_prolog_flag(html_dialect, Dialect),
611 dtd(Dialect, DTD),
612 merge_options(Options,
613 [ dtd(DTD),
614 dialect(Dialect),
615 max_errors(-1),
616 syntax_errors(quiet)
617 ], Options1),
618 load_structure(File, Term, M:Options1).
dialect(xml)
628load_xml(Input, DOM, M:Options) :-
629 merge_options(Options,
630 [ dialect(xml)
631 ], Options1),
632 load_structure(Input, DOM, M:Options1).
dialect(sgml)
642load_sgml(Input, DOM, M:Options) :- 643 merge_options(Options, 644 [ dialect(sgml) 645 ], Options1), 646 load_structure(Input, DOM, M:Options1). 647 648 649 650 /******************************* 651 * ENCODING * 652 *******************************/
662xml_quote_attribute(In, Quoted) :- 663 xml_quote_attribute(In, Quoted, ascii). 664 665xml_quote_cdata(In, Quoted) :- 666 xml_quote_cdata(In, Quoted, ascii).
672xml_name(In) :- 673 xml_name(In, ascii). 674 675 676 /******************************* 677 * XML CHARACTER CLASSES * 678 *******************************/
693 /******************************* 694 * TYPE CHECKING * 695 *******************************/
702xml_is_dom(0) :- !, fail. % catch variables 703xml_is_dom(List) :- 704 is_list(List), 705 !, 706 xml_is_content_list(List). 707xml_is_dom(Term) :- 708 xml_is_element(Term). 709 710xml_is_content_list([]). 711xml_is_content_list([H|T]) :- 712 xml_is_content(H), 713 xml_is_content_list(T). 714 715xml_is_content(0) :- !, fail. 716xml_is_content(pi(Pi)) :- 717 !, 718 atom(Pi). 719xml_is_content(CDATA) :- 720 atom(CDATA), 721 !. 722xml_is_content(CDATA) :- 723 string(CDATA), 724 !. 725xml_is_content(Term) :- 726 xml_is_element(Term). 727 728xml_is_element(element(Name, Attributes, Content)) :- 729 dom_name(Name), 730 dom_attributes(Attributes), 731 xml_is_content_list(Content). 732 733dom_name(NS:Local) :- 734 atom(NS), 735 atom(Local), 736 !. 737dom_name(Local) :- 738 atom(Local). 739 740dom_attributes(0) :- !, fail. 741dom_attributes([]). 742dom_attributes([H|T]) :- 743 dom_attribute(H), 744 dom_attributes(T). 745 746dom_attribute(Name=Value) :- 747 dom_name(Name), 748 atomic(Value). 749 750 751 /******************************* 752 * MESSAGES * 753 *******************************/ 754:- multifile 755 prolog:message/3. 756 757% Catch messages. sgml/4 is generated by the SGML2PL binding. 758 759prologmessage(sgml(Parser, File, Line, Message)) --> 760 { get_sgml_parser(Parser, dialect(Dialect)) 761 }, 762 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 763 764 765 /******************************* 766 * XREF SUPPORT * 767 *******************************/ 768 769:- multifile 770 prolog:called_by/2. 771 772prologcalled_by(sgml_parse(_, Options), Called) :- 773 findall(Meta, meta_call_term(_, Meta, Options), Called). 774 775meta_call_term(T, G+N, Options) :- 776 T = call(Event, G), 777 pmember(T, Options), 778 call_params(Event, Term), 779 functor(Term, _, N). 780 781pmember(X, List) :- % member for partial lists 782 nonvar(List), 783 List = [H|T], 784 ( X = H 785 ; pmember(X, T) 786 ). 787 788call_params(begin, begin(tag,attributes,parser)). 789call_params(end, end(tag,parser)). 790call_params(cdata, cdata(cdata,parser)). 791call_params(pi, pi(cdata,parser)). 792call_params(decl, decl(cdata,parser)). 793call_params(error, error(severity,message,parser)). 794call_params(xmlns, xmlns(namespace,url,parser)). 795call_params(urlns, urlns(url,url,parser)). 796 797 /******************************* 798 * SANDBOX * 799 *******************************/ 800 801:- multifile 802 sandbox:safe_primitive/1, 803 sandbox:safe_meta_predicate/1. 804 805sandbox:safe_meta_predicate(sgml:load_structure/3). 806sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 807 dtd_alias(Dialect, _). 808sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 809sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 810sandbox:safe_primitive(sgml:xml_name(_,_)). 811sandbox:safe_primitive(sgml:xml_basechar(_)). 812sandbox:safe_primitive(sgml:xml_ideographic(_)). 813sandbox:safe_primitive(sgml:xml_combining_char(_)). 814sandbox:safe_primitive(sgml:xml_digit(_)). 815sandbox:safe_primitive(sgml:xml_extender(_)). 816sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 817sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 818sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by
library(xpath)
to extract information from the document.