35
36:- module(sgml,
37 [ load_html/3, 38 load_xml/3, 39 load_sgml/3, 40
41 load_sgml_file/2, 42 load_xml_file/2, 43 load_html_file/2, 44
45 load_structure/3, 46
47 load_dtd/2, 48 load_dtd/3, 49 dtd/2, 50 dtd_property/2, 51
52 new_dtd/2, 53 free_dtd/1, 54 open_dtd/3, 55
56 new_sgml_parser/2, 57 free_sgml_parser/1, 58 set_sgml_parser/2, 59 get_sgml_parser/2, 60 sgml_parse/2, 61
62 sgml_register_catalog_file/2, 63
64 xml_quote_attribute/3, 65 xml_quote_cdata/3, 66 xml_quote_attribute/2, 67 xml_quote_cdata/2, 68 xml_name/1, 69 xml_name/2, 70
71 xsd_number_string/2, 72 xsd_time_string/3, 73
74 xml_basechar/1, 75 xml_ideographic/1, 76 xml_combining_char/1, 77 xml_digit/1, 78 xml_extender/1, 79
80 iri_xml_namespace/2, 81 iri_xml_namespace/3, 82 xml_is_dom/1 83 ]). 84:- use_module(library(lists)). 85:- use_module(library(option)). 86:- use_module(library(error)). 87:- use_module(library(iostream)). 88
89:- meta_predicate
90 load_structure(+, -, :),
91 load_html(+, -, :),
92 load_xml(+, -, :),
93 load_sgml(+, -, :). 94
95:- predicate_options(load_structure/3, 3,
96 [ charpos(integer),
97 defaults(boolean),
98 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
99 doctype(atom),
100 dtd(any),
101 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
102 entity(atom,atom),
103 keep_prefix(boolean),
104 file(atom),
105 line(integer),
106 offset(integer),
107 number(oneof([token,integer])),
108 qualify_attributes(boolean),
109 shorttag(boolean),
110 case_sensitive_attributes(boolean),
111 case_preserving_attributes(boolean),
112 system_entities(boolean),
113 max_memory(integer),
114 space(oneof([sgml,preserve,default,remove])),
115 xmlns(atom),
116 xmlns(atom,atom),
117 pass_to(sgml_parse/2, 2)
118 ]). 119:- predicate_options(load_html/3, 3,
120 [ pass_to(load_structure/3, 3)
121 ]). 122:- predicate_options(load_xml/3, 3,
123 [ pass_to(load_structure/3, 3)
124 ]). 125:- predicate_options(load_sgml/3, 3,
126 [ pass_to(load_structure/3, 3)
127 ]). 128:- predicate_options(load_dtd/3, 3,
129 [ dialect(oneof([sgml,xml,xmlns])),
130 pass_to(open/4, 4)
131 ]). 132:- predicate_options(sgml_parse/2, 2,
133 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
134 callable),
135 content_length(integer),
136 document(-any),
137 max_errors(integer),
138 parse(oneof([file,element,content,declaration,input])),
139 source(any),
140 syntax_errors(oneof([quiet,print,style])),
141 xml_no_ns(oneof([error,quiet]))
142 ]). 143:- predicate_options(new_sgml_parser/2, 2,
144 [ dtd(any)
145 ]). 146
147
174
175:- multifile user:file_search_path/2. 176:- dynamic user:file_search_path/2. 177
178user:file_search_path(dtd, '.').
179user:file_search_path(dtd, swi('library/DTD')).
180
181sgml_register_catalog_file(File, Location) :-
182 prolog_to_os_filename(File, OsFile),
183 '_sgml_register_catalog_file'(OsFile, Location).
184
185:- use_foreign_library(foreign(sgml2pl)). 186
187register_catalog(Base) :-
188 absolute_file_name(dtd(Base),
189 [ extensions([soc]),
190 access(read),
191 file_errors(fail)
192 ],
193 SocFile),
194 sgml_register_catalog_file(SocFile, end).
195
196:- initialization
197 ignore(register_catalog('HTML4')). 198
199
200 203
210
211:- thread_local
212 current_dtd/2. 213:- volatile
214 current_dtd/2. 215:- thread_local
216 registered_cleanup/0. 217:- volatile
218 registered_cleanup/0. 219
220:- multifile
221 dtd_alias/2. 222
223:- create_prolog_flag(html_dialect, html5, [type(atom)]). 224
225dtd_alias(html4, 'HTML4').
226dtd_alias(html5, 'HTML5').
227dtd_alias(html, DTD) :-
228 current_prolog_flag(html_dialect, Dialect),
229 dtd_alias(Dialect, DTD).
230
240
241dtd(Type, DTD) :-
242 current_dtd(Type, DTD),
243 !.
244dtd(Type, DTD) :-
245 new_dtd(Type, DTD),
246 ( dtd_alias(Type, Base)
247 -> true
248 ; Base = Type
249 ),
250 absolute_file_name(dtd(Base),
251 [ extensions([dtd]),
252 access(read)
253 ], DtdFile),
254 load_dtd(DTD, DtdFile),
255 register_cleanup,
256 asserta(current_dtd(Type, DTD)).
257
270
271load_dtd(DTD, DtdFile) :-
272 load_dtd(DTD, DtdFile, []).
273load_dtd(DTD, DtdFile, Options) :-
274 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
275 setup_call_cleanup(
276 open_dtd(DTD, DTDOptions, DtdOut),
277 setup_call_cleanup(
278 open(DtdFile, read, DtdIn, OpenOptions),
279 copy_stream_data(DtdIn, DtdOut),
280 close(DtdIn)),
281 close(DtdOut)).
282
283split_dtd_options([], [], []).
284split_dtd_options([H|T], [H|TD], S) :-
285 dtd_option(H),
286 !,
287 split_dtd_options(T, TD, S).
288split_dtd_options([H|T], TD, [H|S]) :-
289 split_dtd_options(T, TD, S).
290
291dtd_option(dialect(_)).
292
293
298
299destroy_dtds :-
300 ( current_dtd(_Type, DTD),
301 free_dtd(DTD),
302 fail
303 ; true
304 ).
305
309
310register_cleanup :-
311 registered_cleanup,
312 !.
313register_cleanup :-
314 catch(thread_at_exit(destroy_dtds), _, true),
315 assert(registered_cleanup).
316
317
318 321
322prop(doctype(_), _).
323prop(elements(_), _).
324prop(entities(_), _).
325prop(notations(_), _).
326prop(entity(E, _), DTD) :-
327 ( nonvar(E)
328 -> true
329 ; '$dtd_property'(DTD, entities(EL)),
330 member(E, EL)
331 ).
332prop(element(E, _, _), DTD) :-
333 ( nonvar(E)
334 -> true
335 ; '$dtd_property'(DTD, elements(EL)),
336 member(E, EL)
337 ).
338prop(attributes(E, _), DTD) :-
339 ( nonvar(E)
340 -> true
341 ; '$dtd_property'(DTD, elements(EL)),
342 member(E, EL)
343 ).
344prop(attribute(E, A, _, _), DTD) :-
345 ( nonvar(E)
346 -> true
347 ; '$dtd_property'(DTD, elements(EL)),
348 member(E, EL)
349 ),
350 ( nonvar(A)
351 -> true
352 ; '$dtd_property'(DTD, attributes(E, AL)),
353 member(A, AL)
354 ).
355prop(notation(N, _), DTD) :-
356 ( nonvar(N)
357 -> true
358 ; '$dtd_property'(DTD, notations(NL)),
359 member(N, NL)
360 ).
361
362dtd_property(DTD, Prop) :-
363 prop(Prop, DTD),
364 '$dtd_property'(DTD, Prop).
365
366
367 370
392
393load_structure(Spec, DOM, Options) :-
394 sgml_open_options(Options, OpenOptions, SGMLOptions),
395 setup_call_cleanup(
396 open_any(Spec, read, In, Close, OpenOptions),
397 load_structure_from_stream(In, DOM, SGMLOptions),
398 close_any(Close)).
399
400sgml_open_options(Options, OpenOptions, SGMLOptions) :-
401 Options = M:Plain,
402 ( select_option(encoding(Encoding), Plain, NoEnc)
403 -> ( sgml_encoding(Encoding)
404 -> merge_options(NoEnc, [type(binary)], OpenOptions),
405 SGMLOptions = Options
406 ; OpenOptions = Plain,
407 SGMLOptions = M:NoEnc
408 )
409 ; merge_options(Plain, [type(binary)], OpenOptions),
410 SGMLOptions = Options
411 ).
412
413sgml_encoding(Enc) :-
414 downcase_atom(Enc, Enc1),
415 sgml_encoding_l(Enc1).
416
417sgml_encoding_l('iso-8859-1').
418sgml_encoding_l('us-ascii').
419sgml_encoding_l('utf-8').
420sgml_encoding_l('utf8').
421sgml_encoding_l('iso_latin_1').
422sgml_encoding_l('ascii').
423
424load_structure_from_stream(In, Term, M:Options) :-
425 !,
426 ( select_option(dtd(DTD), Options, Options1)
427 -> ExplicitDTD = true
428 ; ExplicitDTD = false,
429 Options1 = Options
430 ),
431 move_front(Options1, dialect(_), Options2), 432 setup_call_cleanup(
433 new_sgml_parser(Parser,
434 [ dtd(DTD)
435 ]),
436 parse(Parser, M:Options2, TermRead, In),
437 free_sgml_parser(Parser)),
438 ( ExplicitDTD == true
439 -> ( DTD = dtd(_, DocType),
440 dtd_property(DTD, doctype(DocType))
441 -> true
442 ; true
443 )
444 ; free_dtd(DTD)
445 ),
446 Term = TermRead.
447
448move_front(Options0, Opt, Options) :-
449 selectchk(Opt, Options0, Options1),
450 !,
451 Options = [Opt|Options1].
452move_front(Options, _, Options).
453
454
455parse(Parser, M:Options, Document, In) :-
456 set_parser_options(Options, Parser, In, Options1),
457 parser_meta_options(Options1, M, Options2),
458 set_input_location(Parser, In),
459 sgml_parse(Parser,
460 [ document(Document),
461 source(In)
462 | Options2
463 ]).
464
465set_parser_options([], _, _, []).
466set_parser_options([H|T], Parser, In, Rest) :-
467 ( set_parser_option(H, Parser, In)
468 -> set_parser_options(T, Parser, In, Rest)
469 ; Rest = [H|R2],
470 set_parser_options(T, Parser, In, R2)
471 ).
472
473set_parser_option(Var, _Parser, _In) :-
474 var(Var),
475 !,
476 instantiation_error(Var).
477set_parser_option(Option, Parser, _) :-
478 def_entity(Option, Parser),
479 !.
480set_parser_option(offset(Offset), _Parser, In) :-
481 !,
482 seek(In, Offset, bof, _).
483set_parser_option(Option, Parser, _In) :-
484 parser_option(Option),
485 !,
486 set_sgml_parser(Parser, Option).
487set_parser_option(Name=Value, Parser, In) :-
488 Option =.. [Name,Value],
489 set_parser_option(Option, Parser, In).
490
491
492parser_option(dialect(_)).
493parser_option(shorttag(_)).
494parser_option(case_sensitive_attributes(_)).
495parser_option(case_preserving_attributes(_)).
496parser_option(system_entities(_)).
497parser_option(max_memory(_)).
498parser_option(file(_)).
499parser_option(line(_)).
500parser_option(space(_)).
501parser_option(number(_)).
502parser_option(defaults(_)).
503parser_option(doctype(_)).
504parser_option(qualify_attributes(_)).
505parser_option(encoding(_)).
506parser_option(keep_prefix(_)).
507
508
509def_entity(entity(Name, Value), Parser) :-
510 get_sgml_parser(Parser, dtd(DTD)),
511 xml_quote_attribute(Value, QValue),
512 setup_call_cleanup(open_dtd(DTD, [], Stream),
513 format(Stream, '<!ENTITY ~w "~w">~n',
514 [Name, QValue]),
515 close(Stream)).
516def_entity(xmlns(URI), Parser) :-
517 set_sgml_parser(Parser, xmlns(URI)).
518def_entity(xmlns(NS, URI), Parser) :-
519 set_sgml_parser(Parser, xmlns(NS, URI)).
520
524
525parser_meta_options([], _, []).
526parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
527 !,
528 parser_meta_options(T0, M, T).
529parser_meta_options([H|T0], M, [H|T]) :-
530 parser_meta_options(T0, M, T).
531
532
536
537set_input_location(Parser, _In) :-
538 get_sgml_parser(Parser, file(_)),
539 !.
540set_input_location(Parser, In) :-
541 stream_property(In, file_name(File)),
542 !,
543 set_sgml_parser(Parser, file(File)),
544 stream_property(In, position(Pos)),
545 set_sgml_parser(Parser, position(Pos)).
546set_input_location(_, _).
547
548 551
558
559load_sgml_file(File, Term) :-
560 load_sgml(File, Term, []).
561
568
569load_xml_file(File, Term) :-
570 load_xml(File, Term, []).
571
578
579load_html_file(File, DOM) :-
580 load_html(File, DOM, []).
581
608
609load_html(File, Term, M:Options) :-
610 current_prolog_flag(html_dialect, Dialect),
611 dtd(Dialect, DTD),
612 merge_options(Options,
613 [ dtd(DTD),
614 dialect(Dialect),
615 max_errors(-1),
616 syntax_errors(quiet)
617 ], Options1),
618 load_structure(File, Term, M:Options1).
619
627
628load_xml(Input, DOM, M:Options) :-
629 merge_options(Options,
630 [ dialect(xml)
631 ], Options1),
632 load_structure(Input, DOM, M:Options1).
633
641
642load_sgml(Input, DOM, M:Options) :-
643 merge_options(Options,
644 [ dialect(sgml)
645 ], Options1),
646 load_structure(Input, DOM, M:Options1).
647
648
649
650 653
661
662xml_quote_attribute(In, Quoted) :-
663 xml_quote_attribute(In, Quoted, ascii).
664
665xml_quote_cdata(In, Quoted) :-
666 xml_quote_cdata(In, Quoted, ascii).
667
671
672xml_name(In) :-
673 xml_name(In, ascii).
674
675
676 679
691
692
693 696
701
702xml_is_dom(0) :- !, fail. 703xml_is_dom(List) :-
704 is_list(List),
705 !,
706 xml_is_content_list(List).
707xml_is_dom(Term) :-
708 xml_is_element(Term).
709
710xml_is_content_list([]).
711xml_is_content_list([H|T]) :-
712 xml_is_content(H),
713 xml_is_content_list(T).
714
715xml_is_content(0) :- !, fail.
716xml_is_content(pi(Pi)) :-
717 !,
718 atom(Pi).
719xml_is_content(CDATA) :-
720 atom(CDATA),
721 !.
722xml_is_content(CDATA) :-
723 string(CDATA),
724 !.
725xml_is_content(Term) :-
726 xml_is_element(Term).
727
728xml_is_element(element(Name, Attributes, Content)) :-
729 dom_name(Name),
730 dom_attributes(Attributes),
731 xml_is_content_list(Content).
732
733dom_name(NS:Local) :-
734 atom(NS),
735 atom(Local),
736 !.
737dom_name(Local) :-
738 atom(Local).
739
740dom_attributes(0) :- !, fail.
741dom_attributes([]).
742dom_attributes([H|T]) :-
743 dom_attribute(H),
744 dom_attributes(T).
745
746dom_attribute(Name=Value) :-
747 dom_name(Name),
748 atomic(Value).
749
750
751 754:- multifile
755 prolog:message/3. 756
758
759prolog:message(sgml(Parser, File, Line, Message)) -->
760 { get_sgml_parser(Parser, dialect(Dialect))
761 },
762 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
763
764
765 768
769:- multifile
770 prolog:called_by/2. 771
772prolog:called_by(sgml_parse(_, Options), Called) :-
773 findall(Meta, meta_call_term(_, Meta, Options), Called).
774
775meta_call_term(T, G+N, Options) :-
776 T = call(Event, G),
777 pmember(T, Options),
778 call_params(Event, Term),
779 functor(Term, _, N).
780
781pmember(X, List) :- 782 nonvar(List),
783 List = [H|T],
784 ( X = H
785 ; pmember(X, T)
786 ).
787
788call_params(begin, begin(tag,attributes,parser)).
789call_params(end, end(tag,parser)).
790call_params(cdata, cdata(cdata,parser)).
791call_params(pi, pi(cdata,parser)).
792call_params(decl, decl(cdata,parser)).
793call_params(error, error(severity,message,parser)).
794call_params(xmlns, xmlns(namespace,url,parser)).
795call_params(urlns, urlns(url,url,parser)).
796
797 800
801:- multifile
802 sandbox:safe_primitive/1,
803 sandbox:safe_meta_predicate/1. 804
805sandbox:safe_meta_predicate(sgml:load_structure/3).
806sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
807 dtd_alias(Dialect, _).
808sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
809sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
810sandbox:safe_primitive(sgml:xml_name(_,_)).
811sandbox:safe_primitive(sgml:xml_basechar(_)).
812sandbox:safe_primitive(sgml:xml_ideographic(_)).
813sandbox:safe_primitive(sgml:xml_combining_char(_)).
814sandbox:safe_primitive(sgml:xml_digit(_)).
815sandbox:safe_primitive(sgml:xml_extender(_)).
816sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
817sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
818sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))