View source with raw comments or as raw
    1/*  Part of ClioPatria SeRQL and SPARQL server
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (C): 2011-2015, University of Amsterdam,
    7			      VU University Amsterdam
    8
    9    This program is free software; you can redistribute it and/or
   10    modify it under the terms of the GNU General Public License
   11    as published by the Free Software Foundation; either version 2
   12    of the License, or (at your option) any later version.
   13
   14    This program is distributed in the hope that it will be useful,
   15    but WITHOUT ANY WARRANTY; without even the implied warranty of
   16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   17    GNU General Public License for more details.
   18
   19    You should have received a copy of the GNU General Public
   20    License along with this library; if not, write to the Free Software
   21    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   22
   23    As a special exception, if you link this library with other files,
   24    compiled with a Free Software compiler, to produce an executable, this
   25    library does not by itself cause the resulting executable to be covered
   26    by the GNU General Public License. This exception does not however
   27    invalidate any other reasons why the executable file might be covered by
   28    the GNU General Public License.
   29*/
   30
   31:- module(rdf_file_type,
   32	  [ rdf_guess_data_format/2,	% +Stream, ?Format
   33	    rdf_guess_format_and_load/2	% +Stream, +Options
   34	  ]).   35:- use_module(library(semweb/rdf_db)).   36:- use_module(library(memfile)).   37:- use_module(library(sgml)).   38:- use_module(library(lists)).   39:- use_module(library(apply)).   40:- use_module(library(option)).   41:- if(exists_source(library(archive))).   42:- use_module(library(archive)).   43:- endif.

Load RDF data from unknown file-type

*/

 rdf_guess_format_and_load(+Stream, +Options) is det
Guess the RDF format in Stream and load it. Stream must be a repositional stream. Options are passed to rdf_load/2. In addition, it processed the following options:
filename(filename)
Name of the uploaded file.
   59rdf_guess_format_and_load(Stream, Options) :-
   60	option(format(_), Options), !,
   61	rdf_load(stream(Stream), Options).
   62:- if(current_predicate(archive_data_stream/3)).   63rdf_guess_format_and_load(Stream, Options) :-
   64	setup_call_cleanup(
   65	    archive_open(Stream, Archive, [format(all),format(raw)]),
   66	    forall(archive_data_stream(Archive, DataStream, [meta_data(MetaData)]),
   67		   call_cleanup(
   68		       ( member_base_uri(MetaData, Options, Options2),
   69			 option(base_uri(Base), Options2, 'http://example.org/'),
   70			 set_stream(DataStream, file_name(Base)),
   71			 (   file_base_name(Base, FileName),
   72			     non_rdf_file(FileName)
   73			 ->  true
   74			 ;   rdf_guess_data_format(DataStream, Format)
   75			 ->  rdf_load(stream(DataStream), [format(Format)|Options2])
   76			 ;   true
   77			 )
   78		       ),
   79		       close(DataStream))),
   80	    archive_close(Archive)).
   81
   82member_base_uri([_], Options, Options) :- !.
   83member_base_uri(MetaData, Options0, Options) :-
   84	append(MetaPath, [_], MetaData),
   85	maplist(get_dict(name), MetaPath, MetaSegments),
   86	select_option(base_uri(Base0), Options0, Options1, 'http://archive.org'),
   87	atomic_list_concat([Base0|MetaSegments], /, Base),
   88	Options = [base_uri(Base)|Options1].
   89:- else.   90rdf_guess_format_and_load(Stream, Options) :-
   91	rdf_guess_data_format(Stream, Format),
   92	rdf_load(stream(Stream), [format(Format)|Options]).
   93:- endif.   94
   95non_rdf_file(File) :-
   96	file_name_extension(Base, Ext, File),
   97	(   non_rdf_ext(Ext)
   98	->  true
   99	;   downcase_atom(Base, Lower),
  100	    non_rdf_base(Lower)
  101	).
  102
  103non_rdf_ext(pdf).
  104non_rdf_ext(txt).
  105non_rdf_ext(md).
  106non_rdf_ext(doc).
  107
  108non_rdf_base(readme).
  109non_rdf_base(todo).
 rdf_guess_data_format(+Stream, ?Format)
Guess the format of an RDF file from the actual content. Currently, this seeks for a valid XML document upto the rdf:RDF element before concluding that the file is RDF/XML. Otherwise it assumes that the document is Turtle.
To be done
- Recognise Turtle variations from content
  120rdf_guess_data_format(_, Format) :-
  121	nonvar(Format), !.
  122rdf_guess_data_format(Stream, xml) :-
  123	xml_doctype(Stream, _), !.
  124rdf_guess_data_format(Stream, Format) :-
  125	stream_property(Stream, file_name(File)),
  126	file_name_extension(_, Ext, File),
  127	rdf_db:rdf_file_type(Ext, Format), !.
  128rdf_guess_data_format(_, turtle).
 xml_doctype(+Stream, -DocType) is semidet
Parse a stream and get the name of the first XML element and demand that this element defines XML namespaces. Fails if the document is illegal XML before the first element.

Note that it is not possible to define valid RDF/XML without namespaces, while it is not possible to define a valid absolute Turtle URI (using <URI>) with a valid xmlns declaration.

Arguments:
Stream- denotes the input. If peek_string/3 is provided (SWI-Prolog version 7), it is not necessary that the stream can be repositioned. Older versions require a repositionable stream.
  145:- if(current_predicate(peek_string/3)).  146xml_doctype(Stream, DocType) :-
  147	peek_string(Stream, 4096, Start),
  148	setup_call_cleanup(
  149	    open_string_stream(Start, In),
  150	    xml_doctype_2(In, DocType),
  151	    close(In)).
  152:- else.  153xml_doctype(Stream, DocType) :-
  154	xml_doctype_2(Stream, DocType).
  155:- endif.  156
  157xml_doctype_2(Stream, DocType) :-
  158	catch(setup_call_cleanup(make_parser(Stream, Parser, State),
  159				 sgml_parse(Parser,
  160					    [ source(Stream),
  161					      max_errors(1),
  162					      syntax_errors(quiet),
  163					      call(begin, on_begin),
  164					      call(cdata, on_cdata)
  165					    ]),
  166				 cleanup_parser(Stream, Parser, State)),
  167	      E, true),
  168	nonvar(E),
  169	E = tag(DocType).
  170
  171make_parser(Stream, Parser, state(Pos)) :-
  172	stream_property(Stream, position(Pos)),
  173	new_sgml_parser(Parser, []),
  174	set_sgml_parser(Parser, dialect(xmlns)).
  175
  176cleanup_parser(Stream, Parser, state(Pos)) :-
  177	free_sgml_parser(Parser),
  178	set_stream_position(Stream, Pos).
  179
  180on_begin(Tag, Attributes, _Parser) :-
  181	memberchk(xmlns:_=_, Attributes),
  182	throw(tag(Tag)).
  183
  184on_cdata(_CDATA, _Parser) :-
  185	throw(error(cdata)).
  186
  187
  188open_string_stream(String, Stream) :-
  189	new_memory_file(MF),
  190	setup_call_cleanup(
  191	    open_memory_file(MF, write, Out),
  192	    format(Out, '~s', [String]),
  193	    close(Out)),
  194	open_memory_file(MF, read, Stream,
  195			 [ free_on_close(true)
  196			 ])