View source with formatted comments or as raw
    1/*  Part of ClioPatria SeRQL and SPARQL server
    2
    3    Author:        Michiel Hildebrand
    4    Author:        Jan Wielemaker
    5    E-mail:        michielh@few.vu.nl
    6    WWW:           http://www.swi-prolog.org
    7    Copyright (C): 2016, VU University Amsterdam
    8
    9    This program is free software; you can redistribute it and/or
   10    modify it under the terms of the GNU General Public License
   11    as published by the Free Software Foundation; either version 2
   12    of the License, or (at your option) any later version.
   13
   14    This program is distributed in the hope that it will be useful,
   15    but WITHOUT ANY WARRANTY; without even the implied warranty of
   16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   17    GNU General Public License for more details.
   18
   19    You should have received a copy of the GNU General Public
   20    License along with this library; if not, write to the Free Software
   21    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   22
   23    As a special exception, if you link this library with other files,
   24    compiled with a Free Software compiler, to produce an executable, this
   25    library does not by itself cause the resulting executable to be covered
   26    by the GNU General Public License. This exception does not however
   27    invalidate any other reasons why the executable file might be covered by
   28    the GNU General Public License.
   29*/
   30
   31:- module(api_lod,
   32	  [ lod_api/2			% +Request
   33	  ]).   34
   35:- use_module(library(http/http_dispatch)).   36:- use_module(library(http/http_json)).   37:- use_module(library(http/http_host)).   38:- use_module(library(http/http_request_value)).   39:- use_module(library(http/http_cors)).   40:- use_module(library(semweb/rdf_db)).   41:- use_module(library(semweb/rdf_json)).   42:- use_module(library(semweb/rdf_describe)).   43:- use_module(library(settings)).   44:- use_module(library(option)).   45:- use_module(library(rdf_write)).   46:- use_module(library(semweb/rdf_turtle_write)).   47:- use_module(library(uri)).   48:- use_module(library(debug)).   49:- use_module(library(apply)).   50:- use_module(library(dcg/basics)).   51:- use_module(library(base64)).   52:- use_module(library(utf8)).   53
   54
   55/** <module> LOD - Linked Open Data server
   56
   57Linked (Open) Data turns RDF URIs   (indentifiers) into URLs (locators).
   58Requesting the data  behind  the  URL   returns  a  description  of  the
   59resource. So, if we see   a resource http://example.com/employe/bill, we
   60get do an HTTP GET  request  and   expect  to  receive  a description of
   61_bill_.  This module adds LOD facilities to ClioPatria.
   62
   63---++ Running the LOD server
   64
   65There are several ways to run the LOD server.
   66
   67    1. The simplest way to realise LOD is to run ClioPatria there where
   68    the authority component of the URL points to (see uri_components/2
   69    for decomposing URIs).  This implies you must be able to create a
   70    DNS binding for the host and be able to run ClioPatria there.
   71
   72    2. Sometimes the above does not work, because the port is already
   73    assigned to another machine, you are not allowed to run ClioPatria
   74    on the target host, the target is behind a firewall, etc. In that
   75    case, notable if the host runs Apache, you can exploit the Apache
   76    module =mod_proxy= and proxy the connections to a location where
   77    ClioPatria runs. If you ensure that the path on Apache is the same
   78    as the path on ClioPatria, the following Apache configuration rule
   79    solves the problem:
   80
   81    ==
   82    ProxyPass /rdf/ http://cliopatria-host:3020/rdf/
   83    ==
   84
   85    3. Both above methods require no further configuration.
   86    Unfortunately, they require a registered domain control over DNS
   87    and administrative rights over certain machines.  A solution that
   88    doesn't require this is to use www.purl.org.  This allows you to
   89    redirect URLs within the purl domain to any location you control.
   90    The redirection method can be defined with purl.  In the semantic
   91    web community, we typically use *|See other|* (303).  The catch
   92    is that if the address arrives at ClioPatria, we no longer know
   93    where it came from.  This is not a problem in (1), as there was
   94    no redirect.  It is also not a problem in (2), because Apache
   95    adds a header =|x-forwarded-host|=.  Unfortunately, there is
   96    no way to tell you are activated through a redirect, let alone
   97    where the redirect came from.
   98
   99    To deal with this situation, we use the redirected_from option of
  100    lod_api/2. For example, if http://www.purl.org/vocabularies/myvoc/
  101    is redirected to /myvoc/ on ClioPatria, we use:
  102
  103    ==
  104    :- http_handler('/myvoc/',
  105		    lod_api([ redirected_from('http://www.purl.org/vocabularies/myvoc/')
  106			    ]),
  107		    [ prefix ]).
  108    ==
  109
  110By default, there is no HTTP handler pointing to lod_api/2. The example
  111above describes how to deal with redirected URIs. The cases (1) and (2)
  112must also be implemented by registering a handler. This can be as blunt
  113as registering a handler for the root of the server, but typically one
  114would use one or more handlers that deal with sub-trees that act as
  115Linked Data repositories. Handler declarations should use absolute
  116addresses to guarantee a match with the RDF URIs, even if the server is
  117relocated by means of the http:prefix setting. For example:
  118
  119    ==
  120    :- http_handler('/rdf/', lod_api([]), [prefix]).
  121    ==
  122
  123@see http://linkeddata.org/
  124*/
  125
  126:- setting(lod:redirect, boolean, false,
  127	   'If true, redirect from accept-header to extension').  128
  129%%	lod_api(+Options, +Request)
  130%
  131%	Reply to a Linked Data request. The  handler is capable of three
  132%	output formats. It decides on the   desired  format based on the
  133%	HTTP =Accept= header-field. If no acceptable format is found, it
  134%	replies with a human-readable description  of the resource using
  135%	ClioPatria RDF browser-page as defined by list_resource//2.
  136%
  137%	Options:
  138%
  139%	    * redirected_from(+URL)
  140%	    This option must be provided when using a purl.org or
  141%	    similar redirect.  See overall documentation of this
  142%	    library.
  143%
  144%	    * bounded_description(+Type)
  145%	    Description style to use.  See rdf_bounded_description/4.
  146%	    The default is =cbd= (Concise Bounded Description)
  147
  148lod_api(_Options, Request) :-
  149	\+ memberchk(path_info(_), Request), !,
  150	accepts(Request, AcceptList),
  151	preferred_format(AcceptList, Format),
  152	(   Format == html
  153	->  http_link_to_id(home, [], Redirect)
  154	;   http_link_to_id(well_known_void, [], Redirect)
  155	),
  156	http_redirect(see_other, Redirect, Request).
  157lod_api(_Options, Request) :-
  158	memberchk(path_info('/.well-known/void'), Request), !,
  159	http_link_to_id(well_known_void, [], Redirect),
  160	http_redirect(see_other, Redirect, Request).
  161lod_api(Options, Request) :-
  162	lod_uri(Request, URI, Options),
  163	debug(lod, 'LOD URI: ~q', [URI]),
  164	accepts(Request, AcceptList),
  165	triple_filter(Request, Filter),
  166	cors_enable,
  167	lod_request(URI, AcceptList, Request, Filter, Options).
  168
  169accepts(Request, AcceptList) :-
  170	(   memberchk(accept(AcceptHeader), Request)
  171	->  (   atom(AcceptHeader)	% compatibility
  172	    ->	http_parse_header_value(accept, AcceptHeader, AcceptList)
  173	    ;	AcceptList = AcceptHeader
  174	    )
  175	;   AcceptList = []
  176	).
  177
  178%%	triple_filter(+Request, -Filter) is det.
  179%
  180%	Extract Triple-Filter from Request.  Ignores the filter if it
  181%	is invalid.
  182
  183triple_filter(Request, Filter) :-
  184	catch(phrase(triple_filter(Request), Filter), E,
  185	      (print_message(warning, E),fail)), !.
  186triple_filter(_, []).
  187
  188
  189%%	triple_filter(+Text)//
  190%
  191%	Translate an RDF triple pattern into a list of rdf(S,P,O) terms.
  192
  193triple_filter([]) -->
  194	[].
  195triple_filter([triple_filter(Filter)|T]) --> !,
  196	one_triple_filter(Filter),
  197	triple_filter(T).
  198triple_filter([_|T]) -->
  199	triple_filter(T).
  200
  201one_triple_filter(Encoded) -->
  202	{ string_codes(Encoded, EncCodes),
  203	  phrase(base64(UTF8Bytes), EncCodes),
  204	  phrase(utf8_codes(PlainCodes), UTF8Bytes),
  205	  string_codes(Filter, PlainCodes),
  206	  split_string(Filter, "\r\n", "\r\n", Filters),
  207	  maplist(map_triple_filter, Filters, Triples)
  208	},
  209	string(Triples).
  210
  211map_triple_filter(String, rdf(S,P,O)) :-
  212	split_string(String, "\s\t", "\s\t", [SS,SP,SO]),
  213	triple_term(SS, S),
  214	triple_term(SP, P),
  215	triple_term(SO, O).
  216
  217triple_term("?", _) :- !.
  218triple_term(S, N) :-
  219	string_codes(S, Codes),
  220	phrase(sparql_grammar:graph_term(N), Codes).
  221
  222%%	lod_request(+URI, +AcceptList, +Request, +Filter, +Options)
  223%
  224%	Handle an LOD request.
  225
  226lod_request(URI, AcceptList, Request, Filter, Options) :-
  227	lod_resource(URI), !,
  228	preferred_format(AcceptList, Format),
  229	debug(lod, 'LOD Format: ~q', [Format]),
  230	(   cliopatria:redirect_uri(Format, URI, SeeOther)
  231	->  http_redirect(see_other, SeeOther, Request)
  232	;   setting(lod:redirect, true),
  233	    redirect(URI, AcceptList, SeeOther)
  234	->  http_redirect(see_other, SeeOther, Request)
  235	;   lod_describe(Format, URI, Request, Filter, Options)
  236	).
  237lod_request(URL, _AcceptList, Request, Filter, Options) :-
  238	format_request(URL, URI, Format), !,
  239	lod_describe(Format, URI, Request, Filter, Options).
  240lod_request(URI, _AcceptList, _Request, _Filter, _) :-
  241	throw(http_reply(not_found(URI))).
  242
  243
  244%%	lod_uri(+Request, -URI, +Options)
  245%
  246%	URI is the originally requested URI.   This predicate deals with
  247%	redirections if the HTTP handler was registered using the option
  248%	redirected_from(URL). Otherwise it resolves   the correct global
  249%	URI using http_current_host/4.
  250
  251lod_uri(Request, URI, Options) :-
  252	memberchk(redirected_from(Org), Options),
  253	memberchk(request_uri(ReqURI), Request),
  254	handler_location(Request, Location),
  255	atom_concat(Location, Rest, ReqURI),
  256	atom_concat(Org, Rest, URI).
  257lod_uri(Request, URI, _) :-
  258	memberchk(request_uri(ReqURI), Request),
  259	http_current_host(Request, Host, Port,
  260			  [ global(true)
  261			  ]),
  262	(   Port == 80
  263	->  atomic_list_concat(['http://', Host, ReqURI], URI)
  264	;   atomic_list_concat(['http://', Host, :, Port, ReqURI], URI)
  265	).
  266
  267
  268%%	handler_location(+Request, -Location) is det.
  269%
  270%	Location is the requested location on  the server. This includes
  271%	the handler location, normally concatenated with the path_info.
  272
  273handler_location(Request, Location) :-
  274	memberchk(path(Path), Request),
  275	(   memberchk(path_info(Rest), Request),
  276	    atom_concat(Location, Rest, Path)
  277	->  true
  278	;   Location = Path
  279	).
  280
  281
  282%%	redirect(+URI, +AcceptList, -RedirectURL)
  283%
  284%	Succeeds if URI is in the store and a RedirectURL is found for
  285%	it.
  286
  287redirect(URI, AcceptList, To) :-
  288	lod_resource(URI),
  289	preferred_format(AcceptList, Format),
  290	(   cliopatria:redirect_uri(Format, URI, To)
  291	->  true
  292	;   uri_components(URI, URIComponents),
  293	    uri_data(path, URIComponents, Path0),
  294	    format_suffix(Format, Suffix),
  295	    file_name_extension(Path0, Suffix, Path),
  296	    uri_data(path, URIComponents, Path, ToComponents),
  297	    uri_components(To, ToComponents)
  298	).
  299
  300
  301%%	preferred_format(+AcceptList, -Format) is det.
  302%
  303%	Format is the highest ranked mimetype found in the Acceptlist of
  304%	the request and that  we  can   support.  Expects  an AcceptList
  305%	sorted by rank.
  306
  307preferred_format(AcceptList, Format) :-
  308	member(media(MimeType,_,_,_), AcceptList),
  309	ground(MimeType),
  310	mimetype_format(MimeType, Format), !.
  311preferred_format(_, html).
  312
  313
  314%%	format_request(+URL, -URI, -Format) is semidet.
  315%
  316%	True if URL contains a suffix   that  corresponds to a supported
  317%	output format, and the global URI occurs in the database.
  318
  319format_request(URL, URI, Format) :-
  320	uri_components(URL, URLComponents),
  321	uri_data(path, URLComponents, Path),
  322	file_name_extension(Base, Ext, Path),
  323	(   format_suffix(Format, Ext),
  324	    mimetype_format(_, Format)
  325	->  true
  326	),
  327	uri_data(path, URLComponents, Base, PlainComponents),
  328	uri_components(URI, PlainComponents),
  329	lod_resource(URI).
  330
  331
  332%%	lod_describe(+Format, +URI, +Request, +Filter, +Options) is det.
  333%
  334%	Write an HTTP document  describing  URI   to  in  Format  to the
  335%	current output. Format is defined by mimetype_format/2.
  336
  337lod_describe(html, URI, Request, _, _) :- !,
  338	(   rdf_graph(URI)
  339	->  http_link_to_id(list_graph, [graph=URI], Redirect)
  340	;   http_link_to_id(list_resource, [r=URI], Redirect)
  341	),
  342	http_redirect(see_other, Redirect, Request).
  343lod_describe(Format, URI, _Request, Filter, Options) :-
  344	lod_description(URI, RDF, Filter, Options),
  345	send_graph(Format, RDF).
  346
  347send_graph(xmlrdf, RDF) :-
  348	format('Content-type: application/rdf+xml; charset=UTF-8~n~n'),
  349	rdf_write_xml(current_output, RDF).
  350send_graph(json, RDF) :-
  351	graph_json(RDF, JSON),
  352	reply_json(JSON).
  353send_graph(turtle, RDF) :-
  354	format('Content-type: text/turtle; charset=UTF-8~n~n'),
  355	rdf_save_turtle(stream(current_output),
  356			[ expand(triple_in(RDF)),
  357			  only_known_prefixes(true),
  358			  silent(true)
  359			]).
  360
  361%%	triple_in(+RDF, ?S,?P,?O, ?G) is nondet.
  362%
  363%	Lookup a triple in the graph RDF, represented as a list of
  364%	rdf(S,P,O).
  365%
  366%	@tbd	Describe required indexing from rdf_save_turtle/2 and
  367%		implement that if the graph is big.
  368
  369:- public triple_in/5.			% called from send_graph/2.
  370
  371triple_in(RDF, S,P,O,_G) :-
  372	member(rdf(S,P,O), RDF).
  373
  374
  375%%	lod_description(+URI, -RDF, +Filter, +Options) is det.
  376%
  377%	RDF is a  graph  represented  as   a  list  of  rdf(S,P,O)  that
  378%	describes URI.
  379%
  380%	This predicate is hooked   by  cliopatria:lod_description/2. The
  381%	default is implemented by resource_CBD/3.
  382%
  383%	@see SPARQL DESCRIBE
  384
  385lod_description(URI, RDF, _, _) :-
  386	cliopatria:lod_description(URI, RDF), !.
  387lod_description(URI, RDF, Filter, Options) :-
  388	option(bounded_description(Type), Options, cbd),
  389	echo_filter(Filter),
  390	rdf_bounded_description(rdf, Type, Filter, URI, RDF).
  391
  392echo_filter([]) :- !.
  393echo_filter(Filters) :-
  394	copy_term(Filters, Filters1),
  395	term_variables(Filters1, Vars),
  396	maplist(=(?), Vars),
  397	filters_to_ntriples(Filters1, NTriples),
  398	split_string(NTriples, "\n", "\n.\s", Strings0),
  399	maplist(insert_q, Strings0, Strings),
  400	atomics_to_string(Strings, "\n", String),
  401	base64(String, Encoded),
  402	format('Triple-Filter: ~w\r\n', [Encoded]).
  403
  404insert_q(String, QString) :-
  405	split_string(String, " ", "", [S,P,O|M]),
  406	map_q(S, QS),
  407	map_q(P, QP),
  408	map_q(O, QO),
  409	atomics_to_string([QS,QP,QO|M], " ", QString).
  410
  411map_q("<?>", "?") :- !.
  412map_q(S, S).
  413
  414filters_to_ntriples(Filters, String) :-
  415	with_output_to(
  416	    string(String),
  417	    rdf_save_ntriples(stream(current_output),
  418			      [ expand(api_lod:triple_in(Filters))])).
  419
  420
  421%%	mimetype_format(?MimeType, ?Format) is nondet.
  422%
  423%	Conversion between mimetypes and formats.
  424
  425mimetype_format(application/'rdf+xml',	xmlrdf).
  426mimetype_format(application/json,	json).
  427mimetype_format(application/'x-turtle',	turtle).
  428mimetype_format(text/turtle,		turtle).
  429mimetype_format(text/html,		html).
  430
  431%%	format_suffix(?Format, ?Suffix) is nondet.
  432%
  433%	Suffix is the file name extension used for Format.
  434
  435format_suffix(xmlrdf, rdf).
  436format_suffix(json,   json).
  437format_suffix(html,   html).
  438format_suffix(turtle, ttl).
  439
  440
  441%%	lod_resource(+Resource) is semidet.
  442%
  443%	True if Resource is an  existing   resource  for the LOD server.
  444%	Typically,  this  means  it  appears  as  a  subject,  but  when
  445%	considering symmetric bounded descriptions,  it should certainly
  446%	also hold for resources that only appear as object.
  447
  448lod_resource(Resource) :-
  449	(   rdf(Resource, _, _)
  450	;   rdf(_, Resource, _)
  451	;   rdf(_, _, Resource)
  452	;   rdf_graph(Resource)
  453	), !.
  454
  455
  456		 /*******************************
  457		 *	       HOOKS		*
  458		 *******************************/
  459
  460:- multifile
  461	cliopatria:redirect_uri/3,
  462	cliopatria:lod_description/2.  463
  464%%	cliopatria:redirect_uri(+Format, +URI, -RedirectURL)
  465%
  466%	Compose a RedirectionURL based on the  output Format and the URI
  467%	that is in our RDF database. For example, this could map the URI
  468%	http://example.com/employe/bill   into   Bill's    homepage   at
  469%	http://example.com/~bill if Format is =html=.  The default is to
  470%	a format-specific extension  to  the   path  component  of  URI,
  471%	returning  e.g.,  http://example.com/employe/bill.rdf    if  the
  472%	requested format is RDF.
  473%
  474%       @see This hook is used by redirect/3.
  475%       @param Format is one of =xmlrdf=, =turtle, =json= or =html=.
  476
  477
  478%%	cliopatria:lod_description(+URI, -RDF:list(rdf(s,p,o)))
  479%
  480%	RDF is list of triples describing URI. The default is to use the
  481%	Concise Bounded Description as implemented by resource_CBD/3.
  482%
  483%	@see This hook is used by lod_description/2
  484%	@see library(semweb/rdf_describe) provides several definitions
  485%	of bounded descriptions.