View source with formatted comments or as raw
    1/*  Part of ClioPatria SeRQL and SPARQL server
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@cs.vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (C): 2010, University of Amsterdam,
    7		   VU University Amsterdam
    8
    9    This program is free software; you can redistribute it and/or
   10    modify it under the terms of the GNU General Public License
   11    as published by the Free Software Foundation; either version 2
   12    of the License, or (at your option) any later version.
   13
   14    This program is distributed in the hope that it will be useful,
   15    but WITHOUT ANY WARRANTY; without even the implied warranty of
   16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   17    GNU General Public License for more details.
   18
   19    You should have received a copy of the GNU General Public
   20    License along with this library; if not, write to the Free Software
   21    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   22
   23    As a special exception, if you link this library with other files,
   24    compiled with a Free Software compiler, to produce an executable, this
   25    library does not by itself cause the resulting executable to be covered
   26    by the GNU General Public License. This exception does not however
   27    invalidate any other reasons why the executable file might be covered by
   28    the GNU General Public License.
   29*/
   30
   31:- module(url_cache,
   32	  [ url_cache/3,		% +URI, -File, -MimeType
   33	    url_cache_file/4,		% +URL, +Dir, +Ext, -Path)
   34	    url_cache_delete/1,		% +URI
   35	    url_cached/2,		% ?URL, ?Property
   36	    url_cached/3,		% +Dir, ?URL, ?Property
   37	    url_cache_reset_server_status/0,
   38	    url_cache_reset_server_status/1 % +Server
   39	  ]).   40:- use_module(library(http/http_open)).   41:- if(exists_source(library(http/http_ssl_plugin))).   42:- use_module(library(http/http_ssl_plugin)).   43:- endif.   44:- use_module(library(http/mimetype)).   45:- use_module(library(url)).   46:- use_module(library(debug)).   47:- use_module(library(error)).   48:- use_module(library(settings)).   49:- use_module(library(base64)).   50:- use_module(library(utf8)).   51:- use_module(library(lists)).   52:- use_module(library(sha)).   53
   54:- setting(cache:url_cache_directory, atom, 'cache/url',
   55	   'Directory to cache fetched remote URLs').   56
   57/** <module> Cache the content of external URLs in local files
   58
   59This library provides a cache  for  data   stored  in  extenal URLs. The
   60content of each URL is kept in a  file and described by a meta-file that
   61remembers the mime-type, the original URL, when   it was fetched and -if
   62provided by the server- the last-modified stamp.
   63
   64@tbd	The current implementation does not validate the cache content, nor
   65	does it honour the HTTP cache directives.
   66*/
   67
   68
   69%%	url_cache(+URI:atom, -Path:atom, -MimeType:atom) is det.
   70%
   71%	Return the content of URI in  a   file  at Path. MimeType is the
   72%	Mime-type returned by the server.
   73%
   74%	@error	existence_error(url, URL)
   75%		Server did not respond with 200 OK
   76%	@error  existence_error(source_sink, url_cache(.))
   77%		Cache directory does not exist
   78%	@bug	Does not check modification time and cache validity
   79
   80url_cache(URL, Path, MimeType) :-
   81	url_cache_dir(Dir),
   82	url_cache_file(URL, Dir, url, Path),
   83	atom_concat(Path, '.meta', TypeFile),
   84	(   exists_file(Path),
   85	    exists_file(TypeFile),
   86	    read_meta_file(TypeFile, mime_type(MimeType0))
   87	->  MimeType = MimeType0
   88	;   fetch_url(URL, Path, MimeType, Modified),
   89	    get_time(NowF),
   90	    Now is round(NowF),
   91	    open(TypeFile, write, Out,
   92		 [ encoding(utf8),
   93		   lock(write)
   94		 ]),
   95	    format(Out,
   96		   'mime_type(~q).~n\c
   97		    url(~q).~n\c
   98		    fetched(~q).~n',
   99		   [MimeType, URL, Now]),
  100	    (	nonvar(Modified)
  101	    ->	format(Out, 'last_modified(~q).~n', [Modified])
  102	    ;	true
  103	    ),
  104	    close(Out)
  105	).
  106
  107read_meta_file(MimeFile, Term) :-
  108	setup_call_cleanup(open(MimeFile, read, In,
  109				[ encoding(utf8),
  110				  lock(read)
  111				]),
  112			   ndet_read(In, Term),
  113			   close(In)).
  114
  115ndet_read(Stream, Term) :-
  116	repeat,
  117	read(Stream, Term0),
  118	(   Term0 == end_of_file
  119	->  !, fail
  120	;   Term = Term0
  121	).
  122
  123%%	url_cache_delete(+URL) is det.
  124%
  125%	Delete an URL from the cache. Succeeds,  even if the cache files
  126%	do not exist.
  127%
  128%	@error	Throws exceptions from delete_file/1 other than
  129%		existence errors.
  130
  131url_cache_delete(URL) :-
  132	url_cache_dir(Dir),
  133	url_cache_file(URL, Dir, url, Path),
  134	atom_concat(Path, '.meta', TypeFile),
  135	catch(delete_file(TypeFile), E0, true),
  136	catch(delete_file(Path), E1, true),
  137	error_ok(E0),
  138	error_ok(E1).
  139
  140error_ok(E) :-
  141	subsumes_term(error(existence_error(file, _), _), E), !.
  142error_ok(E) :-
  143	throw(E).
  144
  145%%	url_cache_dir(-Dir) is det
  146%
  147%	Return or create the URL caching directory
  148
  149url_cache_dir(Dir) :-
  150	setting(cache:url_cache_directory, Dir),
  151	make_directory_path(Dir).
  152
  153%%	make_directory_path(+Dir) is det.
  154%
  155%	Create Dir and all required components.
  156
  157make_directory_path(Dir) :-
  158	make_directory_path_2(Dir), !.
  159make_directory_path(Dir) :-
  160	permission_error(create, directory, Dir).
  161
  162make_directory_path_2(Dir) :-
  163	exists_directory(Dir), !.
  164make_directory_path_2(Dir) :-
  165	Dir \== (/), !,
  166	file_directory_name(Dir, Parent),
  167	make_directory_path_2(Parent),
  168	make_directory(Dir).
  169
  170%%	fetch_url(+URL:atom, +Path:atom, -MimeType:atom) is det.
  171%
  172%	@error	existence_error(url, URL)
  173
  174fetch_url(URL, File, MimeType, Modified) :-
  175	parse_url_ex(URL, Parts),
  176	server(Parts, Server),
  177	(   allow(Server)
  178	->  true
  179	;   throw(error(existence_error(url, URL),
  180			context(url_cache/3, 'Too many errors from server')))
  181	),
  182	get_time(Now),
  183	(   catch(fetch_url_raw(URL, File,
  184				MimeType, Modified), E, true)
  185	->  (   var(E)
  186	    ->	register_stats(Server, Now, true)
  187	    ;	register_stats(Server, Now, error(E)),
  188		throw(E)
  189	    )
  190	;   register_stats(Server, Now, false)
  191	).
  192
  193server(Parts, Server) :-
  194	memberchk(host(Host), Parts), !,
  195	(   memberchk(port(Port), Parts)
  196	->  Server = Host:Port
  197	;   Server = Host
  198	).
  199server(_,_) :-
  200	assertion(false).
  201
  202/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  203Server status assessment. We keep a   health-status  of the server using
  204the following rules:
  205
  206    * Range -100 .. 100
  207    * Ok if > 0
  208    * The initial status is 100 (healthy)
  209    * Possitive results add 20-4*sqrt(Time)
  210    * Negative results subtract 10
  211    * Add 1 per minute since last status.
  212
  213TBD: frequency matters: requests should not pile up.
  214- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  215
  216:- dynamic
  217	server_status/3.		% Server, Status, Last
  218
  219allow(Server) :-
  220	server_status(Server, Status),
  221	debug(url_cache, 'Status ~q: ~w', [Server, Status]),
  222	Status > 0.
  223
  224server_status(Server, Status) :-
  225	get_time(Now),
  226	with_mutex(url_cache_status,
  227		   server_status(Server, S0, T0)), !,
  228	Status is min(100, S0 + round(Now-T0)//60).
  229server_status(_, 100).
  230
  231register_stats(Server, Start, Result) :-
  232	get_time(Now),
  233	Time is Now - Start,
  234	(   server_status(Server, S0, T0)
  235	->  true
  236	;   S0 = 100,
  237	    T0 = Now
  238	),
  239	Since is Start - T0,
  240	update_status(Result, Time, Since, S0, S1),
  241	with_mutex(url_cache_status,
  242		   (   retractall(server_status(Server, _, _)),
  243		       assert(server_status(Server, S1, Start)))).
  244
  245update_status(true, Time, Since, S0, S) :- !,
  246	S is min(100, S0 + round(20-4*sqrt(Time)) + round(Since)//60).
  247update_status(_, Time, _Since, S0, S) :- !,
  248	S is max(-100, S0 - (10 + round(Time))).
  249
  250
  251%%	url_cache_reset_server_status is det.
  252%%	url_cache_reset_server_status(+Server) is det.
  253%
  254%	Reset the status of the given server or all servers.
  255
  256url_cache_reset_server_status :-
  257	with_mutex(url_cache_status,
  258		   retractall(server_status(_,_,_))).
  259url_cache_reset_server_status(Server) :-
  260	must_be(atom, Server),
  261	with_mutex(url_cache_status,
  262		   retractall(server_status(Server,_,_))).
  263
  264
  265%%	fetch_url_raw(+URL:atom, +Path:atom, -MimeType:atom, -Modified) is det.
  266%
  267%	Fetch data from URL and put it   into the file Path. MimeType is
  268%	unified  with  the  MIME-type  as  reported  by  the  server  or
  269%	text/plain if the server did not provide a MIME-Type.
  270%
  271%	@error	existence_error(url, URL)
  272
  273fetch_url_raw(URL, File, MimeType, Modified) :-
  274	debug(url_cache, 'Downloading ~w ...', [URL]),
  275	atom_concat(File, '.tmp', TmpFile),
  276	(   catch(fetch_to_file(URL, TmpFile, Code, Header), E, true)
  277	->  true
  278	;   E = predicate_failed(http_get/3)
  279	),
  280	(   var(E)
  281	->  true
  282	;   (   debugging(url_cache)
  283	    ->	print_message(error, E)
  284	    ;	true
  285	    ),
  286	    catch(delete_file(TmpFile), _, true),
  287	    (	debugging(url_cache)
  288	    ->	message_to_string(E, Msg),
  289		debug(url_cache, 'Download failed: ~w', [Msg])
  290	    ;	true
  291	    ),
  292	    throw(E)
  293	),
  294	(   Code == 200
  295	->  rename_file(TmpFile, File)
  296	;   catch(delete_file(TmpFile), _, true),
  297	    throw(error(existence_error(url, URL), _))
  298	),
  299	(   memberchk(content_type(MimeType0), Header)
  300	->  true
  301	;   MimeType0 = 'text/plain'
  302	),
  303	ignore(memberchk(last_modified(Modified), Header)),
  304	debug(url_cache, 'Downloaded ~w, mime-type: ~w',
  305	      [URL, MimeType0]),
  306	MimeType = MimeType0.
  307
  308fetch_to_file(URL, File, Code,
  309	      [ content_type(ContentType),
  310		last_modified(LastModified)
  311	      ]) :-
  312	setup_call_cleanup(
  313	    open(File, write, Out, [ type(binary) ]),
  314	    setup_call_cleanup(
  315		http_open(URL, In,
  316			  [ header(content_type, ContentType),
  317			    header(last_modified, LastModified),
  318			    status_code(Code),
  319			    cert_verify_hook(ssl_verify)
  320			  ]),
  321		copy_stream_data(In, Out),
  322		close(In)),
  323	    close(Out)).
  324
  325:- public ssl_verify/5.  326
  327%%	ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error)
  328%
  329%	Currently we accept  all  certificates.
  330
  331ssl_verify(_SSL,
  332	   _ProblemCertificate, _AllCertificates, _FirstCertificate,
  333	   _Error).
  334
  335parse_url_ex(URL, Parts) :-
  336	is_list(URL), !,
  337	Parts = URL.
  338parse_url_ex(URL, Parts) :-
  339	parse_url(URL, Parts), !.
  340parse_url_ex(URL, _) :-
  341	domain_error(url, URL).
  342
  343%%	url_cache_file(+URL, +Dir, +Ext, -Path) is det
  344%
  345%	Determine location of cache-file for the   given  URL in Dir. If
  346%	Ext is provided, the  returned  Path   is  ensured  to  have the
  347%	specified extension.
  348
  349url_cache_file(URL, Dir, Ext, Path) :-
  350	url_to_file(URL, Ext, File),
  351	sub_atom(File, 0, 2, _, L1),
  352	ensure_dir(Dir, L1, Dir1),
  353	sub_atom(File, 2, 2, _, L2),
  354	ensure_dir(Dir1, L2, Dir2),
  355	sub_atom(File, 4, _, 0, LocalFile),
  356	atomic_list_concat([Dir2, /, LocalFile], Path).
  357
  358ensure_dir(D0, Sub, Dir) :-
  359	atomic_list_concat([D0, /, Sub], Dir),
  360	(   exists_directory(Dir)
  361	->  true
  362	;   make_directory(Dir)
  363	).
  364
  365%%	url_to_file(+URL, +Ext, -File) is det.
  366%
  367%	File is a filename for storing URL and has extension Ext. We use
  368%	a cryptographic hash to ensure consistent naming, a name that is
  369%	guaranteed to fit in every sensible filesystem and ensure a good
  370%	distribution of the cache directories.
  371
  372url_to_file(URL, Ext, File) :-
  373	sha_hash(URL, Hash, []),
  374	phrase(hex_digits(Hash), Codes),
  375	string_to_list(String, Codes),
  376	file_name_extension(String, Ext, File).
  377
  378hex_digits([]) -->
  379	"".
  380hex_digits([H|T]) -->
  381	byte(H),
  382	hex_digits(T).
  383
  384byte(Byte) -->
  385	{ High is (Byte>>4) /\ 0xf,
  386	  Low is (Byte /\ 0xf),
  387	  code_type(H, xdigit(High)),
  388	  code_type(L, xdigit(Low))
  389	},
  390	[H,L].
  391
  392
  393		 /*******************************
  394		 *	    READ CACHE		*
  395		 *******************************/
  396
  397%%	url_cached(?URL, ?Property) is nondet.
  398%%	url_cached(+Dir, ?URL, ?Property) is nondet.
  399%
  400%	True if URL is in the cache represented by the directory Dir and
  401%	has Property.  Defined properties are:
  402%
  403%	    * file(-File)
  404%	    File is the cache-file for the given URL
  405%	    * mime_type(-Mime)
  406%	    Mime is the mime-type of the URL as reported by the server
  407%	    * fetched(-Stamp:integer)
  408%	    Timestamp that specifies when the URL was fetched
  409%	    * last_modified(-Modified:atom)
  410%	    If present, this is the modification time as provided by
  411%	    the server.
  412
  413url_cached(URL, Property) :-
  414	url_cache_dir(Dir),
  415	url_cached(Dir, URL, Property).
  416
  417url_cached(Dir, URL, Property) :-
  418	nonvar(URL), !,
  419	url_cache_file(URL, Dir, url, Path),
  420	atom_concat(Path, '.meta', MetaFile),
  421	exists_file(MetaFile),
  422	cache_file_property(Property, MetaFile).
  423url_cached(Dir, URL, Property) :-
  424	nonvar(Property),
  425	Property = file(File),
  426	atom(File),
  427	atom_concat(Dir, Rest, File),
  428	\+ sub_atom(Rest, _, _, _, '../'),
  429	file_name_extension(Base, url, File),
  430	file_name_extension(Base, meta, MetaFile),
  431	exists_file(MetaFile),
  432	once(read_meta_file(MetaFile, url(URL))).
  433url_cached(Dir, URL, Property) :-
  434	atom_concat(Dir, '/??', TopPat),
  435	expand_file_name(TopPat, TopDirs),
  436	member(TopDir, TopDirs),
  437	atom_concat(TopDir, '/??', DirPat),
  438	expand_file_name(DirPat, FileDirs),
  439	member(FileDir, FileDirs),
  440	atom_concat(FileDir, '/*.meta', FilePat),
  441	expand_file_name(FilePat, MetaFiles),
  442	member(MetaFile, MetaFiles),
  443	once(read_meta_file(MetaFile, url(URL))),
  444	check_cache_file(MetaFile, URL),
  445	cache_file_property(Property, MetaFile).
  446
  447check_cache_file(MetaFile, URL) :-
  448	file_name_extension(File, meta, MetaFile),
  449	(   exists_file(File)
  450	->  true
  451	;   print_message(warning, url_cache(no_file(File, MetaFile, URL))),
  452	    delete_file(MetaFile),
  453	    fail
  454	).
  455
  456cache_file_property(Property, MetaFile) :-
  457	var(Property), !,
  458	cache_file_property_ndet(Property, MetaFile).
  459cache_file_property(Property, MetaFile) :-
  460	cache_file_property_ndet(Property, MetaFile), !.
  461
  462
  463cache_file_property_ndet(file(File), MetaFile) :-
  464	file_name_extension(File, meta, MetaFile).
  465cache_file_property_ndet(P, MetaFile) :-
  466	read_meta_file(MetaFile, P),
  467	P \= url(_).
  468
  469		 /*******************************
  470		 *	     MESSAGES		*
  471		 *******************************/
  472
  473:- multifile
  474	prolog:message//1.  475
  476prolog:message(url_cache(no_file(File, _MetaFile, URL))) -->
  477	[ 'URL Cache: file ~q does not exist (URL=~q)'-[File, URL] ]