/*  Part of ClioPatria SeRQL and SPARQL server

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@cs.vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (C): 2010, University of Amsterdam,
		   VU University Amsterdam

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    As a special exception, if you link this library with other files,
    compiled with a Free Software compiler, to produce an executable, this
    library does not by itself cause the resulting executable to be covered
    by the GNU General Public License. This exception does not however
    invalidate any other reasons why the executable file might be covered by
    the GNU General Public License.
*/

:- module(url_cache,
	  [ url_cache/3,		% +URI, -File, -MimeType
	    url_cache_file/4,		% +URL, +Dir, +Ext, -Path
	    url_cache_delete/1,		% +URI
	    url_cached/2,		% ?URL, ?Property
	    url_cached/3,		% +Dir, ?URL, ?Property
	    url_cache_reset_server_status/0,
	    url_cache_reset_server_status/1 % +Server
	  ]).
:- use_module(library(http/http_open)).
:- if(exists_source(library(http/http_ssl_plugin))).
:- use_module(library(http/http_ssl_plugin)).
:- endif.
:- use_module(library(http/mimetype)).
:- use_module(library(url)).
:- use_module(library(debug)).
:- use_module(library(error)).
:- use_module(library(settings)).
:- use_module(library(base64)).
:- use_module(library(utf8)).
:- use_module(library(lists)).
:- use_module(library(sha)).

:- setting(cache:url_cache_directory, atom, 'cache/url',
	   'Directory to cache fetched remote URLs').

Cache the content of external URLs in local files

This library provides a cache for data stored in external URLs. The content of each URL is kept in a file and described by a meta-file that records the mime-type, the original URL, when it was fetched and, if provided by the server, the last-modified stamp.

To be done
- The current implementation does not validate the cache content, nor does it honour the HTTP cache directives.
 url_cache(+URI:atom, -Path:atom, -MimeType:atom) is det
Return the content of URI in a file at Path. MimeType is the Mime-type returned by the server.
Errors
- existence_error(url, URL) Server did not respond with 200 OK
- existence_error(source_sink, url_cache(.)) Cache directory does not exist
bug
- Does not check modification time and cache validity
url_cache(URL, Path, MimeType) :-
	url_cache_dir(Dir),
	url_cache_file(URL, Dir, url, Path),
	atom_concat(Path, '.meta', TypeFile),
	(   exists_file(Path),
	    exists_file(TypeFile),
	    read_meta_file(TypeFile, mime_type(MimeType0))
	->  MimeType = MimeType0
	;   fetch_url(URL, Path, MimeType, Modified),
	    get_time(NowF),
	    Now is round(NowF),
	    open(TypeFile, write, Out,
		 [ encoding(utf8),
		   lock(write)
		 ]),
	    format(Out,
		   'mime_type(~q).~n\c
		    url(~q).~n\c
		    fetched(~q).~n',
		   [MimeType, URL, Now]),
	    (	nonvar(Modified)
	    ->	format(Out, 'last_modified(~q).~n', [Modified])
	    ;	true
	    ),
	    close(Out)
	).
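
A minimal usage sketch (the URL below is hypothetical): the first call downloads the document into the cache directory, later calls answer from the cached copy without contacting the server.

    ?- url_cache('http://example.org/data.rdf', Path, MimeType).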

read_meta_file(MimeFile, Term) :-
	setup_call_cleanup(open(MimeFile, read, In,
				[ encoding(utf8),
				  lock(read)
				]),
			   ndet_read(In, Term),
			   close(In)).

ndet_read(Stream, Term) :-
	repeat,
	read(Stream, Term0),
	(   Term0 == end_of_file
	->  !, fail
	;   Term = Term0
	).
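
The terms enumerated by read_meta_file/2 are exactly the ones written by url_cache/3 above. An illustrative <hash>.url.meta file (all values are made up) therefore contains:

    mime_type('application/rdf+xml').
    url('http://example.org/data.rdf').
    fetched(1291100000).
    last_modified('Mon, 29 Nov 2010 10:00:00 GMT').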
 url_cache_delete(+URL) is det
Delete a URL from the cache. Succeeds even if the cache files do not exist.
Errors
- Throws exceptions from delete_file/1 other than existence errors.
url_cache_delete(URL) :-
	url_cache_dir(Dir),
	url_cache_file(URL, Dir, url, Path),
	atom_concat(Path, '.meta', TypeFile),
	catch(delete_file(TypeFile), E0, true),
	catch(delete_file(Path), E1, true),
	error_ok(E0),
	error_ok(E1).

error_ok(E) :-			% unbound: delete_file/1 succeeded
	var(E), !.
error_ok(E) :-			% missing file: acceptable
	subsumes_term(error(existence_error(file, _), _), E), !.
error_ok(E) :-
	throw(E).
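
For example, dropping a single (hypothetical) URL from the cache:

    ?- url_cache_delete('http://example.org/data.rdf').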
 url_cache_dir(-Dir) is det
Return the URL caching directory, creating it if it does not yet exist.
url_cache_dir(Dir) :-
	setting(cache:url_cache_directory, Dir),
	make_directory_path(Dir).
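
The location is controlled by the cache:url_cache_directory setting declared above. A host application can relocate the cache before the first fetch using library(settings); the path below is only an example:

    ?- set_setting(cache:url_cache_directory, '/var/cache/cliopatria/url').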
 make_directory_path(+Dir) is det
Create Dir and all required components.
make_directory_path(Dir) :-
	make_directory_path_2(Dir), !.
make_directory_path(Dir) :-
	permission_error(create, directory, Dir).

make_directory_path_2(Dir) :-
	exists_directory(Dir), !.
make_directory_path_2(Dir) :-
	Dir \== (/), !,
	file_directory_name(Dir, Parent),
	make_directory_path_2(Parent),
	make_directory(Dir).
 fetch_url(+URL:atom, +Path:atom, -MimeType:atom, -Modified) is det
Fetch URL into the file Path, unless the health status of the server (see below) indicates it should be avoided.
Errors
- existence_error(url, URL)
fetch_url(URL, File, MimeType, Modified) :-
	parse_url_ex(URL, Parts),
	server(Parts, Server),
	(   allow(Server)
	->  true
	;   throw(error(existence_error(url, URL),
			context(url_cache/3, 'Too many errors from server')))
	),
	get_time(Now),
	(   catch(fetch_url_raw(URL, File,
				MimeType, Modified), E, true)
	->  (   var(E)
	    ->	register_stats(Server, Now, true)
	    ;	register_stats(Server, Now, error(E)),
		throw(E)
	    )
	;   register_stats(Server, Now, false)
	).

server(Parts, Server) :-
	memberchk(host(Host), Parts), !,
	(   memberchk(port(Port), Parts)
	->  Server = Host:Port
	;   Server = Host
	).
server(_,_) :-
	assertion(false).
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Server status assessment.  We keep a health status per server using the
following rules:

    * Range -100 .. 100
    * Ok if > 0
    * The initial status is 100 (healthy)
    * Positive results add 20-4*sqrt(Time)
    * Negative results subtract 10 plus the time the failing request took
    * Add 1 per minute since the last status update.

TBD: frequency matters: requests should not pile up.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

:- dynamic
	server_status/3.		% Server, Status, Last

allow(Server) :-
	server_status(Server, Status),
	debug(url_cache, 'Status ~q: ~w', [Server, Status]),
	Status > 0.

server_status(Server, Status) :-
	get_time(Now),
	with_mutex(url_cache_status,
		   server_status(Server, S0, T0)), !,
	Status is min(100, S0 + round(Now-T0)//60).
server_status(_, 100).

register_stats(Server, Start, Result) :-
	get_time(Now),
	Time is Now - Start,
	(   server_status(Server, S0, T0)
	->  true
	;   S0 = 100,
	    T0 = Now
	),
	Since is Start - T0,
	update_status(Result, Time, Since, S0, S1),
	with_mutex(url_cache_status,
		   (   retractall(server_status(Server, _, _)),
		       assert(server_status(Server, S1, Start)))).

update_status(true, Time, Since, S0, S) :- !,
	S is min(100, S0 + round(20-4*sqrt(Time)) + round(Since)//60).
update_status(_, Time, _Since, S0, S) :- !,
	S is max(-100, S0 - (10 + round(Time))).
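
Two worked examples of the scoring (illustrative numbers, evaluated inside this module): a successful fetch that took 4 seconds, 120 seconds after the previous update of a score of 50, and a failed fetch of 3 seconds from the same starting point.

    ?- update_status(true, 4, 120, 50, S).
    S = 64.		% min(100, 50 + round(20-4*sqrt(4)) + 120//60)

    ?- update_status(error(timeout), 3, 120, 50, S).
    S = 37.		% max(-100, 50 - (10 + round(3)))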
 url_cache_reset_server_status is det
 url_cache_reset_server_status(+Server) is det
Reset the status of the given server or all servers.
url_cache_reset_server_status :-
	with_mutex(url_cache_status,
		   retractall(server_status(_,_,_))).
url_cache_reset_server_status(Server) :-
	must_be(atom, Server),
	with_mutex(url_cache_status,
		   retractall(server_status(Server,_,_))).
 fetch_url_raw(+URL:atom, +Path:atom, -MimeType:atom, -Modified) is det
Fetch data from URL and put it into the file Path. MimeType is unified with the MIME type reported by the server, or text/plain if the server did not provide one.
Errors
- existence_error(url, URL)
fetch_url_raw(URL, File, MimeType, Modified) :-
	debug(url_cache, 'Downloading ~w ...', [URL]),
	atom_concat(File, '.tmp', TmpFile),
	(   catch(fetch_to_file(URL, TmpFile, Code, Header), E, true)
	->  true
	;   E = predicate_failed(http_get/3)
	),
	(   var(E)
	->  true
	;   (   debugging(url_cache)
	    ->	print_message(error, E)
	    ;	true
	    ),
	    catch(delete_file(TmpFile), _, true),
	    (	debugging(url_cache)
	    ->	message_to_string(E, Msg),
		debug(url_cache, 'Download failed: ~w', [Msg])
	    ;	true
	    ),
	    throw(E)
	),
	(   Code == 200
	->  rename_file(TmpFile, File)
	;   catch(delete_file(TmpFile), _, true),
	    throw(error(existence_error(url, URL), _))
	),
	(   memberchk(content_type(MimeType0), Header)
	->  true
	;   MimeType0 = 'text/plain'
	),
	ignore(memberchk(last_modified(Modified), Header)),
	debug(url_cache, 'Downloaded ~w, mime-type: ~w',
	      [URL, MimeType0]),
	MimeType = MimeType0.

fetch_to_file(URL, File, Code,
	      [ content_type(ContentType),
		last_modified(LastModified)
	      ]) :-
	setup_call_cleanup(
	    open(File, write, Out, [ type(binary) ]),
	    setup_call_cleanup(
		http_open(URL, In,
			  [ header(content_type, ContentType),
			    header(last_modified, LastModified),
			    status_code(Code),
			    cert_verify_hook(ssl_verify)
			  ]),
		copy_stream_data(In, Out),
		close(In)),
	    close(Out)).

:- public ssl_verify/5.
 ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error)
Currently we accept all certificates.
ssl_verify(_SSL,
	   _ProblemCertificate, _AllCertificates, _FirstCertificate,
	   _Error).

parse_url_ex(URL, Parts) :-
	is_list(URL), !,
	Parts = URL.
parse_url_ex(URL, Parts) :-
	parse_url(URL, Parts), !.
parse_url_ex(URL, _) :-
	domain_error(url, URL).
 url_cache_file(+URL, +Dir, +Ext, -Path) is det
Determine the location of the cache file for the given URL in Dir. The returned Path is ensured to have the extension Ext.
url_cache_file(URL, Dir, Ext, Path) :-
	url_to_file(URL, Ext, File),
	sub_atom(File, 0, 2, _, L1),
	ensure_dir(Dir, L1, Dir1),
	sub_atom(File, 2, 2, _, L2),
	ensure_dir(Dir1, L2, Dir2),
	sub_atom(File, 4, _, 0, LocalFile),
	atomic_list_concat([Dir2, /, LocalFile], Path).

ensure_dir(D0, Sub, Dir) :-
	atomic_list_concat([D0, /, Sub], Dir),
	(   exists_directory(Dir)
	->  true
	;   make_directory(Dir)
	).
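
Putting url_cache_file/4 and url_to_file/3 (below) together: the URL is hashed to 40 hex digits, the first two pairs of digits become two directory levels, and the remainder plus the extension becomes the file name. A sketch of the resulting layout, with made-up hex digits:

    ?- url_cache_file('http://example.org/data.rdf', 'cache/url', url, Path).
    Path = 'cache/url/ab/cd/<remaining 36 hex digits>.url'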
 url_to_file(+URL, +Ext, -File) is det
File is a filename for storing URL and has extension Ext. We use a cryptographic (SHA-1) hash to obtain a consistent name that is guaranteed to fit in every sensible filesystem and gives a good distribution over the cache directories.
url_to_file(URL, Ext, File) :-
	sha_hash(URL, Hash, []),
	phrase(hex_digits(Hash), Codes),
	string_to_list(String, Codes),
	file_name_extension(String, Ext, File).

hex_digits([]) -->
	"".
hex_digits([H|T]) -->
	byte(H),
	hex_digits(T).

byte(Byte) -->
	{ High is (Byte>>4) /\ 0xf,
	  Low is (Byte /\ 0xf),
	  code_type(H, xdigit(High)),
	  code_type(L, xdigit(Low))
	},
	[H,L].


		 /*******************************
		 *	    READ CACHE		*
		 *******************************/
 url_cached(?URL, ?Property) is nondet
 url_cached(+Dir, ?URL, ?Property) is nondet
True if URL is in the cache represented by the directory Dir and has Property. Defined properties are:
- file(-File)
  File is the cache file for the given URL
- mime_type(-Mime)
  Mime is the mime-type of the URL as reported by the server
- fetched(-Stamp:integer)
  Timestamp that specifies when the URL was fetched
- last_modified(-Modified:atom)
  If present, the modification time as provided by the server.
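
For example, enumerating every cached URL together with its cache file (the bindings are of course hypothetical):

    ?- url_cached(URL, file(File)).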
url_cached(URL, Property) :-
	url_cache_dir(Dir),
	url_cached(Dir, URL, Property).

url_cached(Dir, URL, Property) :-
	nonvar(URL), !,
	url_cache_file(URL, Dir, url, Path),
	atom_concat(Path, '.meta', MetaFile),
	exists_file(MetaFile),
	cache_file_property(Property, MetaFile).
url_cached(Dir, URL, Property) :-
	nonvar(Property),
	Property = file(File),
	atom(File),
	atom_concat(Dir, Rest, File),
	\+ sub_atom(Rest, _, _, _, '../'),
	file_name_extension(_Base, url, File),
	atom_concat(File, '.meta', MetaFile),	% meta file is <file>.meta
	exists_file(MetaFile),
	once(read_meta_file(MetaFile, url(URL))).
url_cached(Dir, URL, Property) :-
	atom_concat(Dir, '/??', TopPat),
	expand_file_name(TopPat, TopDirs),
	member(TopDir, TopDirs),
	atom_concat(TopDir, '/??', DirPat),
	expand_file_name(DirPat, FileDirs),
	member(FileDir, FileDirs),
	atom_concat(FileDir, '/*.meta', FilePat),
	expand_file_name(FilePat, MetaFiles),
	member(MetaFile, MetaFiles),
	once(read_meta_file(MetaFile, url(URL))),
	check_cache_file(MetaFile, URL),
	cache_file_property(Property, MetaFile).

check_cache_file(MetaFile, URL) :-
	file_name_extension(File, meta, MetaFile),
	(   exists_file(File)
	->  true
	;   print_message(warning, url_cache(no_file(File, MetaFile, URL))),
	    delete_file(MetaFile),
	    fail
	).

cache_file_property(Property, MetaFile) :-
	var(Property), !,
	cache_file_property_ndet(Property, MetaFile).
cache_file_property(Property, MetaFile) :-
	cache_file_property_ndet(Property, MetaFile), !.


cache_file_property_ndet(file(File), MetaFile) :-
	file_name_extension(File, meta, MetaFile).
cache_file_property_ndet(P, MetaFile) :-
	read_meta_file(MetaFile, P),
	P \= url(_).
  468
  469		 /*******************************
  470		 *	     MESSAGES		*
  471		 *******************************/
  472
  473:- multifile
  474	prolog:message//1.  475
  476prolog:message(url_cache(no_file(File, _MetaFile, URL))) -->
  477	[ 'URL Cache: file ~q does not exist (URL=~q)'-[File, URL] ]