30
31:- module(url_cache,
32 [ url_cache/3, 33 url_cache_file/4, 34 url_cache_delete/1, 35 url_cached/2, 36 url_cached/3, 37 url_cache_reset_server_status/0,
38 url_cache_reset_server_status/1 39 ]). 40:- use_module(library(http/http_open)). 41:- if(exists_source(library(http/http_ssl_plugin))). 42:- use_module(library(http/http_ssl_plugin)). 43:- endif. 44:- use_module(library(http/mimetype)). 45:- use_module(library(url)). 46:- use_module(library(debug)). 47:- use_module(library(error)). 48:- use_module(library(settings)). 49:- use_module(library(base64)). 50:- use_module(library(utf8)). 51:- use_module(library(lists)). 52:- use_module(library(sha)). 53
54:- setting(cache:url_cache_directory, atom, 'cache/url',
55 'Directory to cache fetched remote URLs'). 56
67
68
79
80url_cache(URL, Path, MimeType) :-
81 url_cache_dir(Dir),
82 url_cache_file(URL, Dir, url, Path),
83 atom_concat(Path, '.meta', TypeFile),
84 ( exists_file(Path),
85 exists_file(TypeFile),
86 read_meta_file(TypeFile, mime_type(MimeType0))
87 -> MimeType = MimeType0
88 ; fetch_url(URL, Path, MimeType, Modified),
89 get_time(NowF),
90 Now is round(NowF),
91 open(TypeFile, write, Out,
92 [ encoding(utf8),
93 lock(write)
94 ]),
95 format(Out,
96 'mime_type(~q).~n\c
97 url(~q).~n\c
98 fetched(~q).~n',
99 [MimeType, URL, Now]),
100 ( nonvar(Modified)
101 -> format(Out, 'last_modified(~q).~n', [Modified])
102 ; true
103 ),
104 close(Out)
105 ).
106
107read_meta_file(MimeFile, Term) :-
108 setup_call_cleanup(open(MimeFile, read, In,
109 [ encoding(utf8),
110 lock(read)
111 ]),
112 ndet_read(In, Term),
113 close(In)).
114
115ndet_read(Stream, Term) :-
116 repeat,
117 read(Stream, Term0),
118 ( Term0 == end_of_file
119 -> !, fail
120 ; Term = Term0
121 ).
122
130
131url_cache_delete(URL) :-
132 url_cache_dir(Dir),
133 url_cache_file(URL, Dir, url, Path),
134 atom_concat(Path, '.meta', TypeFile),
135 catch(delete_file(TypeFile), E0, true),
136 catch(delete_file(Path), E1, true),
137 error_ok(E0),
138 error_ok(E1).
139
140error_ok(E) :-
141 subsumes_term(error(existence_error(file, _), _), E), !.
142error_ok(E) :-
143 throw(E).
144
148
149url_cache_dir(Dir) :-
150 setting(cache:url_cache_directory, Dir),
151 make_directory_path(Dir).
152
156
157make_directory_path(Dir) :-
158 make_directory_path_2(Dir), !.
159make_directory_path(Dir) :-
160 permission_error(create, directory, Dir).
161
162make_directory_path_2(Dir) :-
163 exists_directory(Dir), !.
164make_directory_path_2(Dir) :-
165 Dir \== (/), !,
166 file_directory_name(Dir, Parent),
167 make_directory_path_2(Parent),
168 make_directory(Dir).
169
173
174fetch_url(URL, File, MimeType, Modified) :-
175 parse_url_ex(URL, Parts),
176 server(Parts, Server),
177 ( allow(Server)
178 -> true
179 ; throw(error(existence_error(url, URL),
180 context(url_cache/3, 'Too many errors from server')))
181 ),
182 get_time(Now),
183 ( catch(fetch_url_raw(URL, File,
184 MimeType, Modified), E, true)
185 -> ( var(E)
186 -> register_stats(Server, Now, true)
187 ; register_stats(Server, Now, error(E)),
188 throw(E)
189 )
190 ; register_stats(Server, Now, false)
191 ).
192
193server(Parts, Server) :-
194 memberchk(host(Host), Parts), !,
195 ( memberchk(port(Port), Parts)
196 -> Server = Host:Port
197 ; Server = Host
198 ).
199server(_,_) :-
200 assertion(false).
201
215
216:- dynamic
217 server_status/3. 218
219allow(Server) :-
220 server_status(Server, Status),
221 debug(url_cache, 'Status ~q: ~w', [Server, Status]),
222 Status > 0.
223
224server_status(Server, Status) :-
225 get_time(Now),
226 with_mutex(url_cache_status,
227 server_status(Server, S0, T0)), !,
228 Status is min(100, S0 + round(Now-T0)//60).
229server_status(_, 100).
230
231register_stats(Server, Start, Result) :-
232 get_time(Now),
233 Time is Now - Start,
234 ( server_status(Server, S0, T0)
235 -> true
236 ; S0 = 100,
237 T0 = Now
238 ),
239 Since is Start - T0,
240 update_status(Result, Time, Since, S0, S1),
241 with_mutex(url_cache_status,
242 ( retractall(server_status(Server, _, _)),
243 assert(server_status(Server, S1, Start)))).
244
245update_status(true, Time, Since, S0, S) :- !,
246 S is min(100, S0 + round(20-4*sqrt(Time)) + round(Since)//60).
247update_status(_, Time, _Since, S0, S) :- !,
248 S is max(-100, S0 - (10 + round(Time))).
249
250
255
256url_cache_reset_server_status :-
257 with_mutex(url_cache_status,
258 retractall(server_status(_,_,_))).
259url_cache_reset_server_status(Server) :-
260 must_be(atom, Server),
261 with_mutex(url_cache_status,
262 retractall(server_status(Server,_,_))).
263
264
272
273fetch_url_raw(URL, File, MimeType, Modified) :-
274 debug(url_cache, 'Downloading ~w ...', [URL]),
275 atom_concat(File, '.tmp', TmpFile),
276 ( catch(fetch_to_file(URL, TmpFile, Code, Header), E, true)
277 -> true
278 ; E = predicate_failed(http_get/3)
279 ),
280 ( var(E)
281 -> true
282 ; ( debugging(url_cache)
283 -> print_message(error, E)
284 ; true
285 ),
286 catch(delete_file(TmpFile), _, true),
287 ( debugging(url_cache)
288 -> message_to_string(E, Msg),
289 debug(url_cache, 'Download failed: ~w', [Msg])
290 ; true
291 ),
292 throw(E)
293 ),
294 ( Code == 200
295 -> rename_file(TmpFile, File)
296 ; catch(delete_file(TmpFile), _, true),
297 throw(error(existence_error(url, URL), _))
298 ),
299 ( memberchk(content_type(MimeType0), Header)
300 -> true
301 ; MimeType0 = 'text/plain'
302 ),
303 ignore(memberchk(last_modified(Modified), Header)),
304 debug(url_cache, 'Downloaded ~w, mime-type: ~w',
305 [URL, MimeType0]),
306 MimeType = MimeType0.
307
308fetch_to_file(URL, File, Code,
309 [ content_type(ContentType),
310 last_modified(LastModified)
311 ]) :-
312 setup_call_cleanup(
313 open(File, write, Out, [ type(binary) ]),
314 setup_call_cleanup(
315 http_open(URL, In,
316 [ header(content_type, ContentType),
317 header(last_modified, LastModified),
318 status_code(Code),
319 cert_verify_hook(ssl_verify)
320 ]),
321 copy_stream_data(In, Out),
322 close(In)),
323 close(Out)).
324
325:- public ssl_verify/5. 326
330
331ssl_verify(_SSL,
332 _ProblemCertificate, _AllCertificates, _FirstCertificate,
333 _Error).
334
335parse_url_ex(URL, Parts) :-
336 is_list(URL), !,
337 Parts = URL.
338parse_url_ex(URL, Parts) :-
339 parse_url(URL, Parts), !.
340parse_url_ex(URL, _) :-
341 domain_error(url, URL).
342
348
349url_cache_file(URL, Dir, Ext, Path) :-
350 url_to_file(URL, Ext, File),
351 sub_atom(File, 0, 2, _, L1),
352 ensure_dir(Dir, L1, Dir1),
353 sub_atom(File, 2, 2, _, L2),
354 ensure_dir(Dir1, L2, Dir2),
355 sub_atom(File, 4, _, 0, LocalFile),
356 atomic_list_concat([Dir2, /, LocalFile], Path).
357
358ensure_dir(D0, Sub, Dir) :-
359 atomic_list_concat([D0, /, Sub], Dir),
360 ( exists_directory(Dir)
361 -> true
362 ; make_directory(Dir)
363 ).
364
371
372url_to_file(URL, Ext, File) :-
373 sha_hash(URL, Hash, []),
374 phrase(hex_digits(Hash), Codes),
375 string_to_list(String, Codes),
376 file_name_extension(String, Ext, File).
377
378hex_digits([]) -->
379 "".
380hex_digits([H|T]) -->
381 byte(H),
382 hex_digits(T).
383
384byte(Byte) -->
385 { High is (Byte>>4) /\ 0xf,
386 Low is (Byte /\ 0xf),
387 code_type(H, xdigit(High)),
388 code_type(L, xdigit(Low))
389 },
390 [H,L].
391
392
393 396
412
413url_cached(URL, Property) :-
414 url_cache_dir(Dir),
415 url_cached(Dir, URL, Property).
416
417url_cached(Dir, URL, Property) :-
418 nonvar(URL), !,
419 url_cache_file(URL, Dir, url, Path),
420 atom_concat(Path, '.meta', MetaFile),
421 exists_file(MetaFile),
422 cache_file_property(Property, MetaFile).
423url_cached(Dir, URL, Property) :-
424 nonvar(Property),
425 Property = file(File),
426 atom(File),
427 atom_concat(Dir, Rest, File),
428 \+ sub_atom(Rest, _, _, _, '../'),
429 file_name_extension(Base, url, File),
430 file_name_extension(Base, meta, MetaFile),
431 exists_file(MetaFile),
432 once(read_meta_file(MetaFile, url(URL))).
433url_cached(Dir, URL, Property) :-
434 atom_concat(Dir, '/??', TopPat),
435 expand_file_name(TopPat, TopDirs),
436 member(TopDir, TopDirs),
437 atom_concat(TopDir, '/??', DirPat),
438 expand_file_name(DirPat, FileDirs),
439 member(FileDir, FileDirs),
440 atom_concat(FileDir, '/*.meta', FilePat),
441 expand_file_name(FilePat, MetaFiles),
442 member(MetaFile, MetaFiles),
443 once(read_meta_file(MetaFile, url(URL))),
444 check_cache_file(MetaFile, URL),
445 cache_file_property(Property, MetaFile).
446
447check_cache_file(MetaFile, URL) :-
448 file_name_extension(File, meta, MetaFile),
449 ( exists_file(File)
450 -> true
451 ; print_message(warning, url_cache(no_file(File, MetaFile, URL))),
452 delete_file(MetaFile),
453 fail
454 ).
455
456cache_file_property(Property, MetaFile) :-
457 var(Property), !,
458 cache_file_property_ndet(Property, MetaFile).
459cache_file_property(Property, MetaFile) :-
460 cache_file_property_ndet(Property, MetaFile), !.
461
462
463cache_file_property_ndet(file(File), MetaFile) :-
464 file_name_extension(File, meta, MetaFile).
465cache_file_property_ndet(P, MetaFile) :-
466 read_meta_file(MetaFile, P),
467 P \= url(_).
468
469 472
473:- multifile
474 prolog:message//1. 475
476prolog:message(url_cache(no_file(File, _MetaFile, URL))) -->
477 [ 'URL Cache: file ~q does not exist (URL=~q)'-[File, URL] ]