1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2006-2015, University of Amsterdam 7 VU University Amsterdam 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in 19 the documentation and/or other materials provided with the 20 distribution. 21 22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 POSSIBILITY OF SUCH DAMAGE. 34*/ 35 36:- module(rdf_litindex, 37 [ rdf_set_literal_index_option/1, % +Options 38 rdf_tokenize_literal/2, % +Literal, -Tokens 39 rdf_find_literal/2, % +Spec, -Literal 40 rdf_find_literals/2, % +Spec, -ListOfLiterals 41 rdf_token_expansions/2, % +Spec, -Expansions 42 rdf_stopgap_token/1, % -Token 43 44 rdf_literal_index/2, % +Type, -Index 45 rdf_delete_literal_index/1 % +Type 46 ]). 47:- use_module(rdf_db). 48:- use_module(library(debug)). 49:- use_module(library(lists)). 
50:- use_module(library(error)). 51:- use_module(library(apply)). 52:- if(exists_source(library(snowball))). 53:- use_module(library(snowball)). 54:- else. 55:- use_module(library(porter_stem)). 56:- endif. 57:- use_module(library(double_metaphone)).
% Run-time state of the literal index.
:- dynamic literal_map/2.               % Type, -Map
:- dynamic map_building/2.              % Type, -Queue
:- dynamic new_token/2.                 % Hook
:- dynamic setting/1.
:- dynamic stopgap/1.

:- volatile literal_map/2.

% Hooks that may be defined by the application.
:- multifile tokenization/2.            % +Literal, -Tokens
:- multifile exclude_from_index/2.      % +Which, +Token


%   setting(?Option)
%
%   Default values of the options that can be changed through
%   rdf_set_literal_index_option/1.

setting(verbose(false)).                % print progress messages
setting(index_threads(1)).              % # threads for creating the index
setting(index(thread(1))).              % Use a thread for incremental updates
setting(stopgap_threshold(50000)).      % consider token a stopgap over N
%!  rdf_set_literal_index_option(+Options) is det.
%
%   Set options for the literal index.  Options is either a single
%   option term or a list of option terms.  Defined options are:
%
%     - verbose(+Bool)
%       If `true`, print progress messages while building the
%       index tables.
%     - index_threads(+Count)
%       Number of threads used for creating the index (a
%       non-negative integer).
%     - index(+How)
%       How new literals are indexed.  How is one of `self`
%       (execute in the same thread), thread(N) (execute
%       in N concurrent threads) or `default` (depends on number
%       of cores).
%     - stopgap_threshold(+Count)
%       Non-negative integer threshold above which a token is
%       considered a stopgap.
%
%   @error instantiation_error if Options or an option is unbound.
%   @error domain_error(literal_option, Option) for unknown options.

rdf_set_literal_index_option(Options) :-
    var(Options),                       % would otherwise unify with []
    !,
    instantiation_error(Options).
rdf_set_literal_index_option([]) :- !.
rdf_set_literal_index_option([H|T]) :-
    !,
    set_option(H),
    rdf_set_literal_index_option(T).
rdf_set_literal_index_option(Option) :-
    set_option(Option).

%   set_option(+Term)
%
%   Validate Term and replace the setting/1 fact that has the same
%   name and arity.

set_option(Term) :-
    check_option(Term),
    functor(Term, Name, Arity),
    functor(General, Name, Arity),
    retractall(setting(General)),
    assert(setting(Term)).

%   check_option(+Option)
%
%   Succeeds if Option is a valid option term and raises an error
%   otherwise.

check_option(X) :-
    var(X),
    !,
    instantiation_error(X).
check_option(verbose(X)) :-
    !,
    must_be(boolean, X).
check_option(index_threads(Count)) :-
    !,
    must_be(nonneg, Count).
check_option(stopgap_threshold(Count)) :-
    !,
    must_be(nonneg, Count).
check_option(index(How)) :-
    !,
    check_index_mode(How).
check_option(Option) :-
    domain_error(literal_option, Option).

%   check_index_mode(+How)
%
%   Validate the argument of the index(How) option.  thread(N) must
%   name at least one thread: with zero workers the monitor queue
%   would be filled but never read.

check_index_mode(How) :-
    var(How),
    !,
    instantiation_error(How).
check_index_mode(thread(N)) :-
    !,
    must_be(positive_integer, N).
check_index_mode(How) :-
    must_be(oneof([default,thread(_),self]), How).


                 /*******************************
                 *             QUERY            *
                 *******************************/
%!  rdf_find_literal(+Spec, -Literal) is nondet.
%!  rdf_find_literals(+Spec, -Literals) is semidet.
%
%   Find literals in the RDF database matching Spec.  Spec is defined
%   as:
%
%   ```
%   Spec ::= and(Spec,Spec)
%   Spec ::= or(Spec,Spec)
%   Spec ::= not(Spec)
%   Spec ::= sounds(Like)
%   Spec ::= stem(Like)             % same as stem(Like, en)
%   Spec ::= stem(Like, Lang)
%   Spec ::= prefix(Prefix)
%   Spec ::= between(Low, High)     % Numerical between
%   Spec ::= ge(High)               % Numerical greater-equal
%   Spec ::= le(Low)                % Numerical less-equal
%   Spec ::= Token
%   ```
%
%   sounds(Like) and stem(Like) both map to a disjunction. First we
%   compile the spec to normal form: a disjunction of conjunctions
%   on elementary tokens. Then we execute all the conjunctions and
%   generate the union using ordered-set algorithms.
%
%   Stopgaps are ignored. If the final result is only a stopgap,
%   the predicate fails.

rdf_find_literal(Spec, Literal) :-
    rdf_find_literals(Spec, Found),
    member(Literal, Found).

rdf_find_literals(Spec, Literals) :-
    compile_spec(Spec, Compiled),
    Compiled \== @(stopgap),            % nothing but a stopgap: fail
    token_index(TokenMap),
    lookup(Compiled, TokenMap, _Conditions, PerConjunction),
    flatten(PerConjunction, Unsorted),
    sort(Unsorted, Literals).           % union as an ordered set
%!  rdf_token_expansions(+Spec, -Expansions) is nondet.
%
%   Expansions describes which tokens the fuzzy parts of Spec expand
%   to.  The clauses for prefix(Prefix), sounds(Like) and stem(Like)
%   do not commit, so the generic last clause is also tried on
%   backtracking.  The generic clause compiles Spec, runs the lookup
%   and groups the conditions that produced a non-empty result by
%   their tag.

rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
    token_index(TokenMap),
    rdf_keys_in_literal_map(TokenMap, prefix(Prefix), Tokens).
rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
    metaphone_index(SoundMap),
    rdf_find_literal_map(SoundMap, [Like], Tokens).
rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
    stem_index(StemMap),
    rdf_find_literal_map(StemMap, [Like], Tokens).
rdf_token_expansions(Spec, Expansions) :-
    compile_spec(Spec, Compiled),
    token_index(TokenMap),
    lookup(Compiled, TokenMap, NestedConds, _Literals),
    flatten(NestedConds, Conds),
    sort(Conds, Ordered),
    join_expansions(Ordered, Expansions).

%   join_expansions(+Tagged, -Joined)
%
%   Merge consecutive tagged terms that share their tag into one
%   term whose last argument is the list of values.  E.g.
%   `[prefix(p,a), prefix(p,b)]` becomes `[prefix(p,[a,b])]`.

join_expansions([], []).
join_expansions([First|Rest0], [Joined|Rest]) :-
    untag(First, Tag, Value),
    Tag =.. TagArgs,
    append(TagArgs, [[Value|MoreValues]], JoinedArgs),
    Joined =.. JoinedArgs,
    join_expansions_by_tag(Rest0, Tag, Rest1, MoreValues),
    join_expansions(Rest1, Rest).

%   join_expansions_by_tag(+Tagged, +Tag, -Remaining, -Values)
%
%   Values are the values of the leading elements of Tagged that
%   carry Tag.  Remaining is the rest of the list.

join_expansions_by_tag([Elem|Rest0], Tag, Rest, [Value|Values]) :-
    untag(Elem, Tag, Value),
    !,
    join_expansions_by_tag(Rest0, Tag, Rest, Values).
join_expansions_by_tag(Rest, _, Rest, []).

%   lookup(+DNF, +Map, -Conditions, -Literals)
%
%   Run each conjunction of DNF against Map.  Conditions and
%   Literals are nested lists holding one element per conjunction.

lookup(@(false), _, [], []) :- !.
lookup(or(Left, Right), Map, [CondsL|CondsR], [LitsL|LitsR]) :-
    !,
    lookup(Left, Map, CondsL, LitsL),
    lookup(Right, Map, CondsR, LitsR).
lookup(Conj, Map, [Conds], [Lits]) :-
    lookup1(Conj, Map, Conds, Lits).

%   lookup1(+Conj, +Map, -Cond, -Literals)
%
%   Find the literals for a single conjunction.  Cond is the list of
%   tagged terms of Conj if there are results and [] if the lookup
%   gave no results.  If Conj holds @(false), Literals is [] and
%   Cond is left unbound.

lookup1(Conj, Map, Cond, Literals) :-
    phrase(conj_to_list(Conj), Keys),
    !,
    rdf_find_literal_map(Map, Keys, Literals),
    (   Literals == []
    ->  Cond = []
    ;   phrase(conj_to_cond(Conj), Cond)
    ).
lookup1(_, _, _, []).

%   conj_to_list(+Conj)//
%
%   Translate a conjunction into the list of keys to look up.  Tags
%   are removed.  Fails if the conjunction contains @(false).

conj_to_list(and(Left, Right)) -->
    !,
    conj_to_list(Left),
    conj_to_list(Right).
conj_to_list(@(false)) -->
    !,
    {fail}.
conj_to_list(Tagged) -->
    { untag(Tagged, Key) },
    !,
    [Key].
conj_to_list(Key) -->
    [Key].


%   conj_to_cond(+Conj)//
%
%   Collect the tagged terms of a conjunction; plain tokens are
%   skipped.

conj_to_cond(and(Left, Right)) -->
    !,
    conj_to_cond(Left),
    conj_to_cond(Right).
conj_to_cond(Tagged) -->
    { untag(Tagged, _) },
    !,
    [ Tagged ].
conj_to_cond(_) -->
    [].
%!  compile_spec(+Spec, -DNF)
%
%   Compile a search specification into disjunctive normal form:
%   fuzzy and numeric matches are first expanded into disjunctions of
%   tagged tokens (expand_fuzzy/2), the result is then passed through
%   nnf/2 and dnf/2.
%
%   @error instantiation_error if Spec is insufficiently instantiated.
%   @error type_error(boolean_expression, Term) if Spec holds a term
%          that is not part of the specification language.

compile_spec(Spec, DNF) :-
    expand_fuzzy(Spec, Spec2),
    nnf(Spec2, NNF),
    dnf(NNF, DNF).


%   expand_fuzzy(+Spec, -Expanded)
%
%   Replace sounds/1, stem/1,2, prefix/1, case/1, between/2, le/1 and
%   ge/1 by a disjunction of the matching tokens, each tagged with the
%   condition it originates from.  A condition without matching
%   tokens becomes @(false); a stopgap token becomes @(stopgap).

expand_fuzzy(Var, _) :-
    var(Var),
    !,
    instantiation_error(Var).
expand_fuzzy(sounds(Like), Or) :-
    !,
    (   atom(Like)
    ->  metaphone_index(Map),
        double_metaphone(Like, Key),
        rdf_find_literal_map(Map, [Key], Tokens),
        list_to_or(Tokens, sounds(Like), Or)
    ;   expand_fuzzy(Like, Or)
    ).
expand_fuzzy(stem(Like), Or) :-
    !,
    expand_fuzzy(stem(Like, en), Or).
expand_fuzzy(stem(Like, Lang), Or) :-
    !,
    (   atom(Like)
    ->  stem_index(Map),
        stem(Like, Lang, Key),
        rdf_find_literal_map(Map, [Key], Tokens),
        list_to_or(Tokens, stem(Like), Or)
    ;   expand_fuzzy(Like, Or)
    ).
expand_fuzzy(prefix(Prefix), Or) :-
    !,
    (   atom(Prefix)
    ->  token_index(Map),
        rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens),
        list_to_or(Tokens, prefix(Prefix), Or)
    ;   expand_fuzzy(Prefix, Or)
    ).
expand_fuzzy(case(String), Or) :-
    !,
    (   atom(String)
    ->  token_index(Map),
        rdf_keys_in_literal_map(Map, case(String), Tokens),
        list_to_or(Tokens, case(String), Or)
    ;   expand_fuzzy(String, Or)
    ).
expand_fuzzy(or(A0, B0), E) :-
    !,
    expand_fuzzy(A0, A),
    expand_fuzzy(B0, B),
    simplify(or(A,B), E).
expand_fuzzy(and(A0, B0), E) :-
    !,
    expand_fuzzy(A0, A),
    expand_fuzzy(B0, B),
    simplify(and(A,B), E).
expand_fuzzy(not(A0), not(A)) :-
    !,
    expand_fuzzy(A0, A).
expand_fuzzy(between(Low, High), Or) :-
    !,
    token_index(Map),
    rdf_keys_in_literal_map(Map, between(Low, High), Tokens),
    list_to_or(Tokens, between(Low, High), Or).
expand_fuzzy(le(High), Or) :-
    !,
    token_index(Map),
    rdf_keys_in_literal_map(Map, le(High), Tokens),
    list_to_or(Tokens, le(High), Or).
expand_fuzzy(ge(Low), Or) :-
    !,
    token_index(Map),
    rdf_keys_in_literal_map(Map, ge(Low), Tokens),
    list_to_or(Tokens, ge(Low), Or).
expand_fuzzy(Token, Result) :-
    atomic(Token),
    !,
    (   rdf_stopgap_token(Token)
    ->  Result = @(stopgap)
    ;   Result = Token
    ).
expand_fuzzy(Token, _) :-
    % ISO argument order: type_error(ExpectedType, Culprit)
    type_error(boolean_expression, Token).

%   simplify(+Expr0, -Expr)
%
%   Remove @(false) and @(stopgap) from a binary and/or term if one
%   of the rules of simple/2 applies; otherwise Expr is Expr0.

simplify(Expr0, Expr) :-
    simple(Expr0, Expr),
    !.
simplify(Expr, Expr).

simple(and(@(false), _), @(false)).
simple(and(_, @(false)), @(false)).
simple(and(@(stopgap), Token), Token).
simple(and(Token, @(stopgap)), Token).
simple(or(@(false), X), X).
simple(or(X, @(false)), X).
simple(or(@(stopgap), Token), Token).
simple(or(Token, @(stopgap)), Token).


%   list_to_or(+Tokens, +How, -Or)
%
%   Translate a list of tokens into a right-nested or/2 term of
%   tokens tagged by How.  The empty list maps to @(false).

list_to_or([], _, @(false)) :- !.
list_to_or([X], How, One) :-
    !,
    tag(How, X, One).
list_to_or([H0|T0], How, or(H, T)) :-
    tag(How, H0, H),
    list_to_or(T0, How, T).

%   tag(+Condition, +Token, -Tagged)
%
%   Tagged is Condition extended with Token as last argument.

tag(sounds(X),    Y, sounds(X,Y)).
tag(stem(X),      Y, stem(X,Y)).
tag(prefix(X),    Y, prefix(X,Y)).
tag(case(X),      Y, case(X,Y)).
tag(between(L,H), Y, between(L,H,Y)).
tag(ge(L),        Y, ge(L,Y)).
tag(le(H),        Y, le(H,Y)).

%   untag(+Tagged, -Token)
%
%   Token is the token carried by a tagged term.

untag(sounds(_,Y),    Y).
untag(stem(_,Y),      Y).
untag(prefix(_,Y),    Y).
untag(case(_,Y),      Y).
untag(between(_,_,Y), Y).
untag(le(_,Y),        Y).
untag(ge(_,Y),        Y).

%   untag(+Tagged, -Condition, -Token)
%
%   Split a tagged term into the originating condition and the token.

untag(sounds(X,Y),    sounds(X),    Y).
untag(stem(X,Y),      stem(X),      Y).
untag(prefix(X,Y),    prefix(X),    Y).
untag(case(X,Y),      case(X),      Y).
untag(between(L,H,Y), between(L,H), Y).
untag(ge(L,Y),        ge(L),        Y).
untag(le(H,Y),        le(H),        Y).
%!  nnf(+Formula, -NNF)
%
%   Rewrite Formula to Negation Normal Form, i.e. a formula in which
%   not/1 is only applied to elementary terms.  Double negation is
%   removed and negated conjunctions/disjunctions are rewritten using
%   De Morgan's laws.  The rewrite descends into the arguments of
%   and/2 and or/2, so negations nested below a conjunction or
%   disjunction are normalised as well.

nnf(not(not(A0)), A) :-
    !,
    nnf(A0, A).
nnf(not(and(A0,B0)), or(A,B)) :-
    !,
    nnf(not(A0), A),
    nnf(not(B0), B).
nnf(not(or(A0,B0)), and(A,B)) :-
    !,
    nnf(not(A0), A),
    nnf(not(B0), B).
nnf(and(A0,B0), and(A,B)) :-            % descend into sub formulas
    !,
    nnf(A0, A),
    nnf(B0, B).
nnf(or(A0,B0), or(A,B)) :-
    !,
    nnf(A0, A),
    nnf(B0, B).
nnf(A, A).
%!  dnf(+NNF, -DNF)
%
%   Rewrite a formula built from and/2 and or/2 into Disjunctive
%   Normal Form: or/2 terms whose leaves are conjunctions.

dnf(or(Left0, Right0), or(Left, Right)) :-
    !,
    dnf(Left0, Left),
    dnf(Right0, Right).
dnf(and(Left0, Right0), DNF):-
    !,
    dnf(Left0, Left),
    dnf(Right0, Right),
    dnf1(and(Left, Right), DNF).
dnf(Leaf, Leaf).

%   dnf1(+And, -DNF)
%
%   Distribute a conjunction over a disjunction in either of its
%   arguments: `and(X, or(Y,Z))` and `and(or(Y,Z), X)` both become
%   `or(and(X,Y), and(X,Z))`, applied recursively.

dnf1(and(Term, or(Alt1, Alt2)), or(Out1, Out2)) :-
    !,
    dnf1(and(Term, Alt1), Out1),
    dnf1(and(Term, Alt2), Out2).
dnf1(and(or(Alt1, Alt2), Term), or(Out1, Out2)) :-
    !,
    dnf1(and(Term, Alt1), Out1),
    dnf1(and(Term, Alt2), Out2).
dnf1(Conj, Conj).


                 /*******************************
                 *          TOKEN INDEX         *
                 *******************************/
%!  token_index(-Map) is det.
%
%   Get the literal map holding the token index.  If no such map is
%   registered, create it, install the updater that keeps it in sync
%   with the database and fill it from a detached thread.  In both
%   cases the call waits until the map is no longer marked as being
%   built.

token_index(Map) :-
    literal_map(token, Map),
    !,
    wait_for_map(token).
token_index(Map) :-
    rdf_new_literal_map(Map),
    assert(literal_map(token, Map)),
    register_token_updater,
    message_queue_create(Queue),
    assert(map_building(token, Queue)),
    ThreadOptions = [ alias('__rdf_tokenizer'),
                      detached(true)
                    ],
    thread_create(make_literal_index(Queue), _, ThreadOptions),
    wait_for_map(token).

%   register_token_updater
%
%   Register the rdf_monitor/2 callback that maintains the token
%   index.  With setting index(default) or index(thread(N)) new
%   literals are handed to worker threads (one for `default`, N for
%   thread(N)); otherwise literals are processed by monitor_literal/1
%   in the calling thread.

register_token_updater :-
    Events = [ reset,
               new_literal,
               old_literal
             ],
    (   update_thread_count(Threads)
    ->  create_update_literal_thread(Threads),
        rdf_monitor(thread_monitor_literal, Events)
    ;   rdf_monitor(monitor_literal, Events)
    ).

%   update_thread_count(-Threads)
%
%   Number of update threads implied by the index/1 setting.  Fails
%   if updates are not threaded.

update_thread_count(1) :-
    setting(index(default)),
    !.
update_thread_count(Threads) :-
    setting(index(thread(Threads))).

%   make_literal_index(+Queue)
%
%   Fill the token index.  When done (also on failure or error),
%   destroy Queue and retract the map_building/2 marker, which
%   releases the threads blocked in wait_for_map/1.

make_literal_index(Queue) :-
    setup_call_cleanup(
        true,
        make_literal_index,
        ( message_queue_destroy(Queue),
          retractall(map_building(token, _))
        )).
%!  make_literal_index
%
%   Create the initial token index from all literals currently in the
%   database.  Uses the number of threads given by the setting
%   index_threads(N), or the Prolog flag `cpu_count` if that setting
%   is not present.

make_literal_index :-
    setting(index_threads(N)),
    !,
    threaded_literal_index(N),
    verbose('~N', []).
make_literal_index :-
    current_prolog_flag(cpu_count, X),
    threaded_literal_index(X),
    verbose('~N', []).

%   threaded_literal_index(+N)
%
%   Register all literals.  If N > 1, the literals are distributed
%   over N worker threads through a bounded message queue and the
%   call waits for all workers to finish.  The queue is destroyed
%   afterwards, also on error.  Otherwise the literals are
%   registered in the calling thread.

threaded_literal_index(N) :-
    N > 1,
    !,
    setup_call_cleanup(
        message_queue_create(Q, [max_size(1000)]),
        ( create_index_threads(N, Q, Ids),
          forall(rdf_current_literal(Literal),
                 thread_send_message(Q, Literal)),
          % one end marker per worker
          forall(between(1, N, _),
                 thread_send_message(Q, done(true))),
          maplist(thread_join, Ids, _)
        ),
        message_queue_destroy(Q)).
threaded_literal_index(_) :-
    forall(rdf_current_literal(Literal),
           register_literal(Literal)).

%   create_index_threads(+N, +Queue, -Ids)
%
%   Start N threads running index_worker/1 on Queue.

create_index_threads(N, Q, [Id|T]) :-
    N > 0,
    !,
    thread_create(index_worker(Q), Id, []),
    N2 is N - 1,
    create_index_threads(N2, Q, T).
create_index_threads(_, _, []) :- !.

%   index_worker(+Queue)
%
%   Failure-driven loop that registers the literals read from Queue
%   until the message done(true) arrives.

index_worker(Queue) :-
    repeat,
        thread_get_message(Queue, Msg),
        work(Msg).

work(done(true)) :- !.
work(Literal) :-
    register_literal(Literal),
    fail.                               % back into repeat/0
%!  clean_token_index
%
%   Reset all registered literal maps (of any type) and forget the
%   dynamically discovered stopgap tokens.

clean_token_index :-
    \+ ( literal_map(_, Map),
         \+ rdf_reset_literal_map(Map)
       ),
    retractall(stopgap(_)).
%!  rdf_delete_literal_index(+Type) is semidet.
%
%   Remove the registration of the literal index Type and reset the
%   associated map.  Fails if no index of Type is registered.
%
%   @error type_error(atom, Type) if Type is not an atom.

rdf_delete_literal_index(Type) :-
    must_be(atom, Type),
    once(retract(literal_map(Type, Map))),
    rdf_reset_literal_map(Map).         % destroy is unsafe

                 /*******************************
                 *        THREADED UPDATE       *
                 *******************************/
%!  create_update_literal_thread(+Threads) is det.
%
%   Create the message queue `rdf_literal_monitor_queue` and start
%   Threads worker threads that register the literals posted to this
%   queue.
%
%   Background from the original notes: when loading through
%   rdf_persistency.pl, most of the time is spent updating the
%   literal token database.  While loading the RDF triples, most of
%   the time is spent in updating the AVL tree holding the literals.
%   Updating the token index hangs on updating the AVL trees holding
%   the tokens.  Both tasks however can run concurrently.

create_update_literal_thread(Threads) :-
    QueueOptions = [ alias(rdf_literal_monitor_queue),
                     max_size(50000)
                   ],
    message_queue_create(_, QueueOptions),
    forall(between(1, Threads, _),
           create_index_worker(initial)).

:- dynamic
    index_worker_id/1,
    extra_worker_count/1.

%   create_index_worker(+Status)
%
%   Start a worker thread with alias `rdf_literal_monitor_<Id>`.
%   Status is `initial` for the permanent workers and `extra` for
%   workers that terminate once the queue stays empty.

create_index_worker(Status) :-
    next_index_worker_id(Id),
    atom_concat(rdf_literal_monitor_, Id, Alias),
    inc_extra_worker_count(Status),
    thread_create(monitor_literals(Status), _,
                  [ alias(Alias)
                  ]).

%   next_index_worker_id(-Id)
%
%   Id is the next free worker number, starting at 1.  The successor
%   is stored for the next call.

next_index_worker_id(Id) :-
    (   retract(index_worker_id(Id))
    ->  true
    ;   Id = 1
    ),
    succ(Id, Next),
    assertz(index_worker_id(Next)).

%   monitor_literals(+Status)
%
%   Body of the worker threads.  An `initial` worker loops forever;
%   an `extra` worker stops after waiting one second for a message
%   without getting one, then decrements the extra worker count and
%   detaches itself.

monitor_literals(initial) :-
    set_prolog_flag(agc_margin, 0),     % we don't create garbage
    repeat,
        thread_get_message(rdf_literal_monitor_queue, Literal),
        register_literal(Literal),
        fail.
monitor_literals(extra) :-
    set_prolog_flag(agc_margin, 0),
    repeat,
        (   thread_get_message(rdf_literal_monitor_queue, Literal,
                               [ timeout(1)
                               ])
        ->  register_literal(Literal),
            fail
        ;   !                           % timeout: leave the loop
        ),
    with_mutex(create_index_worker, dec_extra_worker_count),
    thread_self(Self),
    thread_detach(Self).

%   thread_monitor_literal(+Event)
%
%   rdf_monitor/2 callback for threaded updates: new literals are
%   posted to the worker queue, all other events are handled
%   directly by monitor_literal/1.

thread_monitor_literal(new_literal(Literal)) :-
    !,
    thread_send_message(rdf_literal_monitor_queue, Literal).
thread_monitor_literal(Event) :-
    monitor_literal(Event).
%!  check_index_workers(+Alias, +Keys) is det.
%
%   Start an additional (`extra`) index worker if the message queue
%   named Alias holds more than 10,000 messages and the number of
%   extra workers is below max_extra_workers/1.  Keys is only used
%   for the debug message.  Always succeeds.

check_index_workers(Alias, Keys) :-
    max_extra_workers(Limit),
    Limit > 0,
    message_queue_property(Queue, alias(Alias)),
    message_queue_property(Queue, size(Backlog)),
    Backlog > 10000,
    \+ ( extra_worker_count(Running),
         Running >= Limit
       ),
    !,
    debug(rdf_litindex,
          'Creating extra literal indexer (Queue=~D, Keys=~D)',
          [Backlog, Keys]),
    with_mutex(create_index_worker, create_index_worker(extra)).
check_index_workers(_, _).

%   inc_extra_worker_count(+Status)
%
%   Increment the extra worker counter if Status is `extra`.

inc_extra_worker_count(extra) :-
    !,
    (   retract(extra_worker_count(Old))
    ->  New is Old+1
    ;   New = 1
    ),
    asserta(extra_worker_count(New)).
inc_extra_worker_count(_).

%   dec_extra_worker_count
%
%   Decrement the extra worker counter, if there is one.

dec_extra_worker_count :-
    retract(extra_worker_count(Old)),
    !,
    New is Old-1,
    asserta(extra_worker_count(New)).
dec_extra_worker_count.

%   max_extra_workers(-Max)
%
%   Max is half the value of the Prolog flag `cpu_count`, rounded
%   down.

max_extra_workers(Max) :-
    current_prolog_flag(cpu_count, Cores),
    Max is Cores//2.


                 /*******************************
                 *       MONITORED UPDATE       *
                 *******************************/

%   monitor_literal(+Event)
%
%   rdf_monitor/2 callback that keeps the token index in sync with
%   the database.  During a `reset` transaction the old_literal
%   events are disabled and all indexes are cleaned at once.

monitor_literal(new_literal(Literal)) :-
    register_literal(Literal).
monitor_literal(old_literal(Literal)) :-
    unregister_literal(Literal).
monitor_literal(transaction(begin, reset)) :-
    rdf_monitor(monitor_literal, [-old_literal]),
    clean_token_index.
monitor_literal(transaction(end, reset)) :-
    rdf_monitor(monitor_literal, [+old_literal]).
%!  register_literal(+Literal)
%
%   Add Literal to the token index: the text of Literal is stored
%   under each of its (unique) tokens.  Literals that cannot be
%   tokenized are silently ignored.

register_literal(Literal) :-
    (   rdf_tokenize_literal(Literal, Tokens0)
    ->  sort(Tokens0, Tokens),
        text_of(Literal, Lang, Text),
        literal_map(token, Map),
        add_tokens(Tokens, Lang, Text, Map)
    ;   true
    ).

%   add_tokens(+Tokens, +Lang, +Text, +Map)
%
%   Associate Text with each token in Map.  The last argument of
%   rdf_insert_literal_map/4 is bound to the key count if the token
%   is new in the map and left unbound otherwise.

add_tokens([], _, _, _).
add_tokens([Token|Tokens], Lang, Text, Map) :-
    rdf_insert_literal_map(Map, Token, Text, KeyCount),
    (   var(KeyCount)
    ->  check_stopgap(Token, Map)
    ;   new_key(Token, Lang, Map, KeyCount)
    ),
    add_tokens(Tokens, Lang, Text, Map).

%   check_stopgap(+Token, +Map)
%
%   Token was already known.  If the count reported for its key
%   exceeds the stopgap_threshold setting, record it as a stopgap and
%   delete it from Map.

check_stopgap(Token, Map) :-
    (   rdf_keys_in_literal_map(Map, key(Token), Count),
        setting(stopgap_threshold(Threshold)),
        Count > Threshold
    ->  assert(stopgap(Token)),
        rdf_delete_literal_map(Map, Token)
    ;   true
    ).

%   new_key(+Token, +Lang, +Map, +KeyCount)
%
%   Token is new: run all new_token/2 hooks, report progress every
%   1,000 keys and consider starting extra workers every 10,000 keys.

new_key(Token, Lang, Map, KeyCount) :-
    forall(new_token(Token, Lang), true),
    (   KeyCount mod 1000 =:= 0
    ->  progress(Map, 'Tokens'),
        (   KeyCount mod 10000 =:= 0
        ->  check_index_workers(rdf_literal_monitor_queue, KeyCount)
        ;   true
        )
    ;   true
    ).
%!  unregister_literal(+Literal)
%
%   Literal is removed from the database.  Unless the text of
%   Literal still appears as a literal in some triple, remove the
%   text from the token index for each of its tokens.

unregister_literal(Literal) :-
    text_of(Literal, _Lang, Text),
    (   rdf(_,_,literal(Text))
    ->  true                            % still something left
    ;   rdf_tokenize_literal(Literal, Tokens0),
        sort(Tokens0, Tokens),
        literal_map(token, Map),
        del_tokens(Tokens, Text, Map)
    ).

%   del_tokens(+Tokens, +Text, +Map)
%
%   Delete the association between each token and Text from Map.

del_tokens(Tokens, Text, Map) :-
    forall(member(Token, Tokens),
           rdf_delete_literal_map(Map, Token, Text)).
%!  rdf_tokenize_literal(+Literal, -Tokens) is semidet.
%
%   Tokenize a literal.  The hook tokenization/2 is tried first.  If
%   it fails and the text of Literal is an atom, the text is split
%   using tokenize_atom/2 and the tokens that must not be indexed are
%   removed.

rdf_tokenize_literal(Literal, Tokens) :-
    (   tokenization(Literal, Tokens)   % Hook
    ->  true
    ;   text_of(Literal, _Lang, Text),
        atom(Text),
        tokenize_atom(Text, Tokens0),
        select_tokens(Tokens0, Tokens)
    ).

%   select_tokens(+Tokens0, -Tokens)
%
%   Tokens holds the elements of Tokens0 that are to be indexed, in
%   their original order.

select_tokens([], []).
select_tokens([Token|Rest0], Selected) :-
    (   index_token(Token)
    ->  Selected = [Token|Rest],
        select_tokens(Rest0, Rest)
    ;   select_tokens(Rest0, Selected)
    ).

%   index_token(+Token)
%
%   True if Token must be indexed.  Tokens excluded by the hook
%   exclude_from_index/2 are dropped.  Of the numbers, only integers
%   in the range -1073741824..1073741823 are kept.  Other tokens are
%   dropped if they have length 1 or are a default or dynamically
%   discovered stopgap.

index_token(Token) :-
    \+ exclude_from_index(token, Token),
    (   number(Token)
    ->  integer(Token),
        between(-1073741824, 1073741823, Token)
    ;   \+ atom_length(Token, 1),
        \+ default_stopgap(Token),
        \+ stopgap(Token)
    ).
%!  rdf_stopgap_token(?Token) is nondet.
%
%   True when Token is a stopgap token.  Currently, this implies one
%   of:
%
%     - exclude_from_index(token, Token) is true
%     - default_stopgap(Token) is true
%     - Token is an atom of length 1
%     - Token is in the dynamic stopgap/1 table
%
%   If Token is instantiated the predicate succeeds at most once.

rdf_stopgap_token(Token) :-
    (   nonvar(Token)
    ->  once(rdf_stopgap_token2(Token))
    ;   rdf_stopgap_token2(Token)
    ).

rdf_stopgap_token2(Token) :-
    (   exclude_from_index(token, Token)
    ;   default_stopgap(Token)
    ;   atom(Token),
        atom_length(Token, 1)
    ;   stopgap(Token)
    ).
%!  default_stopgap(?Token) is nondet.
%
%   Built-in table of tokens that are never indexed.  It is consulted
%   by select_tokens/2 and rdf_stopgap_token/1.

default_stopgap(and).
default_stopgap(an).
default_stopgap(or).
default_stopgap(of).
default_stopgap(on).
default_stopgap(in).
default_stopgap(this).
default_stopgap(the).
%!  text_of(+LiteralArg, -Lang, -Text) is semidet.
%
%   Get the text (an atom) or integer from the argument of a
%   literal/1 term together with the language Lang that is passed on
%   to the stemmer.  `en` is used for plain atoms and for literals
%   typed `xsd:string`; `-` is used for other typed literals and for
%   integers.  Fails for any other term.
%
%   NOTE(review): the first clause matches the literal term
%   `xsd:string`; whether this prefix notation is expanded to the
%   full IRI depends on declarations not visible here -- confirm.

text_of(type(xsd:string, Value), en, Value) :- !.
text_of(type(_Type, Value), -, Value) :- !.
text_of(lang(Language, Value), Language, Value) :- !.
text_of(Atom, en, Atom) :-
    atom(Atom),
    !.
text_of(Int, -, Int) :-
    integer(Int).


                 /*******************************
                 *          STEM INDEX          *
                 *******************************/
%!  stem_index(-Map) is det.
%
%   Get the literal map that maps stems to tokens.  On first use the
%   map is created, a new_token/2 hook is installed to keep it up to
%   date and it is filled from the existing literals by a detached
%   thread.  The call waits until the map is no longer marked as
%   being built.

stem_index(Map) :-
    literal_map(stem, Map),
    !,
    wait_for_map(stem).
stem_index(Map) :-
    rdf_new_literal_map(Map),
    assert(literal_map(stem, Map)),
    assert((new_token(Token, Lang) :- add_stem(Token, Lang, Map))),
    message_queue_create(Queue),
    assert(map_building(stem, Queue)),
    thread_create(fill_stem_index(Map, Queue), _,
                  [ alias('__rdf_stemmer'),
                    detached(true)
                  ]),
    wait_for_map(stem).

%   wait_for_map(+MapName)
%
%   Block as long as map_building(MapName, Queue) holds.  We wait by
%   reading from Queue.  The builder destroys the queue when done;
%   the resulting exception is ignored and the state is re-checked.

wait_for_map(MapName) :-
    (   map_building(MapName, Queue)
    ->  catch(thread_get_message(Queue, _), _, true),
        wait_for_map(MapName)
    ;   true
    ).

%   fill_stem_index(+StemMap, +Queue)
%
%   Add the stems of the tokens of all literals to StemMap.  When
%   done (also on error) destroy Queue and retract the
%   map_building/2 marker.

fill_stem_index(StemMap, Queue) :-
    call_cleanup(
        forall(rdf_current_literal(Literal),
               stem_literal_tokens(Literal, StemMap)),
        ( message_queue_destroy(Queue),
          retractall(map_building(stem, _)))).

stem_literal_tokens(Literal, StemMap) :-
    rdf_tokenize_literal(Literal, Tokens),
    !,
    sort(Tokens, Tokens1),
    text_of(Literal, Lang, _Text),
    insert_tokens_stem(Tokens1, Lang, StemMap).
stem_literal_tokens(_,_).

%   insert_tokens_stem(+Tokens, +Lang, +Map)
%
%   Insert Stem -> Token into Map for each atom in Tokens that can be
%   stemmed.  Non-atom tokens are skipped.

insert_tokens_stem([], _, _).
insert_tokens_stem([Token|T], Lang, Map) :-
    (   atom(Token)
    ->  (   stem(Token, Lang, Stem)
        ->  rdf_insert_literal_map(Map, Stem, Token, Keys),
            (   integer(Keys),
                Keys mod 1000 =:= 0
            ->  progress(Map, 'Stem')
            ;   true
            )
        ;   true
        )
    ;   true
    ),
    insert_tokens_stem(T, Lang, Map).


%   add_stem(+Token, +Lang, +Map) is semidet.
%
%   new_token/2 hook body: add the stem of a newly seen Token to Map.
%   Only atoms are stemmed, as in insert_tokens_stem/3.  Note the
%   argument order of stem/3: Token first, then the language.

add_stem(Token, Lang, Map) :-
    atom(Token),
    stem(Token, Lang, Stem),
    rdf_insert_literal_map(Map, Stem, Token, _).

%   stem(+Token, +LangSpec, -Stem) is semidet.
%
%   Stem is the stem of the lowercase version of Token.  If
%   snowball/3 is available it is used with the main language of
%   LangSpec and the call fails if snowball/3 raises an exception.
%   Otherwise porter_stem/2 is used and LangSpec is ignored.

:- if(current_predicate(snowball/3)).
stem(Token, LangSpec, Stem) :-
    main_lang(LangSpec, Lang),
    downcase_atom(Token, Lower),
    catch(snowball(Lang, Lower, Stem), _, fail).
:- else.
stem(Token, _Lang, Stem) :-
    downcase_atom(Token, Lower),
    porter_stem(Lower, Stem).
:- endif.

%   main_lang(+LangSpec, -Lang)
%
%   Lang is the lowercase primary language of LangSpec, i.e. the
%   part before the first `-`: both `'en-GB'` and `'EN'` yield `en`.

main_lang(LangSpec, Lang) :-
    sub_atom(LangSpec, Before, _, _, -),
    !,
    sub_atom(LangSpec, 0, Before, _, Lang0),
    downcase_atom(Lang0, Lang).
main_lang(LangSpec, Lang) :-
    downcase_atom(LangSpec, Lang).


                 /*******************************
                 *        METAPHONE INDEX       *
                 *******************************/


%!  metaphone_index(-Map) is det.
%
%   Get the literal map that maps phonetic (double metaphone) keys
%   to tokens.  Created, hooked into new_token/2 and filled on first
%   use, analogous to stem_index/1.

metaphone_index(Map) :-
    literal_map(metaphone, Map),
    !,
    wait_for_map(metaphone).
metaphone_index(Map) :-
    rdf_new_literal_map(Map),
    assert(literal_map(metaphone, Map)),
    assert((new_token(Token, Lang) :- add_metaphone(Token, Lang, Map))),
    message_queue_create(Queue),
    assert(map_building(metaphone, Queue)),
    thread_create(fill_metaphone_index(Map, Queue), _,
                  [ alias('__rdf_metaphone_indexer'),
                    detached(true)
                  ]),
    wait_for_map(metaphone).

%   fill_metaphone_index(+MetaphoneMap, +Queue)
%
%   Fill MetaphoneMap from all keys of the token index.  When done
%   (also on error) destroy Queue and retract the map_building/2
%   marker.

fill_metaphone_index(MetaphoneMap, Queue) :-
    call_cleanup(
        fill_metaphone_index(MetaphoneMap),
        ( message_queue_destroy(Queue),
          retractall(map_building(metaphone, _)))).

fill_metaphone_index(MetaphoneMap) :-
    token_index(TokenMap),
    rdf_keys_in_literal_map(TokenMap, all, Tokens),
    metaphone(Tokens, MetaphoneMap).

%   metaphone(+Tokens, +Map)
%
%   Insert SoundEx -> Token into Map for each atom in Tokens for
%   which double_metaphone/2 succeeds.

metaphone([], _).
metaphone([Token|T], Map) :-
    (   atom(Token),
        double_metaphone(Token, SoundEx)
    ->  rdf_insert_literal_map(Map, SoundEx, Token, Keys),
        (   integer(Keys),
            Keys mod 1000 =:= 0
        ->  progress(Map, 'Metaphone')
        ;   true
        )
    ;   true
    ),
    metaphone(T, Map).


%   add_metaphone(+Token, +Lang, +Map)
%
%   new_token/2 hook body: add the phonetic key of a newly seen atom
%   Token to Map.  Succeeds without doing anything for non-atoms.

add_metaphone(Token, _Lang, Map) :-
    atom(Token),
    !,
    double_metaphone(Token, SoundEx),
    rdf_insert_literal_map(Map, SoundEx, Token).
add_metaphone(_, _, _).
%!  rdf_literal_index(+Type, -Index) is det.
%
%   True when Index is the literal map holding the index of Type.
%   The map is created on first use.  Type is one of:
%
%     - token
%       The `token` map maps tokens to full literal texts.
%     - stem
%       The `stem` map maps stemmed to full tokens.
%     - metaphone
%       The `metaphone` map maps phonetic keys to tokens.
%
%   @error domain_error(literal_index, Type) if Type is not one of
%          the above.

rdf_literal_index(token, Map) :-
    !,
    token_index(Map).
rdf_literal_index(stem, Map) :-
    !,
    stem_index(Map).
rdf_literal_index(metaphone, Map) :-
    !,
    metaphone_index(Map).
rdf_literal_index(Type, _Map) :-
    domain_error(literal_index, Type).


                 /*******************************
                 *             UTIL             *
                 *******************************/

%   verbose(+Format, +Args)
%
%   Print a message to `user_error` if the setting verbose(true) is
%   active.

verbose(Fmt, Args) :-
    setting(verbose(true)),
    !,
    format(user_error, Fmt, Args).
verbose(_, _).

%   progress(+Map, +Which)
%
%   If the setting verbose(true) is active, print the number of keys
%   and values of Map to `user_error`, labelled with Which.  The
%   output starts with a carriage return so that successive reports
%   overwrite each other.

progress(Map, Which) :-
    setting(verbose(true)),
    !,
    rdf_statistics_literal_map(Map, size(Keys, Values)),
    format(user_error,
           '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
           [Which, Keys, Values]).
progress(_,_).
/*  Search literals

This module finds literals of the RDF database based on words, stemming
and sounds like (metaphone).  The normal user-level predicate is
rdf_find_literals/2.
*/