View source with raw comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2006-2015, University of Amsterdam
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(rdf_litindex,
   37          [ rdf_set_literal_index_option/1,     % +Options
   38            rdf_tokenize_literal/2,             % +Literal, -Tokens
   39            rdf_find_literal/2,                 % +Spec, -Literal
   40            rdf_find_literals/2,                % +Spec, -ListOfLiterals
   41            rdf_token_expansions/2,             % +Spec, -Expansions
   42            rdf_stopgap_token/1,                % -Token
   43
   44            rdf_literal_index/2,                % +Type, -Index
   45            rdf_delete_literal_index/1          % +Type
   46          ]).   47:- use_module(rdf_db).   48:- use_module(library(debug)).   49:- use_module(library(lists)).   50:- use_module(library(error)).   51:- use_module(library(apply)).   52:- if(exists_source(library(snowball))).   53:- use_module(library(snowball)).   54:- else.   55:- use_module(library(porter_stem)).   56:- endif.   57:- use_module(library(double_metaphone)).

Search literals

This module finds literals of the RDF database based on words, stemming and sounds like (metaphone). The normal user-level predicate is

*/

   67:- dynamic
   68    literal_map/2,                  % Type, -Map
   69    map_building/2,                 % Type, -Queue
   70    new_token/2,                    % Hook
   71    setting/1,
   72    stopgap/1.   73:- volatile
   74    literal_map/2.   75:- multifile
   76    tokenization/2,                 % +Literal, -Tokens
   77    exclude_from_index/2.           % +Which, +Token
   78
   79
   80setting(verbose(false)).                % print progress messages
   81setting(index_threads(1)).              % # threads for creating the index
   82setting(index(thread(1))).              % Use a thread for incremental updates
   83setting(stopgap_threshold(50000)).      % consider token a stopgap over N
 rdf_set_literal_index_option(+Options:list)
Set options for the literal package. Currently defined options
verbose(Bool)
If true, print progress messages while building the index tables.
index_threads(+Count)
Number of threads to use for initial indexing of literals
index(+How)
How to deal with indexing new literals. How is one of self (execute in the same thread), thread(N) (execute in N concurrent threads) or default (depends on number of cores).
stopgap_threshold(+Count)
Add a token to the dynamic stopgap set if it appears in more than Count literals. The default is 50,000.
  107rdf_set_literal_index_option([]) :- !.
  108rdf_set_literal_index_option([H|T]) :-
  109    !,
  110    set_option(H),
  111    rdf_set_literal_index_option(T).
  112rdf_set_literal_index_option(Option) :-
  113    set_option(Option).
  114
  115set_option(Term) :-
  116    check_option(Term),
  117    functor(Term, Name, Arity),
  118    functor(General, Name, Arity),
  119    retractall(setting(General)),
  120    assert(setting(Term)).
  121
  122check_option(X) :-
  123    var(X),
  124    !,
  125    instantiation_error(X).
  126check_option(verbose(X)) :-
  127    !,
  128    must_be(boolean, X).
  129check_option(index_threads(Count)) :-
  130    !,
  131    must_be(nonneg, Count).
  132check_option(stopgap_threshold(Count)) :-
  133    !,
  134    must_be(nonneg, Count).
  135check_option(index(How)) :-
  136    !,
  137    must_be(oneof([default,thread(_),self]), How).
  138check_option(Option) :-
  139    domain_error(literal_option, Option).
  140
  141
  142                 /*******************************
  143                 *            QUERY             *
  144                 *******************************/
 rdf_find_literal(+Spec, -Literal) is nondet
 rdf_find_literals(+Spec, -Literals) is det
Find literals in the RDF database matching Spec. Spec is defined as:
Spec ::= and(Spec,Spec)
Spec ::= or(Spec,Spec)
Spec ::= not(Spec)
Spec ::= sounds(Like)
Spec ::= stem(Like)             % same as stem(Like, en)
Spec ::= stem(Like, Lang)
Spec ::= prefix(Prefix)
Spec ::= between(Low, High)     % Numerical between
Spec ::= ge(High)               % Numerical greater-equal
Spec ::= le(Low)                % Numerical less-equal
Spec ::= Token

sounds(Like) and stem(Like) both map to a disjunction. First we compile the spec to normal form: a disjunction of conjunctions on elementary tokens. Then we execute all the conjunctions and generate the union using ordered-set algorithms.

Stopgaps are ignored. If the final result is only a stopgap, the predicate fails.

To be done
- Exploit ordering of numbers and allow for > N, < N, etc.
  176rdf_find_literal(Spec, Literal) :-
  177    rdf_find_literals(Spec, Literals),
  178    member(Literal, Literals).
  179
  180rdf_find_literals(Spec, Literals) :-
  181    compile_spec(Spec, DNF),
  182    DNF \== @(stopgap),
  183    token_index(Map),
  184    lookup(DNF, Map, _, SuperSet),
  185    flatten(SuperSet, Set0),
  186    sort(Set0, Literals).
 rdf_token_expansions(+Spec, -Extensions)
Determine which extensions of a token contribute to finding literals.
  193rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
  194    token_index(Map),
  195    rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens).
  196rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
  197    metaphone_index(Map),
  198    rdf_find_literal_map(Map, [Like], Tokens).
  199rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
  200    stem_index(Map),
  201    rdf_find_literal_map(Map, [Like], Tokens).
  202rdf_token_expansions(Spec, Expansions) :-
  203    compile_spec(Spec, DNF),
  204    token_index(Map),
  205    lookup(DNF, Map, SCS, _),
  206    flatten(SCS, CS),
  207    sort(CS, Expansions0),
  208    join_expansions(Expansions0, Expansions).
  209
  210join_expansions([], []).
  211join_expansions([H0|T0], [H|T]) :-
  212    untag(H0, Tag, V0),
  213    Tag =.. L0,
  214    append(L0, [[V0|Values]], L1),
  215    H =.. L1,
  216    join_expansions_by_tag(T0, Tag, T1, Values),
  217    join_expansions(T1, T).
  218
  219join_expansions_by_tag([H|T0], Tag, T, [V0|VT]) :-
  220    untag(H, Tag, V0),
  221    !,
  222    join_expansions_by_tag(T0, Tag, T, VT).
  223join_expansions_by_tag(L, _, L, []).
  224
  225lookup(@(false), _, [], []) :- !.
  226lookup(or(H0,T0), Map, [CH|CT], [H|T]) :-
  227    !,
  228    lookup(H0, Map, CH, H),
  229    lookup(T0, Map, CT, T).
  230lookup(H0, Map, [C], [H]) :-
  231    lookup1(H0, Map, C, H).
  232
  233lookup1(Conj, Map, Cond, Literals) :-
  234    phrase(conj_to_list(Conj), List),
  235    !,
  236    rdf_find_literal_map(Map, List, Literals),
  237    (   Literals \== []
  238    ->  phrase(conj_to_cond(Conj), Cond)
  239    ;   Cond = []
  240    ).
  241lookup1(_, _, _, []).
  242
  243conj_to_list(and(A,B)) -->
  244    !,
  245    conj_to_list(A),
  246    conj_to_list(B).
  247conj_to_list(@(false)) -->
  248    !,
  249    {fail}.
  250conj_to_list(Tagged) -->
  251    { untag(Tagged, L) },
  252    !,
  253    [L].
  254conj_to_list(L) -->
  255    [L].
  256
  257
  258conj_to_cond(and(A,B)) -->
  259    !,
  260    conj_to_cond(A),
  261    conj_to_cond(B).
  262conj_to_cond(Tagged) -->
  263    { untag(Tagged, _) },
  264    !,
  265    [ Tagged ].
  266conj_to_cond(_) -->
  267    [].
 compile_spec(+Spec, -Compiled)
Compile a specification as above into disjunctive normal form
  274compile_spec(Spec, DNF) :-
  275    expand_fuzzy(Spec, Spec2),
  276    nnf(Spec2, NNF),
  277    dnf(NNF, DNF).
  278
  279
  280expand_fuzzy(Var, _) :-
  281    var(Var),
  282    !,
  283    throw(error(instantiation_error, _)).
  284expand_fuzzy(sounds(Like), Or) :-
  285    !,
  286    (   atom(Like)
  287    ->  metaphone_index(Map),
  288        double_metaphone(Like, Key),
  289        rdf_find_literal_map(Map, [Key], Tokens),
  290        list_to_or(Tokens, sounds(Like), Or)
  291    ;   expand_fuzzy(Like, Or)
  292    ).
  293expand_fuzzy(stem(Like), Or) :-
  294    !,
  295    expand_fuzzy(stem(Like, en), Or).
  296expand_fuzzy(stem(Like, Lang), Or) :-
  297    !,
  298    (   atom(Like)
  299    ->  stem_index(Map),
  300        stem(Like, Lang, Key),
  301        rdf_find_literal_map(Map, [Key], Tokens),
  302        list_to_or(Tokens, stem(Like), Or)
  303    ;   expand_fuzzy(Like, Or)
  304    ).
  305expand_fuzzy(prefix(Prefix), Or) :-
  306    !,
  307    (   atom(Prefix)
  308    ->  token_index(Map),
  309        rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens),
  310        list_to_or(Tokens, prefix(Prefix), Or)
  311    ;   expand_fuzzy(Prefix, Or)
  312    ).
  313expand_fuzzy(case(String), Or) :-
  314    !,
  315    (   atom(String)
  316    ->  token_index(Map),
  317        rdf_keys_in_literal_map(Map, case(String), Tokens),
  318        list_to_or(Tokens, case(String), Or)
  319    ;   expand_fuzzy(String, Or)
  320    ).
  321expand_fuzzy(or(A0, B0), E) :-
  322    !,
  323    expand_fuzzy(A0, A),
  324    expand_fuzzy(B0, B),
  325    simplify(or(A,B), E).
  326expand_fuzzy(and(A0, B0), E) :-
  327    !,
  328    expand_fuzzy(A0, A),
  329    expand_fuzzy(B0, B),
  330    simplify(and(A,B), E).
  331expand_fuzzy(not(A0), not(A)) :-
  332    !,
  333    expand_fuzzy(A0, A).
  334expand_fuzzy(between(Low, High), Or) :-
  335    !,
  336    token_index(Map),
  337    rdf_keys_in_literal_map(Map, between(Low, High), Tokens),
  338    list_to_or(Tokens, between(Low, High), Or).
  339expand_fuzzy(le(High), Or) :-
  340    !,
  341    token_index(Map),
  342    rdf_keys_in_literal_map(Map, le(High), Tokens),
  343    list_to_or(Tokens, le(High), Or).
  344expand_fuzzy(ge(Low), Or) :-
  345    !,
  346    token_index(Map),
  347    rdf_keys_in_literal_map(Map, ge(Low), Tokens),
  348    list_to_or(Tokens, ge(Low), Or).
  349expand_fuzzy(Token, Result) :-
  350    atomic(Token),
  351    !,
  352    (   rdf_stopgap_token(Token)
  353    ->  Result = @(stopgap)
  354    ;   Result = Token
  355    ).
  356expand_fuzzy(Token, _) :-
  357    throw(error(type_error(Token, boolean_expression), _)).
  358
  359simplify(Expr0, Expr) :-
  360    simple(Expr0, Expr),
  361    !.
  362simplify(Expr, Expr).
  363
  364simple(and(@(false), _), @(false)).
  365simple(and(_, @(false)), @(false)).
  366simple(and(@(stopgap), Token), Token).
  367simple(and(Token, @(stopgap)), Token).
  368simple(or(@(false), X), X).
  369simple(or(X, @(false)), X).
  370simple(or(@(stopgap), Token), Token).
  371simple(or(Token, @(stopgap)), Token).
  372
  373
  374list_to_or([], _, @(false)) :- !.
  375list_to_or([X], How, One) :-
  376    !,
  377    tag(How, X, One).
  378list_to_or([H0|T0], How, or(H, T)) :-
  379    tag(How, H0, H),
  380    list_to_or(T0, How, T).
  381
  382tag(sounds(X),    Y, sounds(X,Y)).
  383tag(stem(X),      Y, stem(X,Y)).
  384tag(prefix(X),    Y, prefix(X,Y)).
  385tag(case(X),      Y, case(X,Y)).
  386tag(between(L,H), Y, between(L,H,Y)).
  387tag(ge(L),        Y, ge(L,Y)).
  388tag(le(H),        Y, le(H,Y)).
  389
  390untag(sounds(_,Y),    Y).
  391untag(stem(_,Y),      Y).
  392untag(prefix(_,Y),    Y).
  393untag(case(_,Y),      Y).
  394untag(between(_,_,Y), Y).
  395untag(le(_,Y),        Y).
  396untag(ge(_,Y),        Y).
  397
  398untag(sounds(X,Y),    sounds(X),    Y).
  399untag(stem(X,Y),      stem(X),      Y).
  400untag(prefix(X,Y),    prefix(X),    Y).
  401untag(case(X,Y),      case(X),      Y).
  402untag(between(L,H,Y), between(L,H), Y).
  403untag(ge(L,Y),        ge(L),        Y).
  404untag(le(H,Y),        le(H),        Y).
 nnf(+Formula, -NNF)
Rewrite to Negative Normal Form, meaning negations only appear around literals.
  412nnf(not(not(A0)), A) :-
  413    !,
  414    nnf(A0, A).
  415nnf(not(and(A0,B0)), or(A,B)) :-
  416    !,
  417    nnf(not(A0), A),
  418    nnf(not(B0), B).
  419nnf(not(or(A0,B0)), and(A,B)) :-
  420    !,
  421    nnf(not(A0), A),
  422    nnf(not(B0), B).
  423nnf(A, A).
 dnf(+NNF, -DNF)
Convert a formula in NNF to Disjunctive Normal Form (DNF)
  430dnf(or(A0,B0), or(A, B)) :-
  431    !,
  432    dnf(A0, A),
  433    dnf(B0, B).
  434dnf(and(A0,B0), DNF):-
  435    !,
  436    dnf(A0, A1),
  437    dnf(B0, B1),
  438    dnf1(and(A1,B1), DNF).
  439dnf(DNF, DNF).
  440
  441dnf1(and(A0, or(B,C)), or(P,Q)) :-
  442    !,
  443    dnf1(and(A0,B), P),
  444    dnf1(and(A0,C), Q).
  445dnf1(and(or(B,C), A0), or(P,Q)) :-
  446    !,
  447    dnf1(and(A0,B), P),
  448    dnf1(and(A0,C), Q).
  449dnf1(DNF, DNF).
  450
  451
  452                 /*******************************
  453                 *          TOKEN INDEX         *
  454                 *******************************/
 token_index(-Map)
Get the index of tokens. If not present, create one from the current database. Once created, the map is kept up-to-date using a monitor hook.
  462token_index(Map) :-
  463    literal_map(token, Map),
  464    !,
  465    wait_for_map(token).
  466token_index(Map) :-
  467    rdf_new_literal_map(Map),
  468    assert(literal_map(token, Map)),
  469    register_token_updater,
  470    message_queue_create(Queue),
  471    assert(map_building(token, Queue)),
  472    thread_create(make_literal_index(Queue), _,
  473                  [ alias('__rdf_tokenizer'),
  474                    detached(true)
  475                  ]),
  476    wait_for_map(token).
  477
  478register_token_updater :-
  479    Monitor = [ reset,
  480                new_literal,
  481                old_literal
  482              ],
  483    (   setting(index(default))
  484    ->  create_update_literal_thread(1),
  485        rdf_monitor(thread_monitor_literal, Monitor)
  486    ;   setting(index(thread(N)))
  487    ->  create_update_literal_thread(N),
  488        rdf_monitor(thread_monitor_literal, Monitor)
  489    ;   rdf_monitor(monitor_literal, Monitor)
  490    ).
  491
  492make_literal_index(Queue) :-
  493    call_cleanup(
  494        make_literal_index,
  495        ( message_queue_destroy(Queue),
  496          retractall(map_building(token, _)))).
 make_literal_index
Create the initial literal index.
  502make_literal_index :-
  503    setting(index_threads(N)),
  504    !,
  505    threaded_literal_index(N),
  506    verbose('~N', []).
  507make_literal_index :-
  508    current_prolog_flag(cpu_count, X),
  509    threaded_literal_index(X),
  510    verbose('~N', []).
  511
  512threaded_literal_index(N) :-
  513    N > 1,
  514    !,
  515    message_queue_create(Q, [max_size(1000)]),
  516    create_index_threads(N, Q, Ids),
  517    forall(rdf_current_literal(Literal),
  518           thread_send_message(Q, Literal)),
  519    forall(between(1, N, _),
  520           thread_send_message(Q, done(true))),
  521    maplist(thread_join, Ids, _).
  522threaded_literal_index(_) :-
  523    forall(rdf_current_literal(Literal),
  524           register_literal(Literal)).
  525
  526create_index_threads(N, Q, [Id|T]) :-
  527    N > 0,
  528    !,
  529    thread_create(index_worker(Q), Id, []),
  530    N2 is N - 1,
  531    create_index_threads(N2, Q, T).
  532create_index_threads(_, _, []) :- !.
  533
  534index_worker(Queue) :-
  535    repeat,
  536        thread_get_message(Queue, Msg),
  537        work(Msg).
  538
  539work(done(true)) :- !.
  540work(Literal) :-
  541    register_literal(Literal),
  542    fail.
 clean_token_index
Clean after a reset.
  549clean_token_index :-
  550    forall(literal_map(_, Map),
  551           rdf_reset_literal_map(Map)),
  552    retractall(stopgap(_)).
 rdf_delete_literal_index(+Type)
Fully delete a literal index
  558rdf_delete_literal_index(Type) :-
  559    must_be(atom, Type),
  560    (   retract(literal_map(Type, Map))
  561    ->  rdf_reset_literal_map(Map)          % destroy is unsafe
  562    ).
  563
  564                 /*******************************
  565                 *        THREADED UPDATE       *
  566                 *******************************/
 create_update_literal_thread(+Threads)
Setup literal monitoring using threads. While loading databases through rdf_attach_db/2 from rdf_persistency.pl, most of the time is spent updating the literal token database. While loading the RDF triples, most of the time is spend in updating the AVL tree holding the literals. Updating the token index hangs on updating the AVL trees holding the tokens. Both tasks however can run concurrently.
  578create_update_literal_thread(Threads) :-
  579    message_queue_create(_,
  580                         [ alias(rdf_literal_monitor_queue),
  581                           max_size(50000)
  582                         ]),
  583    forall(between(1, Threads, _),
  584           create_index_worker(initial)).
  585
  586:- dynamic
  587    index_worker_id/1,
  588    extra_worker_count/1.  589
  590create_index_worker(Status) :-
  591    (   retract(index_worker_id(Id0))
  592    ->  true
  593    ;   Id0 = 1
  594    ),
  595    succ(Id0, Id1),
  596    assertz(index_worker_id(Id1)),
  597    atom_concat(rdf_literal_monitor_, Id0, Alias),
  598    inc_extra_worker_count(Status),
  599    thread_create(monitor_literals(Status), _,
  600                  [ alias(Alias)
  601                  ]).
  602
  603monitor_literals(initial) :-
  604    set_prolog_flag(agc_margin, 0), % we don't create garbage
  605    repeat,
  606        thread_get_message(rdf_literal_monitor_queue, Literal),
  607        register_literal(Literal),
  608    fail.
  609monitor_literals(extra) :-
  610    set_prolog_flag(agc_margin, 0),
  611    repeat,
  612        (   thread_get_message(rdf_literal_monitor_queue, Literal,
  613                               [ timeout(1)
  614                               ])
  615        ->  register_literal(Literal),
  616            fail
  617        ;   !
  618        ),
  619    with_mutex(create_index_worker, dec_extra_worker_count),
  620    thread_self(Me),
  621    thread_detach(Me).
  622
  623thread_monitor_literal(new_literal(Literal)) :-
  624    !,
  625    thread_send_message(rdf_literal_monitor_queue, Literal).
  626thread_monitor_literal(Action) :-
  627    !,
  628    monitor_literal(Action).
 check_index_workers(+Queue, +Keys)
Increase the number of workers indexing literals sent to Queue if the queue gets overful.
  635check_index_workers(Alias, Keys) :-
  636    max_extra_workers(Max),
  637    Max > 0,
  638    message_queue_property(Queue, alias(Alias)),
  639    message_queue_property(Queue, size(Size)),
  640    Size > 10000,
  641    \+ ( extra_worker_count(Extra),
  642         Extra >= Max
  643       ),
  644    !,
  645    debug(rdf_litindex,
  646          'Creating extra literal indexer (Queue=~D, Keys=~D)',
  647          [Size, Keys]),
  648    with_mutex(create_index_worker, create_index_worker(extra)).
  649check_index_workers(_, _).
  650
  651inc_extra_worker_count(extra) :-
  652    !,
  653    (   retract(extra_worker_count(C0))
  654    ->  C is C0+1
  655    ;   C = 1
  656    ),
  657    asserta(extra_worker_count(C)).
  658inc_extra_worker_count(_).
  659
  660dec_extra_worker_count :-
  661    retract(extra_worker_count(C0)),
  662    !,
  663    C is C0-1,
  664    asserta(extra_worker_count(C)).
  665dec_extra_worker_count.
  666
  667max_extra_workers(Max) :-
  668    current_prolog_flag(cpu_count, Count),
  669    Max is Count//2.
  670
  671
  672                 /*******************************
  673                 *       MONITORED UPDATE       *
  674                 *******************************/
  675
  676monitor_literal(new_literal(Literal)) :-
  677    register_literal(Literal).
  678monitor_literal(old_literal(Literal)) :-
  679    unregister_literal(Literal).
  680monitor_literal(transaction(begin, reset)) :-
  681    rdf_monitor(monitor_literal, [-old_literal]),
  682    clean_token_index.
  683monitor_literal(transaction(end, reset)) :-
  684    rdf_monitor(monitor_literal, [+old_literal]).
 register_literal(+Literal)
Associate the tokens of a literal with the literal itself.
  690register_literal(Literal) :-
  691    (   rdf_tokenize_literal(Literal, Tokens0)
  692    ->  sort(Tokens0, Tokens),
  693        text_of(Literal, Lang, Text),
  694        literal_map(token, Map),
  695        add_tokens(Tokens, Lang, Text, Map)
  696    ;   true
  697    ).
  698
  699add_tokens([], _, _, _).
  700add_tokens([H|T], Lang, Literal, Map) :-
  701    rdf_insert_literal_map(Map, H, Literal, Keys),
  702    (   var(Keys)
  703    ->  (   rdf_keys_in_literal_map(Map, key(H), Count),
  704            setting(stopgap_threshold(Threshold)),
  705            Count > Threshold
  706        ->  assert(stopgap(H)),
  707            rdf_delete_literal_map(Map, H)
  708        ;   true
  709        )
  710    ;   forall(new_token(H, Lang), true),
  711        (   Keys mod 1000 =:= 0
  712        ->  progress(Map, 'Tokens'),
  713            (   Keys mod 10000 =:= 0
  714            ->  check_index_workers(rdf_literal_monitor_queue, Keys)
  715            ;   true
  716            )
  717        ;   true
  718        )
  719    ),
  720    add_tokens(T, Lang, Literal, Map).
 unregister_literal(+Literal)
Literal is removed from the database. As we abstract from lang and type qualifiers we first have to check this is the last one that is destroyed.
  729unregister_literal(Literal) :-
  730    text_of(Literal, _Lang, Text),
  731    (   rdf(_,_,literal(Text))
  732    ->  true                        % still something left
  733    ;   rdf_tokenize_literal(Literal, Tokens0),
  734        sort(Tokens0, Tokens),
  735        literal_map(token, Map),
  736        del_tokens(Tokens, Text, Map)
  737    ).
  738
  739del_tokens([], _, _).
  740del_tokens([H|T], Literal, Map) :-
  741    rdf_delete_literal_map(Map, H, Literal),
  742    del_tokens(T, Literal, Map).
 rdf_tokenize_literal(+Literal, -Tokens) is semidet
Tokenize a literal. We make this hookable as tokenization is generally domain dependent.
  750rdf_tokenize_literal(Literal, Tokens) :-
  751    tokenization(Literal, Tokens),
  752    !.               % Hook
  753rdf_tokenize_literal(Literal, Tokens) :-
  754    text_of(Literal, _Lang, Text),
  755    atom(Text),
  756    tokenize_atom(Text, Tokens0),
  757    select_tokens(Tokens0, Tokens).
  758
  759select_tokens([], []).
  760select_tokens([H|T0], T) :-
  761    (   exclude_from_index(token, H)
  762    ->  select_tokens(T0, T)
  763    ;   number(H)
  764    ->  (   integer(H),
  765            between(-1073741824, 1073741823, H)
  766        ->  T = [H|T1],
  767            select_tokens(T0, T1)
  768        ;   select_tokens(T0, T)
  769        )
  770    ;   atom_length(H, 1)
  771    ->  select_tokens(T0, T)
  772    ;   default_stopgap(H)
  773    ->  select_tokens(T0, T)
  774    ;   stopgap(H)
  775    ->  select_tokens(T0, T)
  776    ;   T = [H|T1],
  777        select_tokens(T0, T1)
  778    ).
 rdf_stopgap_token(-Token) is nondet
True when Token is a stopgap token. Currently, this implies one of:
  791rdf_stopgap_token(Token) :-
  792    (   var(Token)
  793    ->  rdf_stopgap_token2(Token)
  794    ;   rdf_stopgap_token2(Token), !
  795    ).
  796
  797rdf_stopgap_token2(Token) :-
  798    exclude_from_index(token, Token).
  799rdf_stopgap_token2(Token) :-
  800    default_stopgap(Token).
  801rdf_stopgap_token2(Token) :-
  802    atom(Token),
  803    atom_length(Token, 1).
  804rdf_stopgap_token2(Token) :-
  805    stopgap(Token).
 default_stopgap(?Token)
Tokens we do not wish to index, as they creat huge amounts of data with little or no value. Is there a more general way to describe this? Experience shows that simply word count is not a good criterium as it often rules out popular domain terms.
  814default_stopgap(and).
  815default_stopgap(an).
  816default_stopgap(or).
  817default_stopgap(of).
  818default_stopgap(on).
  819default_stopgap(in).
  820default_stopgap(this).
  821default_stopgap(the).
 text_of(+LiteralArg, -Lang, -Text) is semidet
Get the textual or (integer) numerical information from a literal value. Lang is the language to use for stemming. Currently we use English for untyped plain literals or literals typed xsd:string. Formally, these should not be tokenized, but a lot of data out there does not tag strings with their language.
  832text_of(type(xsd:string, Text), en, Text) :- !.
  833text_of(type(_, Text), -, Text) :- !.
  834text_of(lang(Lang, Text), Lang, Text) :- !.
  835text_of(Text, en, Text) :- atom(Text), !.
  836text_of(Text, -, Text) :- integer(Text).
  837
  838
  839                 /*******************************
  840                 *         STEM INDEX           *
  841                 *******************************/
 stem_index(-Map) is det
Get the stemming literal index. This index is created on demand. If some thread is creating the index, other threads wait for its completion.
  849stem_index(Map) :-
  850    literal_map(stem, Map),
  851    !,
  852    wait_for_map(stem).
  853stem_index(Map) :-
  854    rdf_new_literal_map(Map),
  855    assert(literal_map(stem, Map)),
  856    assert((new_token(Token, Lang) :- add_stem(Token, Lang, Map))),
  857    message_queue_create(Queue),
  858    assert(map_building(stem, Queue)),
  859    thread_create(fill_stem_index(Map, Queue), _,
  860                  [ alias('__rdf_stemmer'),
  861                    detached(true)
  862                  ]),
  863    wait_for_map(stem).
  864
  865wait_for_map(MapName) :-
  866    (   map_building(MapName, Queue)
  867    ->  catch(thread_get_message(Queue, _), _, true),
  868        wait_for_map(MapName)
  869    ;   true
  870    ).
  871
  872fill_stem_index(StemMap, Queue) :-
  873    call_cleanup(
  874        forall(rdf_current_literal(Literal),
  875               stem_literal_tokens(Literal, StemMap)),
  876        ( message_queue_destroy(Queue),
  877          retractall(map_building(stem, _)))).
  878
  879stem_literal_tokens(Literal, StemMap) :-
  880    rdf_tokenize_literal(Literal, Tokens),
  881    !,
  882    sort(Tokens, Tokens1),
  883    text_of(Literal, Lang, _Text),
  884    insert_tokens_stem(Tokens1, Lang, StemMap).
  885stem_literal_tokens(_,_).
  886
  887insert_tokens_stem([], _, _).
  888insert_tokens_stem([Token|T], Lang, Map) :-
  889    (   atom(Token)
  890    ->  (   stem(Token, Lang, Stem)
  891        ->  rdf_insert_literal_map(Map, Stem, Token, Keys),
  892            (   integer(Keys),
  893                Keys mod 1000 =:= 0
  894            ->  progress(Map, 'Stem')
  895            ;   true
  896            )
  897        ;   true
  898        )
  899    ;   true
  900    ),
  901    insert_tokens_stem(T, Lang, Map).
  902
  903
  904add_stem(Token, Lang, Map) :-
  905    stem(Lang, Token, Stem),
  906    rdf_insert_literal_map(Map, Stem, Token, _).
  907
  908:- if(current_predicate(snowball/3)).  909stem(Token, LangSpec, Stem) :-
  910    main_lang(LangSpec, Lang),
  911    downcase_atom(Token, Lower),
  912    catch(snowball(Lang, Lower, Stem), _, fail).
  913:- else.  914stem(Token, _Lang, Stem) :-
  915    downcase_atom(Token, Lower),
  916    porter_stem(Lower, Stem).
  917:- endif.  918
  919main_lang(LangSpec, Lang) :-
  920    sub_atom(LangSpec, Before, _, _, -),
  921    !,
  922    sub_atom(LangSpec, 0, Before, _, Lang).
  923main_lang(LangSpec, Lang) :-
  924    downcase_atom(LangSpec, Lang).
  925
  926
  927                 /*******************************
  928                 *        METAPHONE INDEX       *
  929                 *******************************/
  930
  931
  932metaphone_index(Map) :-
  933    literal_map(metaphone, Map),
  934    !,
  935    wait_for_map(metaphone).
  936metaphone_index(Map) :-
  937    rdf_new_literal_map(Map),
  938    assert(literal_map(metaphone, Map)),
  939    assert((new_token(Token, Lang) :- add_metaphone(Token, Lang, Map))),
  940    message_queue_create(Queue),
  941    assert(map_building(metaphone, Queue)),
  942    thread_create(fill_metaphone_index(Map, Queue), _,
  943                  [ alias('__rdf_metaphone_indexer'),
  944                    detached(true)
  945                  ]),
  946    wait_for_map(metaphone).
  947
  948fill_metaphone_index(MetaphoneMap, Queue) :-
  949    call_cleanup(
  950        fill_metaphone_index(MetaphoneMap),
  951        ( message_queue_destroy(Queue),
  952          retractall(map_building(metaphone, _)))).
  953
  954fill_metaphone_index(MetaphoneMap) :-
  955    token_index(TokenMap),
  956    rdf_keys_in_literal_map(TokenMap, all, Tokens),
  957    metaphone(Tokens, MetaphoneMap).
  958
  959metaphone([], _).
  960metaphone([Token|T], Map) :-
  961    (   atom(Token),
  962        double_metaphone(Token, SoundEx)
  963    ->  rdf_insert_literal_map(Map, SoundEx, Token, Keys),
  964        (   integer(Keys),
  965            Keys mod 1000 =:= 0
  966        ->  progress(Map, 'Metaphone')
  967        ;   true
  968        )
  969    ;   true
  970    ),
  971    metaphone(T, Map).
  972
  973
  974add_metaphone(Token, _Lang, Map) :-
  975    atom(Token),
  976    !,
  977    double_metaphone(Token, SoundEx),
  978    rdf_insert_literal_map(Map, SoundEx, Token).
  979add_metaphone(_, _, _).
 rdf_literal_index(+Type, -Index) is det
True when Index is a literal map containing the index of Type. Type is one of:
token
Tokens are basically words of literal values. See rdf_tokenize_literal/2. The token map maps tokens to full literal texts.
stem
Index of stemmed tokens. If the language is available, the tokens are stemmed using the matching snowball stemmer. The stem map maps stemmed to full tokens.
metaphone
Phonetic index of tokens. The metaphone map maps phonetic keys to tokens.
  998rdf_literal_index(token, Map) :-
  999    !,
 1000    token_index(Map).
 1001rdf_literal_index(stem, Map) :-
 1002    !,
 1003    stem_index(Map).
 1004rdf_literal_index(metaphone, Map) :-
 1005    !,
 1006    metaphone_index(Map).
 1007rdf_literal_index(Type, _Map) :-
 1008    domain_error(literal_index, Type).
 1009
 1010
 1011                 /*******************************
 1012                 *             UTIL             *
 1013                 *******************************/
 1014
 1015verbose(Fmt, Args) :-
 1016    setting(verbose(true)),
 1017    !,
 1018    format(user_error, Fmt, Args).
 1019verbose(_, _).
 1020
 1021progress(Map, Which) :-
 1022    setting(verbose(true)),
 1023    !,
 1024    rdf_statistics_literal_map(Map, size(Keys, Values)),
 1025    format(user_error,
 1026           '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
 1027           [Which, Keys, Values]).
 1028progress(_,_)