rdf_litindex.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2006-2015, University of Amsterdam
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(rdf_litindex,
   37          [ rdf_set_literal_index_option/1,     % +Options
   38            rdf_tokenize_literal/2,             % +Literal, -Tokens
   39            rdf_find_literal/2,                 % +Spec, -Literal
   40            rdf_find_literals/2,                % +Spec, -ListOfLiterals
   41            rdf_token_expansions/2,             % +Spec, -Expansions
   42            rdf_stopgap_token/1,                % -Token
   43
   44            rdf_literal_index/2,                % +Type, -Index
   45            rdf_delete_literal_index/1          % +Type
   46          ]).   47:- use_module(rdf_db).   48:- use_module(library(debug)).   49:- use_module(library(lists)).   50:- use_module(library(error)).   51:- use_module(library(apply)).   52:- if(exists_source(library(snowball))).   53:- use_module(library(snowball)).   54:- else.   55:- use_module(library(porter_stem)).   56:- endif.   57:- use_module(library(double_metaphone)).   58
   59/** <module> Search literals
   60
   61This module finds literals of the RDF  database based on words, stemming
   62and _sounds like_ (metaphone).  The normal user-level predicate is
   63
   64  - rdf_find_literals/2
   65*/
   66
   67:- dynamic
   68    literal_map/2,                  % Type, -Map
   69    map_building/2,                 % Type, -Queue
   70    new_token/2,                    % Hook
   71    setting/1,
   72    stopgap/1.   73:- volatile
   74    literal_map/2.   75:- multifile
   76    tokenization/2,                 % +Literal, -Tokens
   77    exclude_from_index/2.           % +Which, +Token
   78
   79
   80setting(verbose(false)).                % print progress messages
   81setting(index_threads(1)).              % # threads for creating the index
   82setting(index(thread(1))).              % Use a thread for incremental updates
   83setting(stopgap_threshold(50000)).      % consider token a stopgap over N
   84
   85%!  rdf_set_literal_index_option(+Options:list)
   86%
   87%   Set options for the literal package.  Currently defined options
   88%
   89%           * verbose(Bool)
   90%           If =true=, print progress messages while building the
   91%           index tables.
   92%
   93%           * index_threads(+Count)
   94%           Number of threads to use for initial indexing of
   95%           literals
   96%
   97%           * index(+How)
   98%           How to deal with indexing new literals.  How is one of
   99%           =self= (execute in the same thread), thread(N) (execute
  100%           in N concurrent threads) or =default= (depends on number
  101%           of cores).
  102%
  103%           * stopgap_threshold(+Count)
  104%           Add a token to the dynamic stopgap set if it appears in
  105%           more than Count literals.  The default is 50,000.
  106
  107rdf_set_literal_index_option([]) :- !.
  108rdf_set_literal_index_option([H|T]) :-
  109    !,
  110    set_option(H),
  111    rdf_set_literal_index_option(T).
  112rdf_set_literal_index_option(Option) :-
  113    set_option(Option).
  114
  115set_option(Term) :-
  116    check_option(Term),
  117    functor(Term, Name, Arity),
  118    functor(General, Name, Arity),
  119    retractall(setting(General)),
  120    assert(setting(Term)).
  121
  122check_option(X) :-
  123    var(X),
  124    !,
  125    instantiation_error(X).
  126check_option(verbose(X)) :-
  127    !,
  128    must_be(boolean, X).
  129check_option(index_threads(Count)) :-
  130    !,
  131    must_be(nonneg, Count).
  132check_option(stopgap_threshold(Count)) :-
  133    !,
  134    must_be(nonneg, Count).
  135check_option(index(How)) :-
  136    !,
  137    must_be(oneof([default,thread(_),self]), How).
  138check_option(Option) :-
  139    domain_error(literal_option, Option).
  140
  141
  142                 /*******************************
  143                 *            QUERY             *
  144                 *******************************/
  145
  146%!  rdf_find_literal(+Spec, -Literal) is nondet.
  147%!  rdf_find_literals(+Spec, -Literals) is det.
  148%
  149%   Find literals in the RDF database matching Spec.  Spec is defined
  150%   as:
  151%
  152%   ==
  153%   Spec ::= and(Spec,Spec)
  154%   Spec ::= or(Spec,Spec)
  155%   Spec ::= not(Spec)
  156%   Spec ::= sounds(Like)
  157%   Spec ::= stem(Like)             % same as stem(Like, en)
  158%   Spec ::= stem(Like, Lang)
  159%   Spec ::= prefix(Prefix)
  160%   Spec ::= between(Low, High)     % Numerical between
  161%   Spec ::= ge(High)               % Numerical greater-equal
  162%   Spec ::= le(Low)                % Numerical less-equal
  163%   Spec ::= Token
  164%   ==
  165%
  166%   sounds(Like) and stem(Like) both map to  a disjunction. First we
  167%   compile the spec to normal form:   a disjunction of conjunctions
  168%   on elementary tokens. Then we execute   all the conjunctions and
  169%   generate the union using ordered-set algorithms.
  170%
  171%   Stopgaps are ignored. If the final result is only a stopgap, the
  172%   predicate fails.
  173%
  174%   @tbd Exploit ordering of numbers and allow for > N, < N, etc.
  175
  176rdf_find_literal(Spec, Literal) :-
  177    rdf_find_literals(Spec, Literals),
  178    member(Literal, Literals).
  179
  180rdf_find_literals(Spec, Literals) :-
  181    compile_spec(Spec, DNF),
  182    DNF \== @(stopgap),
  183    token_index(Map),
  184    lookup(DNF, Map, _, SuperSet),
  185    flatten(SuperSet, Set0),
  186    sort(Set0, Literals).
  187
  188%!  rdf_token_expansions(+Spec, -Extensions)
  189%
  190%   Determine which extensions of  a   token  contribute  to finding
  191%   literals.
  192
  193rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
  194    token_index(Map),
  195    rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens).
  196rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
  197    metaphone_index(Map),
  198    rdf_find_literal_map(Map, [Like], Tokens).
  199rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
  200    stem_index(Map),
  201    rdf_find_literal_map(Map, [Like], Tokens).
  202rdf_token_expansions(Spec, Expansions) :-
  203    compile_spec(Spec, DNF),
  204    token_index(Map),
  205    lookup(DNF, Map, SCS, _),
  206    flatten(SCS, CS),
  207    sort(CS, Expansions0),
  208    join_expansions(Expansions0, Expansions).
  209
  210join_expansions([], []).
  211join_expansions([H0|T0], [H|T]) :-
  212    untag(H0, Tag, V0),
  213    Tag =.. L0,
  214    append(L0, [[V0|Values]], L1),
  215    H =.. L1,
  216    join_expansions_by_tag(T0, Tag, T1, Values),
  217    join_expansions(T1, T).
  218
  219join_expansions_by_tag([H|T0], Tag, T, [V0|VT]) :-
  220    untag(H, Tag, V0),
  221    !,
  222    join_expansions_by_tag(T0, Tag, T, VT).
  223join_expansions_by_tag(L, _, L, []).
  224
  225lookup(@(false), _, [], []) :- !.
  226lookup(or(H0,T0), Map, [CH|CT], [H|T]) :-
  227    !,
  228    lookup(H0, Map, CH, H),
  229    lookup(T0, Map, CT, T).
  230lookup(H0, Map, [C], [H]) :-
  231    lookup1(H0, Map, C, H).
  232
  233lookup1(Conj, Map, Cond, Literals) :-
  234    phrase(conj_to_list(Conj), List),
  235    !,
  236    rdf_find_literal_map(Map, List, Literals),
  237    (   Literals \== []
  238    ->  phrase(conj_to_cond(Conj), Cond)
  239    ;   Cond = []
  240    ).
  241lookup1(_, _, _, []).
  242
  243conj_to_list(and(A,B)) -->
  244    !,
  245    conj_to_list(A),
  246    conj_to_list(B).
  247conj_to_list(@(false)) -->
  248    !,
  249    {fail}.
  250conj_to_list(Tagged) -->
  251    { untag(Tagged, L) },
  252    !,
  253    [L].
  254conj_to_list(L) -->
  255    [L].
  256
  257
  258conj_to_cond(and(A,B)) -->
  259    !,
  260    conj_to_cond(A),
  261    conj_to_cond(B).
  262conj_to_cond(Tagged) -->
  263    { untag(Tagged, _) },
  264    !,
  265    [ Tagged ].
  266conj_to_cond(_) -->
  267    [].
  268
  269
  270%!  compile_spec(+Spec, -Compiled)
  271%
  272%   Compile a specification as above into disjunctive normal form
  273
  274compile_spec(Spec, DNF) :-
  275    expand_fuzzy(Spec, Spec2),
  276    nnf(Spec2, NNF),
  277    dnf(NNF, DNF).
  278
  279
  280expand_fuzzy(Var, _) :-
  281    var(Var),
  282    !,
  283    throw(error(instantiation_error, _)).
  284expand_fuzzy(sounds(Like), Or) :-
  285    !,
  286    (   atom(Like)
  287    ->  metaphone_index(Map),
  288        double_metaphone(Like, Key),
  289        rdf_find_literal_map(Map, [Key], Tokens),
  290        list_to_or(Tokens, sounds(Like), Or)
  291    ;   expand_fuzzy(Like, Or)
  292    ).
  293expand_fuzzy(stem(Like), Or) :-
  294    !,
  295    expand_fuzzy(stem(Like, en), Or).
  296expand_fuzzy(stem(Like, Lang), Or) :-
  297    !,
  298    (   atom(Like)
  299    ->  stem_index(Map),
  300        stem(Like, Lang, Key),
  301        rdf_find_literal_map(Map, [Key], Tokens),
  302        list_to_or(Tokens, stem(Like), Or)
  303    ;   expand_fuzzy(Like, Or)
  304    ).
  305expand_fuzzy(prefix(Prefix), Or) :-
  306    !,
  307    (   atom(Prefix)
  308    ->  token_index(Map),
  309        rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens),
  310        list_to_or(Tokens, prefix(Prefix), Or)
  311    ;   expand_fuzzy(Prefix, Or)
  312    ).
  313expand_fuzzy(case(String), Or) :-
  314    !,
  315    (   atom(String)
  316    ->  token_index(Map),
  317        rdf_keys_in_literal_map(Map, case(String), Tokens),
  318        list_to_or(Tokens, case(String), Or)
  319    ;   expand_fuzzy(String, Or)
  320    ).
  321expand_fuzzy(or(A0, B0), E) :-
  322    !,
  323    expand_fuzzy(A0, A),
  324    expand_fuzzy(B0, B),
  325    simplify(or(A,B), E).
  326expand_fuzzy(and(A0, B0), E) :-
  327    !,
  328    expand_fuzzy(A0, A),
  329    expand_fuzzy(B0, B),
  330    simplify(and(A,B), E).
  331expand_fuzzy(not(A0), not(A)) :-
  332    !,
  333    expand_fuzzy(A0, A).
  334expand_fuzzy(between(Low, High), Or) :-
  335    !,
  336    token_index(Map),
  337    rdf_keys_in_literal_map(Map, between(Low, High), Tokens),
  338    list_to_or(Tokens, between(Low, High), Or).
  339expand_fuzzy(le(High), Or) :-
  340    !,
  341    token_index(Map),
  342    rdf_keys_in_literal_map(Map, le(High), Tokens),
  343    list_to_or(Tokens, le(High), Or).
  344expand_fuzzy(ge(Low), Or) :-
  345    !,
  346    token_index(Map),
  347    rdf_keys_in_literal_map(Map, ge(Low), Tokens),
  348    list_to_or(Tokens, ge(Low), Or).
  349expand_fuzzy(Token, Result) :-
  350    atomic(Token),
  351    !,
  352    (   rdf_stopgap_token(Token)
  353    ->  Result = @(stopgap)
  354    ;   Result = Token
  355    ).
  356expand_fuzzy(Token, _) :-
  357    throw(error(type_error(Token, boolean_expression), _)).
  358
  359simplify(Expr0, Expr) :-
  360    simple(Expr0, Expr),
  361    !.
  362simplify(Expr, Expr).
  363
  364simple(and(@(false), _), @(false)).
  365simple(and(_, @(false)), @(false)).
  366simple(and(@(stopgap), Token), Token).
  367simple(and(Token, @(stopgap)), Token).
  368simple(or(@(false), X), X).
  369simple(or(X, @(false)), X).
  370simple(or(@(stopgap), Token), Token).
  371simple(or(Token, @(stopgap)), Token).
  372
  373
  374list_to_or([], _, @(false)) :- !.
  375list_to_or([X], How, One) :-
  376    !,
  377    tag(How, X, One).
  378list_to_or([H0|T0], How, or(H, T)) :-
  379    tag(How, H0, H),
  380    list_to_or(T0, How, T).
  381
  382tag(sounds(X),    Y, sounds(X,Y)).
  383tag(stem(X),      Y, stem(X,Y)).
  384tag(prefix(X),    Y, prefix(X,Y)).
  385tag(case(X),      Y, case(X,Y)).
  386tag(between(L,H), Y, between(L,H,Y)).
  387tag(ge(L),        Y, ge(L,Y)).
  388tag(le(H),        Y, le(H,Y)).
  389
  390untag(sounds(_,Y),    Y).
  391untag(stem(_,Y),      Y).
  392untag(prefix(_,Y),    Y).
  393untag(case(_,Y),      Y).
  394untag(between(_,_,Y), Y).
  395untag(le(_,Y),        Y).
  396untag(ge(_,Y),        Y).
  397
  398untag(sounds(X,Y),    sounds(X),    Y).
  399untag(stem(X,Y),      stem(X),      Y).
  400untag(prefix(X,Y),    prefix(X),    Y).
  401untag(case(X,Y),      case(X),      Y).
  402untag(between(L,H,Y), between(L,H), Y).
  403untag(ge(L,Y),        ge(L),        Y).
  404untag(le(H,Y),        le(H),        Y).
  405
  406
  407%!  nnf(+Formula, -NNF)
  408%
  409%   Rewrite to Negative Normal Form, meaning negations only appear
  410%   around literals.
  411
  412nnf(not(not(A0)), A) :-
  413    !,
  414    nnf(A0, A).
  415nnf(not(and(A0,B0)), or(A,B)) :-
  416    !,
  417    nnf(not(A0), A),
  418    nnf(not(B0), B).
  419nnf(not(or(A0,B0)), and(A,B)) :-
  420    !,
  421    nnf(not(A0), A),
  422    nnf(not(B0), B).
  423nnf(A, A).
  424
  425
  426%!  dnf(+NNF, -DNF)
  427%
  428%   Convert a formula in NNF to Disjunctive Normal Form (DNF)
  429
  430dnf(or(A0,B0), or(A, B)) :-
  431    !,
  432    dnf(A0, A),
  433    dnf(B0, B).
  434dnf(and(A0,B0), DNF):-
  435    !,
  436    dnf(A0, A1),
  437    dnf(B0, B1),
  438    dnf1(and(A1,B1), DNF).
  439dnf(DNF, DNF).
  440
  441dnf1(and(A0, or(B,C)), or(P,Q)) :-
  442    !,
  443    dnf1(and(A0,B), P),
  444    dnf1(and(A0,C), Q).
  445dnf1(and(or(B,C), A0), or(P,Q)) :-
  446    !,
  447    dnf1(and(A0,B), P),
  448    dnf1(and(A0,C), Q).
  449dnf1(DNF, DNF).
  450
  451
  452                 /*******************************
  453                 *          TOKEN INDEX         *
  454                 *******************************/
  455
  456%!  token_index(-Map)
  457%
  458%   Get the index of tokens. If  not   present,  create one from the
  459%   current database. Once created, the map is kept up-to-date using
  460%   a monitor hook.
  461
  462token_index(Map) :-
  463    literal_map(token, Map),
  464    !,
  465    wait_for_map(token).
  466token_index(Map) :-
  467    rdf_new_literal_map(Map),
  468    assert(literal_map(token, Map)),
  469    register_token_updater,
  470    message_queue_create(Queue),
  471    assert(map_building(token, Queue)),
  472    thread_create(make_literal_index(Queue), _,
  473                  [ alias('__rdf_tokenizer'),
  474                    detached(true)
  475                  ]),
  476    wait_for_map(token).
  477
  478register_token_updater :-
  479    Monitor = [ reset,
  480                new_literal,
  481                old_literal
  482              ],
  483    (   setting(index(default))
  484    ->  create_update_literal_thread(1),
  485        rdf_monitor(thread_monitor_literal, Monitor)
  486    ;   setting(index(thread(N)))
  487    ->  create_update_literal_thread(N),
  488        rdf_monitor(thread_monitor_literal, Monitor)
  489    ;   rdf_monitor(monitor_literal, Monitor)
  490    ).
  491
  492make_literal_index(Queue) :-
  493    call_cleanup(
  494        make_literal_index,
  495        ( message_queue_destroy(Queue),
  496          retractall(map_building(token, _)))).
  497
  498%!  make_literal_index
  499%
  500%   Create the initial literal index.
  501
  502make_literal_index :-
  503    setting(index_threads(N)),
  504    !,
  505    threaded_literal_index(N),
  506    verbose('~N', []).
  507make_literal_index :-
  508    current_prolog_flag(cpu_count, X),
  509    threaded_literal_index(X),
  510    verbose('~N', []).
  511
  512threaded_literal_index(N) :-
  513    N > 1,
  514    !,
  515    message_queue_create(Q, [max_size(1000)]),
  516    create_index_threads(N, Q, Ids),
  517    forall(rdf_current_literal(Literal),
  518           thread_send_message(Q, Literal)),
  519    forall(between(1, N, _),
  520           thread_send_message(Q, done(true))),
  521    maplist(thread_join, Ids, _).
  522threaded_literal_index(_) :-
  523    forall(rdf_current_literal(Literal),
  524           register_literal(Literal)).
  525
  526create_index_threads(N, Q, [Id|T]) :-
  527    N > 0,
  528    !,
  529    thread_create(index_worker(Q), Id, []),
  530    N2 is N - 1,
  531    create_index_threads(N2, Q, T).
  532create_index_threads(_, _, []) :- !.
  533
  534index_worker(Queue) :-
  535    repeat,
  536        thread_get_message(Queue, Msg),
  537        work(Msg).
  538
  539work(done(true)) :- !.
  540work(Literal) :-
  541    register_literal(Literal),
  542    fail.
  543
  544
  545%!  clean_token_index
  546%
  547%   Clean after a reset.
  548
  549clean_token_index :-
  550    forall(literal_map(_, Map),
  551           rdf_reset_literal_map(Map)),
  552    retractall(stopgap(_)).
  553
  554%!  rdf_delete_literal_index(+Type)
  555%
  556%   Fully delete a literal index
  557
  558rdf_delete_literal_index(Type) :-
  559    must_be(atom, Type),
  560    (   retract(literal_map(Type, Map))
  561    ->  rdf_reset_literal_map(Map)          % destroy is unsafe
  562    ).
  563
  564                 /*******************************
  565                 *        THREADED UPDATE       *
  566                 *******************************/
  567
  568%!  create_update_literal_thread(+Threads)
  569%
  570%   Setup literal monitoring using threads.  While loading databases
  571%   through rdf_attach_db/2 from  rdf_persistency.pl,   most  of the
  572%   time is spent updating the literal token database. While loading
  573%   the RDF triples, most of the time   is spend in updating the AVL
  574%   tree holding the literals. Updating  the   token  index hangs on
  575%   updating the AVL trees holding the   tokens.  Both tasks however
  576%   can run concurrently.
  577
  578create_update_literal_thread(Threads) :-
  579    message_queue_create(_,
  580                         [ alias(rdf_literal_monitor_queue),
  581                           max_size(50000)
  582                         ]),
  583    forall(between(1, Threads, _),
  584           create_index_worker(initial)).
  585
  586:- dynamic
  587    index_worker_id/1,
  588    extra_worker_count/1.  589
  590create_index_worker(Status) :-
  591    (   retract(index_worker_id(Id0))
  592    ->  true
  593    ;   Id0 = 1
  594    ),
  595    succ(Id0, Id1),
  596    assertz(index_worker_id(Id1)),
  597    atom_concat(rdf_literal_monitor_, Id0, Alias),
  598    inc_extra_worker_count(Status),
  599    thread_create(monitor_literals(Status), _,
  600                  [ alias(Alias)
  601                  ]).
  602
  603monitor_literals(initial) :-
  604    set_prolog_flag(agc_margin, 0), % we don't create garbage
  605    repeat,
  606        thread_get_message(rdf_literal_monitor_queue, Literal),
  607        register_literal(Literal),
  608    fail.
  609monitor_literals(extra) :-
  610    set_prolog_flag(agc_margin, 0),
  611    repeat,
  612        (   thread_get_message(rdf_literal_monitor_queue, Literal,
  613                               [ timeout(1)
  614                               ])
  615        ->  register_literal(Literal),
  616            fail
  617        ;   !
  618        ),
  619    with_mutex(create_index_worker, dec_extra_worker_count),
  620    thread_self(Me),
  621    thread_detach(Me).
  622
  623thread_monitor_literal(new_literal(Literal)) :-
  624    !,
  625    thread_send_message(rdf_literal_monitor_queue, Literal).
  626thread_monitor_literal(Action) :-
  627    !,
  628    monitor_literal(Action).
  629
  630%!  check_index_workers(+Queue, +Keys)
  631%
  632%   Increase the number of workers indexing   literals sent to Queue
  633%   if the queue gets overful.
  634
  635check_index_workers(Alias, Keys) :-
  636    max_extra_workers(Max),
  637    Max > 0,
  638    message_queue_property(Queue, alias(Alias)),
  639    message_queue_property(Queue, size(Size)),
  640    Size > 10000,
  641    \+ ( extra_worker_count(Extra),
  642         Extra >= Max
  643       ),
  644    !,
  645    debug(rdf_litindex,
  646          'Creating extra literal indexer (Queue=~D, Keys=~D)',
  647          [Size, Keys]),
  648    with_mutex(create_index_worker, create_index_worker(extra)).
  649check_index_workers(_, _).
  650
  651inc_extra_worker_count(extra) :-
  652    !,
  653    (   retract(extra_worker_count(C0))
  654    ->  C is C0+1
  655    ;   C = 1
  656    ),
  657    asserta(extra_worker_count(C)).
  658inc_extra_worker_count(_).
  659
  660dec_extra_worker_count :-
  661    retract(extra_worker_count(C0)),
  662    !,
  663    C is C0-1,
  664    asserta(extra_worker_count(C)).
  665dec_extra_worker_count.
  666
  667max_extra_workers(Max) :-
  668    current_prolog_flag(cpu_count, Count),
  669    Max is Count//2.
  670
  671
  672                 /*******************************
  673                 *       MONITORED UPDATE       *
  674                 *******************************/
  675
  676monitor_literal(new_literal(Literal)) :-
  677    register_literal(Literal).
  678monitor_literal(old_literal(Literal)) :-
  679    unregister_literal(Literal).
  680monitor_literal(transaction(begin, reset)) :-
  681    rdf_monitor(monitor_literal, [-old_literal]),
  682    clean_token_index.
  683monitor_literal(transaction(end, reset)) :-
  684    rdf_monitor(monitor_literal, [+old_literal]).
  685
  686%!  register_literal(+Literal)
  687%
  688%   Associate the tokens of a literal with the literal itself.
  689
  690register_literal(Literal) :-
  691    (   rdf_tokenize_literal(Literal, Tokens0)
  692    ->  sort(Tokens0, Tokens),
  693        text_of(Literal, Lang, Text),
  694        literal_map(token, Map),
  695        add_tokens(Tokens, Lang, Text, Map)
  696    ;   true
  697    ).
  698
  699add_tokens([], _, _, _).
  700add_tokens([H|T], Lang, Literal, Map) :-
  701    rdf_insert_literal_map(Map, H, Literal, Keys),
  702    (   var(Keys)
  703    ->  (   rdf_keys_in_literal_map(Map, key(H), Count),
  704            setting(stopgap_threshold(Threshold)),
  705            Count > Threshold
  706        ->  assert(stopgap(H)),
  707            rdf_delete_literal_map(Map, H)
  708        ;   true
  709        )
  710    ;   forall(new_token(H, Lang), true),
  711        (   Keys mod 1000 =:= 0
  712        ->  progress(Map, 'Tokens'),
  713            (   Keys mod 10000 =:= 0
  714            ->  check_index_workers(rdf_literal_monitor_queue, Keys)
  715            ;   true
  716            )
  717        ;   true
  718        )
  719    ),
  720    add_tokens(T, Lang, Literal, Map).
  721
  722
  723%!  unregister_literal(+Literal)
  724%
  725%   Literal is removed from the database.   As we abstract from lang
  726%   and type qualifiers we first have to  check this is the last one
  727%   that is destroyed.
  728
  729unregister_literal(Literal) :-
  730    text_of(Literal, _Lang, Text),
  731    (   rdf(_,_,literal(Text))
  732    ->  true                        % still something left
  733    ;   rdf_tokenize_literal(Literal, Tokens0),
  734        sort(Tokens0, Tokens),
  735        literal_map(token, Map),
  736        del_tokens(Tokens, Text, Map)
  737    ).
  738
  739del_tokens([], _, _).
  740del_tokens([H|T], Literal, Map) :-
  741    rdf_delete_literal_map(Map, H, Literal),
  742    del_tokens(T, Literal, Map).
  743
  744
  745%!  rdf_tokenize_literal(+Literal, -Tokens) is semidet.
  746%
  747%   Tokenize a literal. We make  this   hookable  as tokenization is
  748%   generally domain dependent.
  749
  750rdf_tokenize_literal(Literal, Tokens) :-
  751    tokenization(Literal, Tokens),
  752    !.               % Hook
  753rdf_tokenize_literal(Literal, Tokens) :-
  754    text_of(Literal, _Lang, Text),
  755    atom(Text),
  756    tokenize_atom(Text, Tokens0),
  757    select_tokens(Tokens0, Tokens).
  758
  759select_tokens([], []).
  760select_tokens([H|T0], T) :-
  761    (   exclude_from_index(token, H)
  762    ->  select_tokens(T0, T)
  763    ;   number(H)
  764    ->  (   integer(H),
  765            between(-1073741824, 1073741823, H)
  766        ->  T = [H|T1],
  767            select_tokens(T0, T1)
  768        ;   select_tokens(T0, T)
  769        )
  770    ;   atom_length(H, 1)
  771    ->  select_tokens(T0, T)
  772    ;   default_stopgap(H)
  773    ->  select_tokens(T0, T)
  774    ;   stopgap(H)
  775    ->  select_tokens(T0, T)
  776    ;   T = [H|T1],
  777        select_tokens(T0, T1)
  778    ).
  779
  780%!  rdf_stopgap_token(-Token) is nondet.
  781%
  782%   True when Token is a stopgap  token. Currently, this implies one
  783%   of:
  784%
  785%     - exclude_from_index(token, Token) is true
  786%     - default_stopgap(Token) is true
  787%     - Token is an atom of length 1
  788%     - Token was added to the dynamic stopgap token set because
  789%       it appeared in more than _stopgap_threshold_ literals.
  790
  791rdf_stopgap_token(Token) :-
  792    (   var(Token)
  793    ->  rdf_stopgap_token2(Token)
  794    ;   rdf_stopgap_token2(Token), !
  795    ).
  796
  797rdf_stopgap_token2(Token) :-
  798    exclude_from_index(token, Token).
  799rdf_stopgap_token2(Token) :-
  800    default_stopgap(Token).
  801rdf_stopgap_token2(Token) :-
  802    atom(Token),
  803    atom_length(Token, 1).
  804rdf_stopgap_token2(Token) :-
  805    stopgap(Token).
  806
  807%!  default_stopgap(?Token)
  808%
  809%   Tokens we do not wish to index,   as  they creat huge amounts of
  810%   data with little or no value.  Is   there  a more general way to
  811%   describe this? Experience shows that simply  word count is not a
  812%   good criterium as it often rules out popular domain terms.
  813
  814default_stopgap(and).
  815default_stopgap(an).
  816default_stopgap(or).
  817default_stopgap(of).
  818default_stopgap(on).
  819default_stopgap(in).
  820default_stopgap(this).
  821default_stopgap(the).
  822
  823
  824%!  text_of(+LiteralArg, -Lang, -Text) is semidet.
  825%
  826%   Get the textual  or  (integer)   numerical  information  from  a
  827%   literal value. Lang  is  the  language   to  use  for  stemming.
  828%   Currently we use English for untyped  plain literals or literals
  829%   typed xsd:string. Formally, these should not be tokenized, but a
  830%   lot of data out there does not tag strings with their language.
  831
  832text_of(type(xsd:string, Text), en, Text) :- !.
  833text_of(type(_, Text), -, Text) :- !.
  834text_of(lang(Lang, Text), Lang, Text) :- !.
  835text_of(Text, en, Text) :- atom(Text), !.
  836text_of(Text, -, Text) :- integer(Text).
  837
  838
  839                 /*******************************
  840                 *         STEM INDEX           *
  841                 *******************************/
  842
  843%!  stem_index(-Map) is det.
  844%
  845%   Get the stemming literal index. This index is created on demand.
  846%   If some thread is creating the index, other threads wait for its
  847%   completion.
  848
  849stem_index(Map) :-
  850    literal_map(stem, Map),
  851    !,
  852    wait_for_map(stem).
  853stem_index(Map) :-
  854    rdf_new_literal_map(Map),
  855    assert(literal_map(stem, Map)),
  856    assert((new_token(Token, Lang) :- add_stem(Token, Lang, Map))),
  857    message_queue_create(Queue),
  858    assert(map_building(stem, Queue)),
  859    thread_create(fill_stem_index(Map, Queue), _,
  860                  [ alias('__rdf_stemmer'),
  861                    detached(true)
  862                  ]),
  863    wait_for_map(stem).
  864
  865wait_for_map(MapName) :-
  866    (   map_building(MapName, Queue)
  867    ->  catch(thread_get_message(Queue, _), _, true),
  868        wait_for_map(MapName)
  869    ;   true
  870    ).
  871
  872fill_stem_index(StemMap, Queue) :-
  873    call_cleanup(
  874        forall(rdf_current_literal(Literal),
  875               stem_literal_tokens(Literal, StemMap)),
  876        ( message_queue_destroy(Queue),
  877          retractall(map_building(stem, _)))).
  878
  879stem_literal_tokens(Literal, StemMap) :-
  880    rdf_tokenize_literal(Literal, Tokens),
  881    !,
  882    sort(Tokens, Tokens1),
  883    text_of(Literal, Lang, _Text),
  884    insert_tokens_stem(Tokens1, Lang, StemMap).
  885stem_literal_tokens(_,_).
  886
  887insert_tokens_stem([], _, _).
  888insert_tokens_stem([Token|T], Lang, Map) :-
  889    (   atom(Token)
  890    ->  (   stem(Token, Lang, Stem)
  891        ->  rdf_insert_literal_map(Map, Stem, Token, Keys),
  892            (   integer(Keys),
  893                Keys mod 1000 =:= 0
  894            ->  progress(Map, 'Stem')
  895            ;   true
  896            )
  897        ;   true
  898        )
  899    ;   true
  900    ),
  901    insert_tokens_stem(T, Lang, Map).
  902
  903
  904add_stem(Token, Lang, Map) :-
  905    stem(Lang, Token, Stem),
  906    rdf_insert_literal_map(Map, Stem, Token, _).
  907
  908:- if(current_predicate(snowball/3)).  909stem(Token, LangSpec, Stem) :-
  910    main_lang(LangSpec, Lang),
  911    downcase_atom(Token, Lower),
  912    catch(snowball(Lang, Lower, Stem), _, fail).
  913:- else.  914stem(Token, _Lang, Stem) :-
  915    downcase_atom(Token, Lower),
  916    porter_stem(Lower, Stem).
  917:- endif.  918
  919main_lang(LangSpec, Lang) :-
  920    sub_atom(LangSpec, Before, _, _, -),
  921    !,
  922    sub_atom(LangSpec, 0, Before, _, Lang).
  923main_lang(LangSpec, Lang) :-
  924    downcase_atom(LangSpec, Lang).
  925
  926
  927                 /*******************************
  928                 *        METAPHONE INDEX       *
  929                 *******************************/
  930
  931
  932metaphone_index(Map) :-
  933    literal_map(metaphone, Map),
  934    !,
  935    wait_for_map(metaphone).
  936metaphone_index(Map) :-
  937    rdf_new_literal_map(Map),
  938    assert(literal_map(metaphone, Map)),
  939    assert((new_token(Token, Lang) :- add_metaphone(Token, Lang, Map))),
  940    message_queue_create(Queue),
  941    assert(map_building(metaphone, Queue)),
  942    thread_create(fill_metaphone_index(Map, Queue), _,
  943                  [ alias('__rdf_metaphone_indexer'),
  944                    detached(true)
  945                  ]),
  946    wait_for_map(metaphone).
  947
  948fill_metaphone_index(MetaphoneMap, Queue) :-
  949    call_cleanup(
  950        fill_metaphone_index(MetaphoneMap),
  951        ( message_queue_destroy(Queue),
  952          retractall(map_building(metaphone, _)))).
  953
  954fill_metaphone_index(MetaphoneMap) :-
  955    token_index(TokenMap),
  956    rdf_keys_in_literal_map(TokenMap, all, Tokens),
  957    metaphone(Tokens, MetaphoneMap).
  958
  959metaphone([], _).
  960metaphone([Token|T], Map) :-
  961    (   atom(Token),
  962        double_metaphone(Token, SoundEx)
  963    ->  rdf_insert_literal_map(Map, SoundEx, Token, Keys),
  964        (   integer(Keys),
  965            Keys mod 1000 =:= 0
  966        ->  progress(Map, 'Metaphone')
  967        ;   true
  968        )
  969    ;   true
  970    ),
  971    metaphone(T, Map).
  972
  973
  974add_metaphone(Token, _Lang, Map) :-
  975    atom(Token),
  976    !,
  977    double_metaphone(Token, SoundEx),
  978    rdf_insert_literal_map(Map, SoundEx, Token).
  979add_metaphone(_, _, _).
  980
  981%!  rdf_literal_index(+Type, -Index) is det.
  982%
  983%   True when Index is a literal map   containing the index of Type.
  984%   Type is one of:
  985%
  986%     - token
  987%     Tokens are basically words of literal values. See
  988%     rdf_tokenize_literal/2.  The `token` map maps tokens to full
  989%     literal texts.
  990%     - stem
  991%     Index of stemmed tokens.  If the language is available, the
  992%     tokens are stemmed using the matching _snowball_ stemmer.
  993%     The `stem` map maps stemmed to full tokens.
  994%     - metaphone
  995%     Phonetic index of tokens.  The `metaphone` map maps phonetic
  996%     keys to tokens.
  997
  998rdf_literal_index(token, Map) :-
  999    !,
 1000    token_index(Map).
 1001rdf_literal_index(stem, Map) :-
 1002    !,
 1003    stem_index(Map).
 1004rdf_literal_index(metaphone, Map) :-
 1005    !,
 1006    metaphone_index(Map).
 1007rdf_literal_index(Type, _Map) :-
 1008    domain_error(literal_index, Type).
 1009
 1010
 1011                 /*******************************
 1012                 *             UTIL             *
 1013                 *******************************/
 1014
 1015verbose(Fmt, Args) :-
 1016    setting(verbose(true)),
 1017    !,
 1018    format(user_error, Fmt, Args).
 1019verbose(_, _).
 1020
 1021progress(Map, Which) :-
 1022    setting(verbose(true)),
 1023    !,
 1024    rdf_statistics_literal_map(Map, size(Keys, Values)),
 1025    format(user_error,
 1026           '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
 1027           [Which, Keys, Values]).
 1028progress(_,_)