rdf_optimise.pl

View source with raw comments or as raw

    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (C): 1985-2012, University of Amsterdam
    7			      VU University Amsterdam
    8
    9    This program is free software; you can redistribute it and/or
   10    modify it under the terms of the GNU General Public License
   11    as published by the Free Software Foundation; either version 2
   12    of the License, or (at your option) any later version.
   13
   14    This program is distributed in the hope that it will be useful,
   15    but WITHOUT ANY WARRANTY; without even the implied warranty of
   16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   17    GNU General Public License for more details.
   18
   19    You should have received a copy of the GNU Lesser General Public
   20    License along with this library; if not, write to the Free Software
   21    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   22
   23    As a special exception, if you link this library with other files,
   24    compiled with a Free Software compiler, to produce an executable, this
   25    library does not by itself cause the resulting executable to be covered
   26    by the GNU General Public License. This exception does not however
   27    invalidate any other reasons why the executable file might be covered by
   28    the GNU General Public License.
   29*/
   30
   31:- module(rdf_optimise,
   32	  [ rdf_optimise/1,		% :Query
   33	    rdf_optimise/2,		% +Query, -Optimised
   34	    rdf_optimise/4,		% +Query, -Optimised, -Space, -Time
   35	    rdf_complexity/3,		% :Goal, -SpaceEstimate, -TimeEstimate
   36	    serql_select_bind_null/2	% +Goal, -WithBind
   37	  ]).   38:- use_module(library(semweb/rdf_db)).   39:- use_module(library(debug)).   40:- use_module(library(lists)).   41:- use_module(library(pairs)).   42:- use_module(library(ordsets)).   43:- use_module(library(ugraphs)).   44
   45:- meta_predicate
   46	rdf_optimise(0).   47
   48/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   49Queries  as  returned  by  serql_compile_path/2    consists  of  a  path
   50expression which is compiled to a conjunction  of calls to rdf/3 and the
   51translation of the WHERE clause acting as an additional filter.
   52
   53Optimisation of a query basically means moving conditions into the rdf/3
   54calls where possible, moving other conditions   as  early as possibly in
   55the graph-matching and reordering graph matching calls (rdf/3) to reduce
   56non-determinism.
   57
   58Reordering
   59----------
   60
   61Reordering of graph expressions  is   required  to  reduce backtracking.
   62Roughly I see three approaches:
   63
   64	* Learning
   65	Create permutations of the query and make them run under time
   66	constraints.  Try to learn patterns that work (fast).
   67
   68	* Use statistics
   69	Given the number of solutions on a certain partially instantiated
   70	rdf/3 call (and the required execution time), reorganise them to
   71	minimalise the cost.
   72
   73	* Use constraint solving
   74	Instead of trying to solve an rdf/3 call, create a constraint from
   75	it and only try to solve it if there is more information.  This
   76	is especially attractive if some form of high-level reasoning
   77	from the language entailment rules can be applied or set-theory
   78	is a possibility.
   79
   80After experiments with using constraint solving,   this  module now uses
   81planning based on statistical information provided by the rdf_db module.
   82This  algorithm  reaches  optimal  performance  under  quite  reasonable
   83assumptions  while  the  planning  overhead   is  very  reasonable.  The
   84algorithm is described in "An  optimised   Semantic  Web  query language
   85implementation        in        Prolog",          available         from
   86http://www.swi-prolog.org/download/publications/ICLP05-SeRQL.pdf
   87
   88NOTES:
   89
   90	* SeRQL LIKE works on resources *and* literals.  Do we want this?
   91	  See http://rdf4j.org/forum/mvnforum/viewthread?thread=275
   92- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
   93
   94:- multifile
   95	user:goal_expansion/2.   96
   97user:goal_expansion(rdf_complexity(G0, C), rdf_complexity(G, C)) :-
   98	expand_goal(G0, G).
   99user:goal_expansion(rdf_optimise(G0, O), rdf_optimise(G, O)) :-
  100	expand_goal(G0, G).
  101user:goal_expansion(rdf_optimise(G0, O, C, E), rdf_optimise(G, O, C, E)) :-
  102	expand_goal(G0, G).
  103
  104
  105/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  106Plan (conjunctions)
  107
  108	* Generate permutations and costs.  Select cheapest
  109	* The above moves tests right after the RDF call filling
  110	  its input argument.  As a last optimisation some of these
  111	  searches may be integrated in the rdf/3 call to help using
  112	  indexing of the RDF database.
  113
  114complexity/2 needs to update the order of clauses inside meta calls
  115(notably optional path expressions).
  116- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

rdf_optimise(:Goal) is nondet

Optimise Goal and execute the result. Semantically equivalent to call/1.

To be done: - Module handling is not correct.

  125rdf_optimise(Module:Goal) :-
  126	rdf_optimise(Goal, Optimised),
  127	call(Module:Optimised).

rdf_optimise(+Goal, -Optimized) is det: Goal is a Prolog control-structure with calls to the rdf_db.pl and SeRQL runtime predicates. Optimized is a semantically equivalent goal, obtained by re-ordering conjunctions in Goal and processing sub-queries that do not share variables independently.

  138rdf_optimise(Conj, Optimised) :-
  139	rdf_optimise(Conj, Optimised, _, _).
  140
  141rdf_optimise(Conj, Optimised, Space, Time) :-
  142	debug(rdf_optimise, '*** OPTIMIZING ***~n', []),
  143	dbg_portray_body(Conj),
  144	term_variables(Conj, Vars),
  145	rdf_complexity(Conj, Conj, S0, E0),
  146	State = state(Vars-Conj, S0, E0, 1),
  147	debug(rdf_optimise, 'C0 = ~w~n', [E0]),
  148	(   reorder(Conj, Perm),
  149	    (	debugging(rdf_optimise(all))
  150	    ->	dbg_portray_body(Perm)
  151	    ;	true
  152	    ),
  153	    rdf_complexity(Perm, Perm1, S, C),
  154	    debug(rdf_optimise(all), '--> space=~w, time=~w~n', [S, C]),
  155
  156	    arg(4, State, N),
  157	    N2 is N + 1,
  158	    nb_setarg(4, State, N2),
  159
  160	    (	arg(3, State, C0),
  161		C < C0
  162	    ->	debug(rdf_optimise,
  163		      'BETTER ONE [~D]: --> space=~w, time=~w~n', [N, S, C]),
  164		dbg_portray_body(Perm1),
  165	        nb_setarg(3, State, C),
  166		nb_setarg(2, State, S),
  167		nb_setarg(1, State, Vars-Perm1)
  168	    ;	true
  169	    ),
  170	    fail
  171	;   arg(1, State, Vars-Optimised),
  172	    arg(2, State, Space),
  173	    arg(3, State, Time),
  174	    debug(rdf_optimise, '  --> optimised: s/t = ~w/~w --> ~w/~w~n',
  175		  [S0, E0, Space, Time]),
  176	    dbg_portray_body(Optimised)
  177	), !.
  178optimise_order(Conj, Conj, -1, -1) :-
  179	debug(rdf_optimise, 'Failed to optimise:~n', []),
  180	dbg_portray_body(Conj).
  181
  182
  183		 /*******************************
  184		 *	     REORDERING		*
  185		 *******************************/

reorder(Goal, Reordered): Assuming Goal is a conjunction, create permutations of its order. Instead of blindly generating permutations however, we get an arbitrary element of the conjunction to the front and compute subgraphs of goals connected through variables after executing the first goal.

  195reorder(Goal, Reordered) :-
  196	State = bindings([]),
  197	conj_to_list(Goal, Conj0),
  198	reorder_conj(Conj0, State, Conj1),
  199	list_to_conj(Conj1, Reordered),
  200	arg(1, State, Bindings),
  201	unbind(Bindings).
  202
  203reorder_conj([One], _, [One]) :- !.
  204reorder_conj(List, State, Perm) :-
  205	group_by_cut(List, Before, Cut, After), !,
  206	reorder_conj(Before, State, PermBefore),
  207	bind_args(Before, State),		% this part is done
  208	reorder_conj(After, State, PermAfter),
  209	append(PermBefore, [Cut|PermAfter], Perm).
  210reorder_conj(List, State, Perm) :-
  211	group_by_optional(List, Normal, Optional), !,
  212	reorder_conj(Normal, State, PermNormal),
  213	bind_args(Normal, State),		% this part is done
  214	reorder_conj(Optional, State, PermOptional),
  215	append(PermNormal, PermOptional, Perm).
  216reorder_conj(List, State, [goal(_,independent(SubPerms),_)]) :-
  217	make_subgraphs(List, SubGraphs),
  218	SubGraphs \= [_], !,
  219	reorder_subgraph_conjs(SubGraphs, State, SubPerms).
  220reorder_conj(List, State, [Prefix|Perm]) :-
  221	select(Prefix, List, Rest),
  222	bind_args(Prefix, State),
  223	make_subgraphs(Rest, SubGraphs),
  224	(   SubGraphs = [SubGraph]
  225	->  reorder_conj2(SubGraph, State, Perm)
  226	;   Perm = [goal(_,independent(SubPerms),_)],
  227	    reorder_subgraph_conjs(SubGraphs, State, SubPerms)
  228	).

reorder_subgraph_conjs(SubGraphs, -ReorderedSubGraphs): Reorder the individual subgraphs. As we know these are independent there is no need to order the subgraphs themselves. we also know they are fully connected, and no longer contain cuts, so we use the simplified version of reorder_conj/2 called reorder_conj2/2.

  239reorder_subgraph_conjs([], _, []).
  240reorder_subgraph_conjs([H0|T0], State, [H|T]) :-
  241	reorder_conj2(H0, State, H1),
  242	list_to_conj(H1, H),
  243	reorder_subgraph_conjs(T0, State, T).
  244
  245reorder_conj2([One], _, [One]) :- !.
  246reorder_conj2(List, State, [Prefix|Perm]) :-
  247	select(Prefix, List, Rest),
  248	bind_args(Prefix, State),
  249	make_subgraphs(Rest, SubGraphs),
  250	(   SubGraphs = [SubGraph]
  251	->  reorder_conj2(SubGraph, State, Perm)
  252	;   Perm = [goal(_,independent(SubPerms),_)],
  253	    reorder_subgraph_conjs(SubGraphs, State, SubPerms)
  254	).

group_by_cut(+List, -BeforeCut, -Cut, -AfterCut): Split the conjunction over a cut (!) as we cannot guarantee proper behaviour when moving goals to the other side of a cut.

  261group_by_cut(Goals, Before, Cut, After) :-
  262	Cut = goal(_, !, _),
  263	append(Before, [Cut|After], Goals), !.

group_by_optional(+List, -NotOptional, -Optional): Split the conjunction in optional and non-optional part as we always want the optional part to happen last.

  271group_by_optional(List, Normal, Optional) :-
  272	split_optional(List, Normal, Optional),
  273	Normal \== [],
  274	Optional \== [].
  275
  276split_optional([], [], []).
  277split_optional([H|T0], Normal, Optional) :-
  278	(   optional(H)
  279	->  Optional = [H|T],
  280	    split_optional(T0, Normal, T)
  281	;   Normal = [H|T],
  282	    split_optional(T0, T, Optional)
  283	).
  284
  285optional(G) :-
  286	goal(G, (_*->_;_)).

bind_args(Goal, !State): Bind all arguments in Goal or list of goals. Assumes that executing a goal grounds all its arguments. Only the goal A = literal(B), generated by optimising where constraints is handled special.
State is a term bindings(List) that is destructively maintained by instantiate/4.

  298bind_args([], _) :- !.
  299bind_args([H|T], State) :- !,
  300	bind_args(H, State),
  301	bind_args(T, State).
  302bind_args(H, State) :-
  303	goal(H, A=literal(B)),
  304	var(A), var(B), !,
  305	(   instantiated(A, I),
  306	    I \== (-)
  307	->  instantiate(B, _, I, State)
  308	;   instantiated(B, I),
  309	    I \== (-)
  310	->  instantiate(A, _, I, State)
  311	;   true
  312	).
  313bind_args(Goal, State) :-
  314	vars(Goal, Vars),
  315	ground_vars(Vars, State).
  316
  317ground_vars([], _).
  318ground_vars([H|T], State) :-
  319	instantiate(H, _, r, State),
  320	ground_vars(T, State).

make_subgraphs(+Goals, -SubGraphs)

Create a list of connected subgraphs from Goals, assuming the variables in the assoc Grounded have been bound.

Arguments:

Goals - is a list goal(Id, Goal, Vars).

  330make_subgraphs([Goal], [[Goal]]) :- !.
  331make_subgraphs([G1,G2], Graphs) :- !,
  332	unbound_vars(G1, V1),
  333	unbound_vars(G2, V2),
  334	(   ord_intersect(V1, V2)
  335	->  Graphs = [[G1,G2]]
  336	;   Graphs = [[G1],[G2]]
  337	).
  338make_subgraphs(Goals, SubGraphs) :-
  339	map_list_to_pairs(unbound_vars, Goals, UnBoundKeyed),
  340	connected_pairs(UnBoundKeyed, Edges),
  341	vertices_edges_to_ugraph(Goals, Edges, UGraph),
  342	connected_vertices(UGraph, SubGraphs).
  343
  344connected_pairs([], []).
  345connected_pairs([H|T], Edges) :-
  346	connected_pairs(T, H, Edges, EdgeTail),
  347	connected_pairs(T, EdgeTail).
  348
  349connected_pairs([], _, Edges, Edges).
  350connected_pairs([H|T], To, Edges, EdgeTail) :-
  351	(   connected(H, To, GH, GT)
  352	->  Edges = [GH-GT,GT-GH|Edges1],
  353	    connected_pairs(T, To, Edges1, EdgeTail)
  354	;   connected_pairs(T, To, Edges, EdgeTail)
  355	).
  356
  357connected(V1-G1, V2-G2, G1, G2) :-
  358	ord_intersect(V1, V2).
  359
  360connected_vertices([], []) :- !.
  361connected_vertices(UGraph, [Set1|Sets]) :-
  362	UGraph = [V1-_|_],
  363	reachable(V1, UGraph, Set1),
  364	del_vertices(UGraph, Set1, UGraph2),
  365	connected_vertices(UGraph2, Sets).

unbound_vars(+Goal, -Vars) is det: True when Vars is an ordered set of unbound variables in Goal.

  372unbound_vars(Goal, Vars) :-
  373	vars(Goal, AllVars),
  374	unbound(AllVars, Vars0),
  375	sort(Vars0, Vars).
  376
  377unbound([], []).
  378unbound([H|T0], [H|T]) :-
  379	instantiated(H, -), !,
  380	unbound(T0, T).
  381unbound([_|T0], T) :-
  382	unbound(T0, T).

conj_to_list(+Conj, -List)

Translate a conjunction into a list of elements of the format

% goal(Id, Goal, Vars)

Where Id is a goal identifier, Goal  is the goal itself and Vars
is a list of variables inside the  Goal. Variables are sorted to
the standard order of terms.

  394conj_to_list(Conj, List) :-
  395	phrase(conj_to_list(Conj, 1, _), List).
  396
  397conj_to_list((A,B), N0, N) --> !,
  398	conj_to_list(A, N0, N1),
  399	conj_to_list(B, N1, N).
  400conj_to_list(true, N, N) --> !,
  401	[].
  402conj_to_list(G, N0, N) -->
  403	{ term_variables(G, Vars0),
  404	  sort(Vars0, Vars),
  405	  N is N0 + 1
  406	},
  407	[ goal(N0, G, Vars)
  408	].

list_to_conj(+List, -Conj)

  414list_to_conj([], true).
  415list_to_conj([goal(_,G,_)], G) :- !.
  416list_to_conj([goal(_,G,_)|T0], (G,T)) :-
  417	list_to_conj(T0, T).

id(G, Id) is det
goal(G, Goal) is det
vars(G, Vars) is det: Extract fields from the goal structure.

  426%id(goal(Id, _, _),     Id).
  427goal(goal(_, Goal, _), Goal).
  428vars(goal(_, _, Vars), Vars).
  429
  430
  431		 /*******************************
  432		 *	       BINDING		*
  433		 *******************************/
  434
  435/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  436Keep track of binding status of a   variable.  There are basically three
  437ways. One is to use a seperate   (assoc) table. Alternatively we can use
  438attributed variables and delete the attributes   at the end, and finally
  439we can use normal variables and recreate  the original goal by unbinding
  440them again.
  441
  442Using assocs requires us to pass  these   things  around  and involves a
  443log(N) complexity lookup. Using real terms  has the disadvantage that to
  444unbind we have to copy the term, thus  loosing bindings it may have with
  445the environment.  Using attributes suffers neither of these problems and
  446its only drawback is relying on non-standard Prolog features.
  447
  448Note that the remainder of the  algorithm   uses  sets  organised to the
  449standard order of terms.  As  putting   attributes  does  not change the
  450identity of global stack variables and goals are global stack terms this
  451is guaranteed.
  452- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

instantiate_obj(+Arg, -Old, +New, !Bindings) is det: Called to change the state of an RDF object after running some goal. Old is the current instantiation. After executing the RDF Goal the new instantiation is New (typically b). Bindings is a term bindings(List), which is updated using destructive assignment. List is the list of all variables to which we added attributes.

  463instantiate_obj(Arg, Old, New, Bindings) :-
  464	(   var(Arg)
  465	;   ground(Arg)
  466	), !,
  467	instantiate(Arg, Old, New, Bindings).
  468instantiate_obj(literal(Pattern, Var), +(Pattern), New, Bindings) :-
  469	instantiate(Var, _, New, Bindings).
  470
  471instantiate(Var, Old, New, Bindings) :-
  472	instantiated(Var, Old),
  473	(   Old == (-)
  474	->  put_attr(Var, instantiated, New),
  475	    arg(1, Bindings, B0),
  476	    setarg(1, Bindings, [Var|B0])
  477	;   true
  478	).
  479
  480instantiated(Term, How) :-
  481	(   nonvar(Term)
  482	->  How = +(+)
  483	;   get_attr(Term, instantiated, H)
  484	->  How = +(H)
  485	;   How = -
  486	).
  487
  488uninstantiate(Term, How) :-
  489	(   get_attr(Term, instantiated, How)
  490	->  del_attr(Term, instantiated)
  491	;   true
  492	).

instantiate_unify(A, B, State) is det: We encounter a plain unification. Ideally, we determine the most specific binding and propagate this and execute the unification. The latter however must be undone if we evaluate a disjunction. For now, the code is somewhat simplified and we merely decide that if one side is instantiated, the other will be too, disregarding the actual value we might know.

  503instantiate_unify(A, B, State) :-
  504	instantiated(B, +(_)), !,
  505	instantiate(A, _, b, State).
  506instantiate_unify(A, B, State) :-
  507	instantiated(A, +(_)), !,
  508	instantiate(B, _, b, State).
  509instantiate_unify(_, _, _).

attr_unify_hook(+Attribute, +Value): For now, the attribute unify hook only allows unifying with a variable with the same attribute. This deals with the unification that takes place in rdf_optimise/3 for the variables of the saved copy.

  519instantiated:attr_unify_hook(Attr, Value) :-
  520	get_attr(Value, instantiated, Attr).
  521
  522instantiated:attr_portray_hook(Value, _Var) :-
  523	write(+(Value)).

instantiate_term(+Term): Instantiate all unbound variables

  529instantiate_term(Term, How) :-
  530	compound(Term), !,
  531	functor(Term, _, Arity),
  532	instantiate_args(Arity, Term, How).
  533instantiate_term(Term, How) :-
  534	(   var(Term),
  535	    \+ get_attr(Term, instantiated, _)
  536	->  put_attr(Term, instantiated, How)
  537	;   true
  538	).
  539
  540instantiate_args(0, _, _) :- !.
  541instantiate_args(N, Term, How) :-
  542	arg(N, Term, A),
  543	instantiate_term(A, How),
  544	N2 is N - 1,
  545	instantiate_args(N2, Term, How).

uninstantiate_term(+Term, +How): Remove all skolem instantiations from Term.

  552uninstantiate_term(Term, How) :-
  553	compound(Term), !,
  554	functor(Term, _, Arity),
  555	uninstantiate_args(Arity, Term, How).
  556uninstantiate_term(Term, How) :-
  557	uninstantiate(Term, How).
  558
  559uninstantiate_args(0, _, _) :- !.
  560uninstantiate_args(N, Term, How) :-
  561	arg(N, Term, A),
  562	uninstantiate_term(A, How),
  563	N2 is N - 1,
  564	uninstantiate_args(N2, Term, How).

delete_instantiated(+List, -Unbound): Delete all elements of List that are not instantiated (i.e. var and without an instantiated attribute).

  572delete_instantiated([], []).
  573delete_instantiated([H|T0], L) :-
  574	(   instantiated(H, -)
  575	->  L = [H|T],
  576	    delete_instantiated(T0, T)
  577	;   delete_instantiated(T0, L)
  578	).
  579
  580
  581		 /*******************************
  582		 *	    COMPLEXITY		*
  583		 *******************************/

rdf_complexity(+GoalIn, -GoalOut, -SpaceEstimate, -TimeEstimate)

Provide an estimate for the size of the search space for executing Goal. For this we estimate the branching factor of each subgoal in the conjunction. If the branching factors are B0, B1, ... then the total complexity estimate is

E = 1 + B0 + B0*B1 + B0*B1*B2, ...

Non-RDF calls are supposed to be boolean tests that can be executed at the first opportunity all arguments are bound by RDF calls. They have a probability of failure, reducing the search space. Using the above formula, any number lower than 1 moves the test as far as possible to the head of the conjunction.

If GoalIn and GoalOut are the same the system will not try to optimize local conjunctions.

ISSUES: control structures ;, if-then-else, etc.

  605rdf_complexity(Goal, SpaceEstimate, TimeEstimate) :-
  606	rdf_complexity(Goal, Goal, SpaceEstimate, TimeEstimate).
  607
  608rdf_complexity(Goal0, Goal, Space, Time) :-
  609	State = bindings([]),
  610	complexity(Goal0, Goal, State, 1, Space, 0, Time),
  611	arg(1, State, Bindings),
  612	unbind(Bindings).
  613
  614unbind([]).
  615unbind([H|T]) :-
  616	del_attr(H, instantiated),
  617	unbind(T).
  618
  619%	complexity(:GoalIn, -GoalOut,
  620%		   +State,
  621%		   +SpaceIn, -SpaceOut,
  622%		   +CountIn, -CountOut)
  623%
  624%	Compute the complexity of Goal.  Vars   is  an assoc holding the
  625%	variables bound earlier in the conjunction. Space keeps the size
  626%	of the search space and Count is   the  cummulative count of the
  627%	costs for exploring the search space  espressed in the number of
  628%	nodes that will be visited.
  629%
  630%	The (G*->_=true;_=false) clause deals with   the  code generated
  631%	from optional graph specs as provided by SeRQL.
  632
  633complexity((A0,B0), (A,B), State, Sz0, Sz, C0, C) :- !,
  634	complexity(A0, A, State, Sz0, Sz1, C0, C1),
  635	complexity(B0, B, State, Sz1, Sz, C1, C).
  636complexity((G0*->True;False),
  637	   ( G*->True;False), State, Sz0, Sz, C0, C) :- !,
  638	(   var(G)
  639	->  optimise_order(G0, G, Sz1, C1),
  640	    Sz is Sz0 * Sz1,
  641	    C is C0+Sz0*C1
  642	;   complexity(G, G, State, Sz0, Sz, C0, C)
  643	).
  644complexity((If0->Then0;Else0),		% dubious
  645	   ( If->Then; Else), State, Sz0, Sz, C0, C) :- !,
  646	(   var(If)
  647	->  optimise_order(If0,   If,   Sz1, C1),
  648	    optimise_order(Then0, Then, Sz2, C2),
  649	    optimise_order(Else0, Else, Sz3, C3),
  650	    Sz is max(Sz0 * Sz1 * Sz2, Sz0 * Sz3),
  651	    C  is C0 + max(Sz0*C1+Sz0*Sz1*C2, Sz0*C1+Sz0*Sz1*C3)
  652	;   complexity(If,   If,   State, Sz0, Sz1, C0, C1),
  653	    complexity(Then, Then, State, Sz1, Sz2, C1, C2),
  654	    complexity(Else, Else, State, Sz0, Sz3, C0, C3),
  655	    Sz is max(Sz2, Sz3),
  656	    C  is max(C2, C3)
  657	).
  658complexity((A0;B0), (A;B), State, Sz0, Sz, C0, C) :- !,
  659	(   var(A)
  660	->  optimise_order(A0, A, _, _),	% First try the cheap one?
  661	    optimise_order(B0, B, _, _)
  662	;   A = A0,
  663	    B = B0
  664	),
  665	complexity(A, A, State, Sz0, SzA, C0, CA),
  666	complexity(B, B, State, Sz0, SzB, C0, CB),
  667	Sz is SzA + SzB,
  668	C is CA + CB.
  669complexity(sparql_group(G0), sparql_group(G), State, Sz0, Sz, C0, C) :- !,
  670	(   var(G)
  671	->  rdf_optimise(G0, G, Sz1, C1),
  672	    Sz is Sz0 * Sz1,
  673	    C is C0+Sz0*C1
  674	;   complexity(G, G, State, Sz0, Sz, C0, C)
  675	).
  676complexity(sparql_group(G0, OV, IV),
  677	   sparql_group(G, OV, IV),
  678	   State, Sz0, Sz, C0, C) :- !,
  679	(   var(G)
  680	->  rdf_optimise(G0, G, Sz1, C1),
  681	    Sz is Sz0 * Sz1,
  682	    C is C0+Sz0*C1
  683	;   complexity(G, G, State, Sz0, Sz, C0, C)
  684	).
  685complexity(rdfql_carthesian(Bags),
  686	   rdfql_carthesian(Bags), State, Sz0, Sz, C0, C) :- !,
  687	carth_complexity(Bags, State, Sz0, Sz, C0, 0, C).
  688complexity(independent(Goals0), Goal, State, Sz0, Sz, C0, C) :- !,
  689	independent_complexity(Goals0, Goal, State, Sz0, Sz, C0, 0, C).
  690complexity(Goal, Goal, State, Sz0, Sz, C0, C) :-
  691	Goal = member(Var, List), !,	% List is list of resources
  692	instantiate(Var, _, b, State),
  693	length(List, Branch),
  694	Sz is Sz0 * Branch,
  695	C is C0 + Sz0*0.2 + Sz*0.2.
  696complexity(Goal, Goal, State, Sz, Sz, C0, C) :-
  697	Goal = (A=B), !,
  698	instantiate_unify(A, B, State),
  699	C is C0 + 0.2.
  700complexity(Goal, Goal, State, Sz, Sz, C0, C) :-
  701	Goal = (Var=literal(V)), !,
  702	instantiated(V, +(_)),
  703	instantiate(Var, _, b, State),
  704	C is C0 + 0.2.
  705complexity(Goal, Goal, State, Sz0, Sz, C0, C) :-
  706	rdf_db_goal(Goal, S, P, O), !,
  707	instantiate(S, SI, b, State),
  708	instantiate(P, PI, b, State),
  709	instantiate_obj(O, OI, b, State),
  710	complexity0(SI, PI, OI, P, Goal, SetUp, PerAlt, Branch),
  711	Sz is Sz0 * Branch,
  712	C is C0 + Sz0*SetUp + Sz*PerAlt,
  713	debug(rdf_optimise(complexity), 'Complexity ~p: (~w) ~w --> ~w',
  714	      [i(SI,PI,OI), Goal, Branch, C]).
  715complexity(sparql_eval(E,V), sparql_eval(E,V), _, Sz0, Sz, C0, C) :- !,
  716	term_variables(E, Vars),
  717	all_bound(Vars),
  718	Sz is Sz0,			% probability of failure
  719	C is C0 + Sz*20.		% Sz * CostOfEval
  720complexity(sparql_true(E), sparql_true(E), _, Sz0, Sz, C0, C) :- !,
  721	term_variables(E, Vars),
  722	all_bound(Vars),
  723	Sz is Sz0 * 0.5,		% probability of failure
  724	C is C0 + Sz0*20.		% Sz * CostOfEval
  725complexity(G, G, _, Sz0, Sz, C0, C) :-	% non-rdf tests
  726	term_variables(G, Vars),
  727	all_bound(Vars),
  728	Sz is Sz0 * 0.5,		% probability of failure
  729	C is C0 + Sz0.			% Sz * CostOfTest
  730
  731all_bound([]).
  732all_bound([H|T]) :-
  733	instantiated(H, +(_)),
  734	all_bound(T).

carth_complexity(+Bags, +State, +Size0, -Size, +Time0, +TimeSum0, -TimeSum) is det: Compute the time and space efficiency of the carthesian product. the total cost in time is the sum of the costs of all branches, The search space at the end is still the same product.

  744carth_complexity([], _, Sz, Sz, _, C, C).
  745carth_complexity([bag(_,G)|T], State,
  746		 Sz0, Sz,
  747		 C0, Csum0, Csumz) :-
  748	complexity(G, G, State, Sz0, Sz1, C0, C1),
  749	Csum1 is Csum0 + C1,
  750	carth_complexity(T, State, Sz1, Sz, C0, Csum1, Csumz).

independent_complexity(+GoalsIn, -Goals, +State, +Size0, -Size, +Time0, +TimeSum0, -TimeSum) is det

Compute the complexity of an independent conjunction.

Arguments:

Goals - is a list of g(Goal, Branch, Cost), where Branch is the number of solutions expected fromGoal and Cost is the estimated CPU time.

  763independent_complexity(GoalsIn, Goal, State,
  764		       Size0, Size,
  765		       Time0, TimeSum0, TimeSum) :-
  766	indep_complexity(GoalsIn, Goals1, State,
  767			 Size0, Size,
  768			 Time0, TimeSum0, TimeSum),
  769	keysort(Goals1, ByCost),
  770	pairs_values(ByCost, Goals2),
  771	simplify_carthesian(Goals2, Goal).
  772
  773indep_complexity([], [], _, Sz, Sz, _, C, C).
  774indep_complexity([G0|GT0], [SzG-bag(Vars,G,SzG,CG)|GT], State,
  775		       Sz0, Sz,
  776		       C0, Csum0, Csumz) :-
  777	complexity(G0, G, State, Sz0, Sz1, C0, C1),
  778	term_variables(G, VList),
  779	Vars =.. [v|VList],
  780	(   Sz0 =:= 0
  781	->  SzG = 0,
  782	    CG = 0
  783	;   SzG is Sz1/Sz0,
  784	    CG is (C1-C0)/Sz0
  785	),
  786	Csum1 is Csum0 + C1,
  787	indep_complexity(GT0, GT, State, Sz1, Sz, C0, Csum1, Csumz).

simplify_carthesian(+Bags, -Goal) is det: Peer of the leading little branching goals from the carthesian handler. If there is a bag of one left, turn it into a normal goal.

  795simplify_carthesian([], true).
  796simplify_carthesian([bag(_,Goal,_Branch,_Cost)], Goal) :- !.
  797simplify_carthesian([bag(_,Goal,Branch,_Cost)|Bags], Final) :-
  798	(   Branch < 1.5
  799	->  !, Final = (Goal, Final1),
  800	    simplify_carthesian(Bags, Final1)
  801	;   Bags == []
  802	->  !, Final = Goal
  803	).
  804simplify_carthesian(Bags, rdfql_carthesian(Bags)).

complexity0(+SI, +PI, +OI, +P, +G, -Setup, -PerAlt, -Branch)

SI,PI,OI describe the instantiation pattern. P is the predicate and G is the actual goal. Complexity is unified to an estimate of the size of the search space and therefore an estimate of the execution time required to prove the goal.

Literal `like' matches come out as +(like(Pattern)). We must estimate the percentage of the literals that match this pattern. Suppose the factor is 1,000. This means the branching is reduced by 1,000, but finding each solution is slow as it requires a linear scan. It is faster than going all the way back to Prolog backtracking however, so we estimate a factor 10 (TBD: verify that number).

ISSUES: rdf_has/3 vs rdf_reachable/3.

  824complexity0(+(_),+(_),+(_), _, _, 1, 0, 1) :- !.
  825complexity0(+(b),+(+),-, P, G, 1, 1, B) :- !,
  826	subj_branch_factor(G, B, Prop),
  827	rdf_predicate_property(P, Prop).
  828complexity0(-,+(+),+(b), P, G, 1, 1, B) :- !,
  829	obj_branch_factor(G, B, Prop),
  830	rdf_predicate_property(P, Prop).
  831complexity0(+(b), -, -, _, _, 1, 1, B) :- !,
  832	rdf_statistics(triples(Total)),
  833	subject_count(Subjs),
  834	(   Total == 0
  835	->  B = 0
  836	;   B is Total/Subjs
  837	).
  838complexity0(_,_,+(like(Pat)),_, G, Factor, Factor, B) :- !,
  839	rdf_estimate_complexity(G, B0),
  840	pattern_filter(Pat, Factor0),
  841	Factor is max(1, min(B0, Factor0)/10),
  842	B is B0/Factor.
  843complexity0(_,_,_, _, G, 1, 1, B) :-
  844	rdf_estimate_complexity(G, B).
  845
  846:- if(rdf_statistics(subjects(_))).  847subject_count(Count) :-				% RDF-DB 2.x
  848	rdf_statistics(subjects(Count)).
  849:- else.  850subject_count(Count) :-				% RDF-DB 3.x
  851	rdf_statistics(resources(Count)).
  852:- endif.  853
  854:- multifile
  855        subj_branch_factor/3,
  856        obj_branch_factor/3.  857
  858subj_branch_factor(rdf(_,_,_),           X, rdf_subject_branch_factor(X)).
  859subj_branch_factor(rdf_has(_,_,_),       X, rdfs_subject_branch_factor(X)).
  860subj_branch_factor(rdf_reachable(_,_,_), X, rdfs_subject_branch_factor(X)).
  861
  862obj_branch_factor(rdf(_,_,_),            X, rdf_object_branch_factor(X)).
  863obj_branch_factor(rdf_has(_,_,_),        X, rdfs_object_branch_factor(X)).
  864obj_branch_factor(rdf_reachable(_,_,_),  X, rdfs_object_branch_factor(X)).

rdf_db_goal(+Goal, -Subject, -Predicate, -Object)

True if Goal is a pure (logical) predicate on the RDF database involving the given Subject, Predicate and Object. Defined multifile, allowing the optimiser to understand user-defined rdf/3 like predicates.

To be done: - Allow specifying different costs and branching factors

  876:- multifile
  877	rdf_db_goal/4.  878
  879rdf_db_goal(rdf(S,P,O),			S,P,O).
  880rdf_db_goal(rdf_has(S,P,O),		S,P,O).
  881rdf_db_goal(rdf_reachable(S,P,O),	S,P,O).
  882rdf_db_goal(rdf(S,P,O, _DB),		S,P,O). % TBD: less hits

pattern_filter(+Like, -Factor): Estimate the efficiency of a pattern. This is a bit hard, as characters are not independent.

  889pattern_filter(Like, Factor) :-
  890	atom_codes(Like, Codes),
  891	pattern_factor(Codes, 1, Factor).
  892
  893pattern_factor([], F, F).
  894pattern_factor([0'*|T], F0, F) :- !,
  895	pattern_factor(T, F0, F).
  896pattern_factor([_|T], F0, F) :-
  897	F1 is F0*10,
  898	pattern_factor(T, F1, F).

rdf_estimate_complexity(+Goal, -Complexity): Estimate the branching factor introduced by Goal. This uses the rdf_db statistics of the hash chains which are based on exploiting the RDFS subPropertyOf reasoning.
In addition, rdf_reachable/3 introduces its own complexity which must be estimate using the branching factor of the relation.

  910rdf_estimate_complexity(G, C) :-
  911	rdf_db_goal(G, S, P0, O),
  912	map_predicate(P0, P),
  913	rdf_estimate_complexity(S, P, O, C).
  914
  915map(map_predicate(_,_)).
  916map(map_predicate(_,_):-_).
  917
  918term_expansion(In, Out) :-
  919	map(In), !,
  920	rdf_global_term(In, Out).
  921
  922map_predicate(X, X) :-
  923	var(X), !.
  924map_predicate(serql:directSubClassOf,    rdfs:subClassOf) :- !.
  925map_predicate(serql:directType,          rdf:type) :- !.
  926map_predicate(serql:directSubPropertyOf, rdfs:subPropertyOf) :- !.
  927map_predicate(X, X).
  928
  929
  930		 /*******************************
  931		 *     INSTANTIATE OPTIONAL	*
  932		 *******************************/
  933
  934/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  935In  SELECT  queries,  optional  parts  of   the  path  expression  leave
  936uninstantiated variables. These must be bound to  '$null$' to be able to
  937do correct merging for  DISTINCT.  The  naive   way  to  do  this  is to
  938instantiate all variables at the end  of   the  query.  On large selects
  939(i.e. involving many variables) this appears to be quite costly.  Doing
  940the job early, as in
  941
  942	(   Goal
  943	*-> true
  944	;   bind_null(VarsInGoal)
  945	)
  946
  947is not correct as well, as VarsInGoal may  be involved in other parts of
  948the code either before or after the optional path expression. So we need
  949to:
  950
  951	* Do abtract execution and find the bindings done before arriving
  952	  at Goal.
  953	* continue the execution, and watch for new bindings to these
  954	  variables.  If we find a binding for the second time, remove
  955	  it from the first and make a conditional binding for it.
  956
  957If we bind an argument unconditionally, we place an attribute 'b'. If it
  958is conditionally bound, we place an attribute   c(Set), where Set is the
  959set in which it was bound or plain  'c' if it was conditionally bound in
  960multiple places.
  961
  962TBD:	disjunctions and other control structures.
  963	queries that do not bind (such as SPARQL bound(X))
  964- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
  965
  966serql_select_bind_null(Goal0, Goal) :-
  967	State = bindings([]),
  968	select_bind_null(Goal0, Goal1, State),
  969	arg(1, State, Bindings),
  970	c_unbind(Bindings, Left),
  971	(   Left == []
  972	->  Goal = Goal1
  973	;   Goal = (Goal1, rdfql_cond_bind_null(Left))
  974	).
  975
  976c_unbind([], []).
  977c_unbind([H|T0], L) :-
  978	(   get_attr(H, instantiated, c)
  979	->  L = [H|T]
  980	;   L = T
  981	),
  982	del_attr(H, instantiated),
  983	c_unbind(T0, T).
  984
  985
  986select_bind_null((A0,B0), (A,B), State) :- !,
  987	select_bind_null(A0, A, State),
  988	select_bind_null(B0, B, State).
  989select_bind_null((G0 *-> true; true),
  990		 ( G *-> true; Bind),
  991		 State) :- !,
  992	arg(1, State, B0),
  993	select_bind_null(G0, G, State),
  994	arg(1, State, B),
  995	Bind = rdfql_bind_null(Vars),
  996	c_bindings(B, B0, c(Bind), Vars).
  997select_bind_null(rdfql_carthesian(List0),
  998		 rdfql_carthesian(List), State) :- !,
  999	select_carth_bind_null(List0, List, State).
 1000select_bind_null(sparql_group(A0), sparql_group(A), State) :- !,
 1001	select_bind_null(A0, A, State).
 1002select_bind_null(sparql_group(A0,Outer,Inner),
 1003		 sparql_group(A, Outer,Inner),
 1004		 State) :- !,
 1005	select_bind_null(A0, A, State),
 1006	term_variables(Outer, Vars),
 1007	c_bind(Vars, State).
 1008select_bind_null(Goal, Goal, State) :-
 1009	term_variables(Goal, Vars),
 1010	c_bind(Vars, State).

c_bindings(+AtEnd, +AtStart, +CVars, -Vars)

The variables of the difference-list AtEnd..AtStart are conditionally bound. Tag each such variable with CVars.

Arguments:

CVars - Term c(Vars), where Vars are the other variables that have the same conditional binding.

 1020c_bindings(B, B0, _, Vars)  :-
 1021	B == B0, !,
 1022	Vars = [].
 1023c_bindings([H|T0], B0, Attr, [H|Vars]) :-
 1024	get_attr(H, instantiated, I),
 1025	is_instantiated(I), !,
 1026	put_attr(H, instantiated, Attr),
 1027	c_bindings(T0, B0, Attr, Vars).
 1028c_bindings([_|T0], B0, Attr, Vars) :-
 1029	c_bindings(T0, B0, Attr, Vars).
 1030
 1031
 1032is_instantiated(b).		% unconditionally bound
 1033is_instantiated(c(_)).		% bound either by call or rdfql_bind_null/1

c_bind(+Vars, +State): Do unconditional binding of Vars. Var may be in a rdfql_bind_null/1 set. In that case, delete it from the set, and set the class to 'c' to make a conditional binding at the end.

 1042c_bind([], _).
 1043c_bind([H|T], State) :-
 1044	(   get_attr(H, instantiated, I)
 1045	->  (   I == b			% already unconditionally bound
 1046	    ->	true
 1047	    ;	I = c(Set)
 1048	    ->	arg(1, Set, Vars0),
 1049		del_var(H, Vars0, Vars),
 1050		setarg(1, Set, Vars),
 1051		put_attr(H, instantiated, c)
 1052	    ;	I == c
 1053	    ->	true
 1054	    )
 1055	;   put_attr(H, instantiated, b),
 1056	    arg(1, State, B0),
 1057	    setarg(1, State, [H|B0])
 1058	),
 1059	c_bind(T, State).
 1060
 1061del_var(H, [X|T0], T) :-
 1062	(   H == X
 1063	->  T = T0
 1064	;   T = [X|T1],
 1065	    del_var(H, T0, T1)
 1066	).
 1067
 1068select_carth_bind_null([], [], _).
 1069select_carth_bind_null([bag(Vars, G0)|T0], [bag(Vars, G)|T], State) :-
 1070	select_bind_null(G0, G, State),
 1071	select_carth_bind_null(T0, T, State).
 1072
 1073
 1074		 /*******************************
 1075		 *	  DEBUG SUPPORT		*
 1076		 *******************************/
 1077
 1078dbg_portray_body(G) :-
 1079	debugging(rdf_optimise), !,
 1080	portray_body(G).
 1081dbg_portray_body(_).
 1082
 1083portray_body(G) :-
 1084	(   pp_instantiate_term(G),
 1085	    debug(_, '~@',
 1086		  [ portray_clause(current_output, (<> :- G), [module(sparql_runtime)])
 1087		  ]),
 1088	    fail
 1089	;   true
 1090	).
 1091
 1092pp_instantiate_term(Term) :-
 1093	compound(Term), !,
 1094	functor(Term, _, Arity),
 1095	pp_instantiate_args(Arity, Term).
 1096pp_instantiate_term(Term) :-
 1097	var(Term),
 1098	get_attr(Term, instantiated, H), !,
 1099	del_attr(Term, instantiated),
 1100	Term = +(H).
 1101pp_instantiate_term(_).
 1102
 1103pp_instantiate_args(0, _) :- !.
 1104pp_instantiate_args(N, Term) :-
 1105	arg(N, Term, A),
 1106	pp_instantiate_term(A),
 1107	N2 is N - 1,
 1108	pp_instantiate_args(N2, Term).
 1109
 1110
 1111		 /*******************************
 1112		 *	      SANDBOX		*
 1113		 *******************************/
 1114
 1115:- multifile sandbox:safe_meta_predicate/1. 1116
 1117sandbox:safe_meta_predicate(rdf_optimise:rdf_optimise/1)