/*****************************************************************************

REGI -- A PROLOG INTERPRETER FOR EXTENDED REGULAR EXPRESSIONS
=============================================================

Version: 0.7
Author: Torbjörn Lager
Date: 980909

REGI (pronounced 'reggae') is an attempt to build an INTERPRETER for extended regular expressions in Prolog. That is, expressions are NOT compiled into finite-state machines, but rather interpreted directly in Prolog. Here's why I think it's worthwhile:

o	While it is certainly true that interpretation can become very
	inefficient, it is sometimes nice to work with an interpreter 
	rather than a compiler, since for small 'toy-examples' -- for 
	'explorative' regular expressions programming -- the interpreter 
	is (usually) fast enough, and there's no time to waste on compilation.

o	And anyway, perhaps 'partial evaluation' with respect to a particular
	goal will allow us to obtain a significant increase of efficiency? 
	(Has anybody tried this?)

o	Since the (kernel) of the program is completely declarative, new 
	insights into a calculus of regular expressions might be gained just 
	by carefully studying it.

o	On the top of an interpreter it is might be possible to build a 
	'development environment' for regular expressions, exploiting standard
	logic programming techniques for generating traces, proofs, and so on.

The set of REGI operators is a subset of the operators over regular languages and relations defined by Karttunen et al., in the context of the the Xerox Finite-State Tools (XFST). Notation differs slightly from the XFST notation, however (it is just that Prolog operators are less flexible than one would have liked...). For more information about XFST, see http://www.xrce.xerox.com/research/mltt/fst/ .

REGI supports the following operators:

Basic operators over regular languages:

A + B                   Concatenation   
A | B (or A ; B)        Union           
A & B                   Intersection    
A*                      Kleene star     
A - B                   Difference      

Extended operators over regular languages:

A+                      Kleene plus     
~A                      Complement      
A\B                     Ignore
opt(A)                  Optionality     
$A                      Containment     
A => B .. C             Restriction     

Basic operators over regular relations:

A x B                   Crossproduct
A o B                   Composition

Extended operators over regular relations:

A -> B                  Replacement
A <- B                  Inverse replacement
A1 -> B1, A2 -> B2      Parallel replacement
A -> B .. C             Markup
A -> B # C .. D         Conditional replacement

Also, there is a special operator for debugging:

trace(A)				Writes A and the pair of strings 
                        denoted by A to stdout.

Brackets, (), are used for grouping expressions.

Note the following simple expressions:

[]             The empty-string language
?              The any symbol


REGI is written in SICStus Prolog, and is probably not very portable.
Towards the end of this file, you'll find a couple of examples.

Enjoy!

******************************************************************************/

:- use_module(library(lists)).


/* Operators
*/

:- op( 1200, xfx, iff).
:- op( 1050, xfy,->).
:- op( 1100, xfy,;).


/* Regex operators
*/
:- op( 700,  fy, [ trace ]).

:- op( 350,  fy, [ ~, $]).
:- op( 400,  xf, [ *, + ]).
:- op( 450, xfy, [ + ]).
:- op( 500, xfy, [ - , & ]).       % | is declared further down
:- op( 500, xfx, ['..']).
:- op( 600,xfx,#).
:- op( 550, xfy, [ => , <- ]).     % -> is declared further down
:- op( 600, xfy, [ x, o ]).


/* Settings
*/

string_length_limit(100).
recursive_limit(100).


/* regex/3

regex(+Regex,?Upper,?Lower) is true when the regular expression Regex denotes the pair of strings Upper and Lower. Note that when regex/3 is called with Upper and/or Lower uninstantiated, it/they will (on backtracking) be instantiated to strings of increasing length (but no longer than a predefined maximum length).

The alphabet (Sigma) is assumed to be the set of non-operator symbols occurring in Expr, Upper or Lower.

*/

regex(Expr,Upper,Lower) :-
	extract_symbols(Expr,[],Symbols),
	append_if_nonvar(Symbols,Upper,Symbols1),
	append_if_nonvar(Symbols1,Lower,Symbols2),
	sort(['>','<'|Symbols2],Sigma),
	format("Sigma: ~p \n",[Sigma]),
	string_length_limit(MaxLength),
	between(0,MaxLength,Length),
	length(Upper,Length),
	recursive_limit(Lim),
%	between(1,Lim,Lim1),
	regex(Expr,Sigma,Upper,Lower,Lim).


append_if_nonvar(L1,L2,L3) :-
	(	nonvar(L2) ->
		append(L1,L2,L3)
	;	L3 = L1
	).
	

/* extract_symbols/3
Extracts all non-operator (ground) symbols from a regular expression.
*/

extract_symbols(Expr,Sigma0,Sigma) :-
	(	memberchk(Expr,[(A + B),(A | B),(A & B),
	          (A - B),(A x B),(A o B),(A -> B),
			  (A <- B),(A,B),(A => B),(A .. B),(A/B),(A:B)]) ->
		extract_symbols(A,Sigma0,Sigma1),
		extract_symbols(B,Sigma1,Sigma)
	;	memberchk(Expr,[((A -> B) # (C .. D))]) ->
		extract_symbols(A,Sigma0,Sigma1),
		extract_symbols(B,Sigma1,Sigma2),
		extract_symbols(C,Sigma2,Sigma3),
		extract_symbols(D,Sigma3,Sigma)
	;	memberchk(Expr,[A*,A+,$A,~A,trace(A),opt(A)]) ->
		extract_symbols(A,Sigma0,Sigma)
	;	memberchk(Expr,[[],?]) ->
		Sigma = Sigma0
	;	ground(Expr) ->
		Sigma = [Expr|Sigma0]
	;	Sigma = Sigma0
	).


/* decrement_limit/2
*/

decrement_limit(Limit0,Limit) :-
	(	Limit0 > 0 ->
		Limit is Limit0-1
	;	%write('Recursive limit exceeded. '),
		fail
	).


/* More regex operators
WARNING! Do not use Prolog if-then-else or disjunction below this point!
*/

:- op( 500, yfx, ; ).
:- op( 550, yfx, [ -> ]).



/* regex/5
This is the regular expressions interpreter proper.
*/

regex(Expr,Sigma,Upper,Lower,Lim) :-
	regex(Expr,Sigma,Upper,[],Lower,[],Lim).


/* regex/7
Definitions of the basic operators.
*/

regex((A + B),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(A,Sigma,Upper0,Upper1,Lower0,Lower1,D),
	regex(B,Sigma,Upper1,Upper,Lower1,Lower,D).
regex((A | _B),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(A,Sigma,Upper0,Upper,Lower0,Lower,D).
regex((_A | B),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(B,Sigma,Upper0,Upper,Lower0,Lower,D).
regex((A & B),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(A,Sigma,Upper0,Upper,Lower0,Lower,D),
	regex(B,Sigma,Upper0,Upper,Lower0,Lower,D).
regex(_A*,_Sigma,Upper,Upper,Lower,Lower,_D).
regex(A*,Sigma,Upper0,Upper,Lower0,Lower,D) :-
	decrement_limit(D,D1),
	regex(A,Sigma,Upper0,Upper1,Lower0,Lower1,D),
	regex(A*,Sigma,Upper1,Upper,Lower1,Lower,D1).
regex((A - B),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(A,Sigma,Upper0,Upper,Lower0,Lower,D),
	\+ regex(B,Sigma,Upper0,Upper,Lower0,Lower,D).
regex((A x B),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(A,Sigma,Upper0,Upper,Upper0,Upper,D),
	regex(B,Sigma,Lower0,Lower,Lower0,Lower,D).
regex((A o B),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(A,Sigma,Upper0,Upper,Middle0,[],D),
	regex(B,Sigma,Middle0,[],Lower0,Lower,D).
regex(Expr,Sigma,Upper0,Upper,Lower0,Lower,D) :-
	(Expr iff Expr1),
	regex(Expr1,Sigma,Upper0,Upper,Lower0,Lower,D).
regex(trace(Expr),Sigma,Upper0,Upper,Lower0,Lower,D) :-
	regex(Expr,Sigma,Upper0,Upper,Lower0,Lower,D),
	copy_term(Upper0-Upper,NewUpper0-NewUpper),	
	copy_term(Lower0-Lower,NewLower0-NewLower),
	append(UU,NewUpper,NewUpper0),
	append(LL,NewLower,NewLower0),
	format("Regex: ~p \nUpper: ~p \nLower: ~p\n\n",[Expr,UU,LL]).
regex([],_Sigma,Upper,Upper,Lower,Lower,_D).
regex(?,Sigma,Upper0,Upper,Lower0,Lower,_D) :-
	member(Symbol,Sigma),
	Upper0 = [Symbol|Upper],
	Lower0 = [Symbol|Lower].	
regex(A:B,Sigma,Upper0,Upper,Lower0,Lower,_D) :-
	member(A,Sigma),
	member(B,Sigma),
	Upper0 = [A|Upper],
	Lower0 = [B|Lower].	
regex(Expr,Sigma,Upper0,Upper,Lower0,Lower,_D) :-
	member(Expr,Sigma),
	Upper0 = [Expr|Upper],
	Lower0 = [Expr|Lower].



/* iff/2
Definitions of extensions to the basic operators. Note that also 'common' operators such as Kleene plus, optionality and complement are regarded as extensions to the set of basic operators.
*/

% Kleene plus
A+ iff A + A* .

% Optionality
opt(A) iff A |[].

% Complement
~A iff (?)* - A.

% Containment (Karttunen et al. 1997)
$A iff (?)* + A + (?)* .

% Restriction (Karttunen et al. 1997)
A => B .. C iff
	~((~((?)* + B) + (A + (?)*) | (?)* + (A + ~(C + (?)*)))).

% Replacement (Karttunen et al. 1997)
A -> B iff
	(~($(A-[])) + (A x B))* + ~($(A-[])).

% Inverse replacement (Karttunen et al. 1997)
A <- B iff
	(~($(B-[])) + (A x B))* + ~($(B-[])).

% Parallel replacement (Karttunen 199?)
(A1 -> B1, A2 -> B2) iff
	(~($((A1|A2)-[])) + ((A1 x B1) | (A2 x B2)))* + ~($((A1|A2)-[])).

% Mark-up (Karttunen et al. 1997)
A -> B .. C iff 
	(~($(A-[])) + ([] x B) + A + ([] x C))* + ~($(A-[])).

% Conditional replacement 
/* Unfortunately, the definition in ( Karttunen 1995) cannot be used (in practice) here, since it takes ages! The definition below may not be correct (if anyone comes up with a counterexample, I would like to see it..).
*/

A -> B # C .. D iff
	C -> [] .. < 
		o
	D  -> > .. [] 
		o
	( < + >) -> (> + <) 
		o
	( < + A/(>;<) + >) -> B 
		o
	(>;<) -> [] .
		

% Ignore
(A/B iff C) :- ignore(A,B,C).


ignore(A,B,B* + C) :-
	ignore1(A,B,C).
	
ignore1((A + B),Ign,(A1 + B1)) :- !,
	ignore1(B,Ign,B1),
	ignore1(A,Ign,A1).
ignore1((A ; B),Ign,(A1 ; B1)) :- !,
	ignore1(B,Ign,B1),
	ignore1(A,Ign,A1).
ignore1((A - B),Ign,(A1 - B1)) :- !,
	ignore1(B,Ign,B1),
	ignore1(A,Ign,A1).
ignore1((A & B),Ign,(A1 & B1)) :- !,
	ignore1(B,Ign,B1),
	ignore1(A,Ign,A1).
ignore1(~A,Ign,~A1) :- !,
	ignore1(A,Ign,A1).
ignore1($A,Ign,$A1) :- !,
	ignore1(A,Ign,A1).
ignore1(A*,Ign,(A1)*) :- !,
	ignore1(A,Ign,A1).
ignore1(A+,Ign,(A1)+) :- !,
	ignore1(A,Ign,A1).
ignore1(opt(A),Ign,opt(A1)) :- !,
	ignore1(A,Ign,A1).
ignore1([],_Ign,[]) :- !.
ignore1(A,Ign,A + Ign*).



/* between/3
between(+Lower,+Upper,?Number) is true when Lower =< Number =< Upper. Numbers are generated in ascending order.
*/

between(L, U, L) :- 
	L =< U.
between(L, U, N) :- 
	L < U,
	M is L+1,
	between(M, U, N).



/* Examples
*/

test1(L) :-
	regex(a+b;b;b+a;a+b+a -> x,[a,b,a],L).

test2(L) :-
	regex(a -> '[' .. ']',[a,p,a],L).

test3(L) :-
	regex(a+b -> x # a+b .. a,[a,b,a,b,a,b,a],L).

% Translation into 'bandits' language

translate(L1,L2) :-
	regex((
	( b -> b + o + b ) o 
	( c -> c + o + c ) o 
	( d -> d + o + d ) o 
	( f -> f + o + f ) o 
	( g -> g + o + g ) o 
	( h -> h + o + h ) o 
	( j -> j + o + j ) o 
	( k -> k + o + k ) o 
	( l -> l + o + l ) o 
	( m -> m + o + m ) o 
	( n -> n + o + n ) o 
	( p -> p + o + p ) o 
	( q -> q + o + q ) o 
	( r -> r + o + r ) o 
	( s -> s + o + s ) o 
	( t -> t + o + t ) o 
	( v -> v + o + v ) o 
	( x -> x + o + x ) o 
	( z -> z + o + z ) ),L1,L2).


% Finite-State Intersection Grammar (Koskenniemi et al. 199?)

fsig(L) :-
	regex(((what^det|what^pron)+(question^n|question^v) & 
		~(_^det + _^v) &
		$(_^v)),L,L).



