/*   CONTAINSWORDS - perform a comparison of two series of words, 'searchlist' and 'testlist'.
 *	Searchlist is the word(s) to search for, and testlist is the word(s) being tested.
 *	Useful in search engine applications, since it uses a syntax similar in functionality 
 *	to the way google, ebay, etc. seem to work.
 *
 * 	The quality of the match is represented by an integer score value 0 - 20, which is
 *	returned.  A score of 20 indicates no match.  The scoring takes into account number
 *	of search words present and word order and position (specifically, the number of 
 *	non-adjacencies found).
 *
 *	All matching is case insensitive.  Words are parsed on any combination of whitespace or 
 *	punctuation characters.  Search words can include "quoted terms" which are
 *	considered one term, and must match exactly (except for case).  Otherwise, each
 *	word is considered one term.  Individual words of any length can match exactly.
 *	Search words over two characters long can also match as a prefix of a test word, eg.
 *	'case' would match 'case' and 'case' would also match 'casey' (with a slightly
 *	worse score).
 *
 *   
 *      Scoring:
 *      0	 all terms present and adjacent in the order given
 *      1,2,3    all terms present, but 1, 2, or 3+ non-adjacencies
 *      5,6,7    all but 1 term present, with 0, 1, or 2+ non-adjacencies
 *      9,10,11	 all but 2 terms present, with 0, 1, or 2+ non-adjacencies
 *      13	 all but 3 terms present
 *      15	 at least one term present
 *      20	 no terms present
 *
 *              * if any non-exact word matches were found (outside of quoted terms), score 
 *		  is increased by one point.
*/	



#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <strings.h>	/* strncasecmp */

#define MAXWORDS 20 	/* at most this many words are taken from each list; further words are ignored */
#define MINSTARSIZE 3	/* search words shorter than this must get an exact match */
#define strnicmp(a,b,c)  strncasecmp(a,b,c)

#define CW_NOMATCH 20	/* score returned when no search term is found */


/* cw_issep - return nonzero if character c separates words (whitespace or punctuation). */
static int
cw_issep( int c )
{
/* convert via unsigned char: passing a negative char value to the ctype functions is undefined */
c = (unsigned char) c;
return( isspace( c ) || ispunct( c ) );
}


/* cw_worddiff - compare one search word against one test word, ignoring case.
 *	Returns 0 on a match, nonzero otherwise.
 *	A search word shorter than MINSTARSIZE only matches a test word of the same length.
 *	Otherwise only the first swlen characters are compared, so the search word may
 *	match as a prefix of a longer test word.
 */
static int
cw_worddiff( const char *sw, int swlen, const char *tw, int twlen )
{
if( swlen < MINSTARSIZE && swlen != twlen ) return( 1 );
return( strnicmp( sw, tw, (size_t) swlen ) );
}


/* GL_containswords - score how well the words in testlist match the search terms in searchlist.
 *
 *	searchlist	search words; a "quoted phrase" counts as a single term
 *	testlist	the text being tested
 *
 *	Returns 0 (best) through 20 (no term found); see the scoring table at the top of this
 *	file.  A NULL argument, or a searchlist containing no words, yields 20.
 *	Neither string is modified.  Only the first MAXWORDS words of each list are examined.
 *
 *	Kept as an old-style (non-prototype) definition so that existing callers are unaffected.
 */
int
GL_containswords( searchlist, testlist )
char *testlist;	/* sample being tested */
char *searchlist;   	/* search words */
{
int i, j, k, t;

int nsw;			/* number of search words, regardless of quoting */
char *searchword[MAXWORDS];	/* start of each search word within searchlist */
int searchwordlen[MAXWORDS];
int inqterm[MAXWORDS];		/* 0 if word is not quoted, else id (1..) of its quoted term */
int qorder[MAXWORDS];		/* position of word within its quoted term, counting up from 0 */
int revqorder[MAXWORDS];	/* number of words still following it in its quoted term; -1 if not quoted */

int ntw;			/* number of test words */
char *testword[MAXWORDS];	/* start of each test word within testlist */
int testwordlen[MAXWORDS];

int inword, laststart;
int qflag, qcount, qw, prevq;  /* qflag is 1 if within quotes; qcount uniquely ids quoted terms; qw notes word order with quoted term */

int nterms, termmatch[MAXWORDS], nmatchingterms;
int nadjacent, nmissing, ngaps, score, nonexact;


if( searchlist == NULL || testlist == NULL ) return( CW_NOMATCH );


/* scan search words string and mark beginnings of words, and keep track of lengths of words.. */
nsw = 0;
qflag = qcount = qw = 0;
inword = 0; laststart = 0;
for( i = 0; searchlist[i] != '\0'; i++ ) {
	if( cw_issep( searchlist[i] ) ) {
		if( inword ) { 		/* end of a word */
			searchwordlen[ nsw-1 ] = i - laststart;
			inqterm[ nsw-1 ] = qflag ? qcount : 0;
			qorder[ nsw-1 ] = qflag ? qw++ : 0;
			inword = 0;
			}
		}
	else if( ! inword ) {		/* beginning of a word */
		if( nsw >= MAXWORDS ) break;	/* table full; ignore the rest */
		searchword[ nsw++ ] = &searchlist[i];
		laststart = i;
		inword = 1;
		}

	/* a quote char is itself a separator, so any word before it has already been closed
	 * above using the quote state that was in effect while that word was scanned */
	if( searchlist[i] == '"' ) {
		qflag = !qflag;
		if( qflag ) { qcount++; qw = 0; }
		}
	}
if( inword ) {				/* last word runs to end of string */
	searchwordlen[ nsw-1 ] = i - laststart;
	inqterm[ nsw-1 ] = qflag ? qcount : 0;
	qorder[ nsw-1 ] = qflag ? qw++ : 0;
	}


/* scan the test words string and mark beginnings of words, and keep track of lengths of words.. */
ntw = 0;
inword = 0; laststart = 0;
for( i = 0; testlist[i] != '\0'; i++ ) {
	if( cw_issep( testlist[i] ) ) {
		if( inword ) { 		/* end of a word */
			testwordlen[ ntw-1 ] = i - laststart;
			inword = 0;
			}
		}
	else if( ! inword ) {		/* beginning of a word */
		if( ntw >= MAXWORDS ) break;	/* table full; ignore the rest */
		testword[ ntw++ ] = &testlist[i];
		laststart = i;
		inword = 1;
		}
	}
if( inword ) testwordlen[ ntw-1 ] = i - laststart;


/* find reverse qorder so we have a 'countdown'..
 * prevq starts at 0, which is never a quoted term id, so the first quoted word met
 * (scanning backwards) restarts the count; this also avoids touching inqterm[] when nsw is 0 */
prevq = 0;
for( i = nsw-1, j = 0; i >= 0; i-- ) {
	if( !inqterm[i] ) revqorder[i] = -1;
	else	{
		if( inqterm[i] != prevq ) { j = 0; prevq = inqterm[i]; }
		revqorder[i] = j++;
		}
	}


/* run the word comparison.. */
nterms = 0;
nadjacent = 1;
nonexact = 0;

for( i = 0; i < nsw; i++ ) { 			/* loop across search words  */

	/* non-leading words of a quoted term are handled along with the leading word */
	if( inqterm[i] && qorder[i] != 0 ) continue;

	t = nterms++;
	termmatch[ t ] = 0;

	for( j = 0; j < ntw; j++ ) {		/* loop across test words */

		if( cw_worddiff( searchword[i], searchwordlen[i], testword[j], testwordlen[j] ) != 0 ) continue;

		if( inqterm[i] ) {
			/* leading word of a quoted term: every word of the term must match the
			 * corresponding consecutive test word exactly (same length, ignoring case) */
			for( k = 0; k <= revqorder[i]; k++ ) {
				if( j+k >= ntw ) break;  				/* premature end of testlist */
				if( searchwordlen[i+k] != testwordlen[j+k] ) break;	/* non-equal lengths */
				if( strnicmp( searchword[i+k], testword[j+k], (size_t) searchwordlen[i+k] ) != 0 ) break;
				}
			if( k <= revqorder[i] ) continue; 	/* term not matched here; try next test word */

			termmatch[ t ] = 1;

			/* k words were consumed from both lists, so the next search word is i+k
			 * and the next test word is j+k.. see if they match too */
			i += k;
			if( i < nsw && j+k < ntw &&
			    cw_worddiff( searchword[i], searchwordlen[i], testword[j+k], testwordlen[j+k] ) == 0 ) nadjacent++;
			i--;	/* outer loop's i++ then lands on the word following the quoted term */
			break;
			}

		/* ordinary (unquoted) search word */
		termmatch[ t ] = 1;
		if( searchwordlen[i] != testwordlen[j] ) nonexact++;

		/* see if we have adjacent words.. */
		if( i < nsw-1 && j < ntw-1 &&
		    cw_worddiff( searchword[i+1], searchwordlen[i+1], testword[j+1], testwordlen[j+1] ) == 0 ) {
			nadjacent++;
			break;
			}
		}
	}


/* compute the score.. */
nmatchingterms = 0;
for( i = 0; i < nterms; i++ ) if( termmatch[i] ) nmatchingterms++;

if( nadjacent > nmatchingterms ) nadjacent = nmatchingterms;

nmissing = nterms - nmatchingterms;	/* terms not found at all */
ngaps = nmatchingterms - nadjacent;	/* non-adjacencies among the terms that were found */

if( nmatchingterms == 0 ) score = CW_NOMATCH;
else if( nmissing == 0 ) score = ( ngaps > 3 ) ? 3 : ngaps;
else if( nmissing == 1 ) score = ( ngaps > 2 ) ? 7 : 5 + ngaps;
else if( nmissing == 2 ) score = ( ngaps > 2 ) ? 11 : 9 + ngaps;
else if( nmissing == 3 ) score = 13;
else score = 15;

if( score < CW_NOMATCH && nonexact ) score++;  /* add 1 if any nonexact found */

return( score );
}



#ifdef TESTING
/* Test driver.  Takes exactly two arguments, a search list and a test list, and prints
 * the score returned by GL_containswords().  Exits with status 1 on wrong usage.
 */
int
main( argc, argv )
int argc;
char **argv;
{
int stat;

if( argc != 3 ) return( 1 );

/* the arguments are passed straight through rather than copied into fixed-size
 * buffers, so their length is not limited */
stat = GL_containswords( argv[1], argv[2] );
printf( "stat = %d\n", stat );
return( 0 );
}
#endif
