/* BUILDIX - Build ISAM index(es).  Invoked by maintain(1) or may be invoked manually.

   Usage: buildix tablename fieldname level ixtype [-a actuallevel] [-t sortmethod] [-n altname]

   level: either 2 or 3.  Level 3 is used for larger files and involves an additional tier.

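   ixtype: one of 'standard', 'direct', 'word', 'combined,fieldname2,..,fieldnameN' or
	'combinedword,fieldname2,..,fieldnameN'.  Anything else is treated as 'standard'.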

	'direct' indicates that there is no level 1 index, rather, the data file itself is 
	already physically ordered on the field, and level 2 entries point into the data file.


   Options:

	-a actuallevel	Actual mode, indicates that buildix was invoked recursively to build
			index level 'actuallevel'.  -a is never on the command line when
			invoked manually.

	-t sortmethod	Allows other sort methods to be specified.  Currently the only
			sortmethod implemented is "num".  Default sort method is alphanumeric.

	-n altname	This option allows database table reads to proceed while new indexes are 
			being built.  There must be a copy of the table file called 'altname'
			in the database 'data' directory.  Generated indexfiles will be named
			using 'altname' as the tablename, and be placed in the database 'indexes' 
			directory.
			

   Result index files will be placed in the database 'indexes' subdirectory and be named
   tablename.fieldname.indexlevel

   Note: multiple index files are created by recursive invocation of this program,
   using the -a (actual) command line argument.  -a is only present in automatically
   invoked instances.
		eg. if user does buildix 2 .., first round does .1, 
                           then recursive invoke: buildix 2 .. -a 2

		     or if user does buildix 3 .., first round does .1,
			      then recursive invoke: buildix 3 .. -a 2
			      then recursive invoke: buildix 3 .. -a 3

 * Copyright 2002 Stephen C. Grubb  
 * This code is covered under the GNU General Public License (GPL);
 * see the file ./Copyright for details. 

*/

#include <stdio.h>
#include <ctype.h>
#include "tdhkit.h"
#include "shsql.h"

/* This program relies on the availability of unix 'sort' and 'ls' commands */

/* capacity limits for the word-index helper tables.. */
#define MAXCWP  20000	/* max size of common words pool (bytes, including terminators) */
#define MAXNCW  2000	/* max number of common words */
#define MAXDUPPOOL 3000 /* max size of duplicate words pool */
#define MAXDUPS 300 	/* max number of duplicate words */

#define MAXF 10		/* max number of fields involved in a combinedword index */

/* only options that a user may give by hand are listed; -a is reserved for recursive invocations */
#define USAGE_MSG 	"usage: buildix tablename fieldname level indextype [-t sortmethod] [-n altname]"

/* sampling intervals used when building a sampled index level from its input */
#define PERIOD_D 30	/* sample every 30 records when doing a direct */
#define PERIOD   100	/* sample every 100 records otherwise */

/* index structure types (ixtype) */
#define STANDARD 0
#define DIRECT 1	/* only on the level 2 of a direct index, upper levels are STANDARD */
#define WORD 2		/* combinedword will also use this */

extern int TDH_setfmdelim();

/* common word list: cwp holds the NUL-terminated words back to back, commonword[] points at each one */
char cwp[ MAXCWP ], *commonword[ MAXNCW ];
int ncw;	/* note: main() declares a local of the same name, which shadows this one */

/* duplicate word pool: words already written for the current row, used to suppress repeats */
int ndup, duploc;	/* number of entries in dups[], and next free byte in duppool[] */
char duppool[MAXDUPPOOL], *dups[MAXDUPS];


int
main( argc, argv )
int argc;
char **argv;
{
int i, j, k, ix;
FILE *ifp, *ofp;
char buf[ MAXRECORDLEN ];
char tok[ DATAMAXLEN+1 ];
char key[MAXF][ DATAMAXLEN+1 ];
char infile[200];
char outfile[200];
char tablename[MAXPATH];
char fakename[MAXPATH];
char fieldname[MAXF][80];
long ofs;
long ftell();
int col;
int fldpos[ MAXF ];
int indexlevel;
int nrec;
int isample;
int highestlevel;
int nout, numsort, stat;
int oflag;	/* 1 if tablename is a full pathname rather than a table name */
char topt[20];
char ixtypestr[255];
int ixtype;
char b64ofs[20];
int ncw;
int nflds;
long firstrecofs;
int buflen;
char altname[80];
int lasttime;
int addfake0; /* if 1, add fake entries to the .0 file for secondary fields (combined types only) */

TDH_errprog( "buildix" );

if( argc < 5 ) {
	fprintf( stderr, "%s\n", USAGE_MSG ); 
	exit(1);
	}


/* process config file.. */
stat = SHSQL_allconfig();
if( stat != 0 ) exit( 1 );


/* process command line args.. */
highestlevel = 0;
numsort = 0;
ncw = 0;
nflds = 1;
addfake0 = 0;
strcpy( altname, "" );

strcpy( tablename, argv[1] ); /* it may be possible to have separate recid and tablename */

strcpy( fieldname[0], argv[2] );

highestlevel = atoi( argv[3] );
if( highestlevel < 2 || highestlevel > 3 ) { fprintf( stderr, "Invalid level specified. %s\n", USAGE_MSG ); exit( 1 ); }

strcpy( ixtypestr, argv[4] ); /* for combinedword this will be of form:   combinedword,fieldname2,..,fieldnameN */
			      /* for combined this will be   combined,fieldname2,..,fieldnameN */

indexlevel = 0;
	
for( i = 5; i < argc; i++ ) {
	if( i < (argc-1) && strcmp( argv[i], "-a" )==0 ) { i++; indexlevel = atoi( argv[i] ); }
	else if( i < (argc-1) && strcmp( argv[i], "-t" )==0 ) { 
		i++; 
		if( argv[i][0] == 'n' ) numsort = 1;
		}
	else if( i < (argc-1) && strcmp( argv[i], "-n" )==0 ) { 
		i++; 
		strcpy( altname, argv[i] );
		}
	}

ixtype = STANDARD;


/* if index level not given via -a, set it here.. */
if( indexlevel == 0 ) {
	if( stricmp( ixtypestr, "direct" )==0 ) { indexlevel = 2; ixtype = DIRECT; }
	else if( stricmp( ixtypestr, "word" )==0 ) { indexlevel = 1; ixtype = WORD; }
	else if( strnicmp( ixtypestr, "combined", 8 )==0 ) { 
		indexlevel = 1; 
		if( strnicmp( ixtypestr, "combinedword", 12 )==0 ) { ixtype = WORD; ix = 13; }
		else { ixtype = STANDARD; ix = 9; }
		addfake0 = 1;
		for( nflds = 1; ; nflds++ ) {
			if( nflds >= MAXF ) { 
				fprintf( stderr, "buildix: max of %d fields reached.. additional fields not included\n", MAXF ); 
				break;
				}
			GL_getseg( fieldname[nflds], ixtypestr, &ix, "," );
			if( fieldname[nflds][0] == '\0' ) break;
			}
		}
	else indexlevel = 1; /* ixtype will be STANDARD */
	}
/* otherwise indexlevel was set via -a above, to do upper levels */




if( GL_member( tablename[0], "./" )) oflag = 1;
else oflag = 0;

if( ixtype == WORD && ( indexlevel == 1 || ixtype == DIRECT ) ) {
	FILE *cfp;
	int len, cwplen;
	/* for word index, get list of "very common words" that are not worth indexing.. */
	if( SHSQL_commonwordsfile[0] != '\0' ) {
		cfp = fopen( SHSQL_commonwordsfile, "r" );
		if( cfp == NULL ) fprintf( stderr, "buildix warning: no common words file found (%s)\n", SHSQL_commonwordsfile );
		else	{
			ncw = 0;
			cwplen = 0;
			commonword[ ncw ] = &cwp[ cwplen ];
			while( fgets( buf, 200, cfp ) != NULL ){
				if( buf[0] == '/' || isspace( (int) buf[0] )) continue;
				ncw++;
				if( ncw >= MAXNCW ) { 
					fprintf( stderr, "buildix warning: common words list is too large.. truncating\n" );
					break;
					}
				len = strlen( buf ) -1;
				if( cwplen+len >= MAXCWP ) { 
					fprintf( stderr, "buildix warning: common words list is too large.. truncating\n" );
					break;
					}
				strcpy( &cwp[ cwplen ], buf );
				cwplen += (len);
				cwp[cwplen++] = '\0';
				commonword[ ncw ] = &cwp[cwplen];
				}
			fclose( cfp );
			}
		}
	}
			

strcpy( fakename, tablename );  /* use fakename so we can handle hierarchical tables */
for( i = 0; fakename[i] != '\0'; i++ ) if( fakename[i] == '/' ) fakename[i] = '!';


/* open input file.. */
if( indexlevel == 1 || ixtype == DIRECT ) {

	/* open data file, get field name header, and get ready to read from it.. */
	if( oflag ) strcpy( infile, tablename );
	else if( altname[0] ) sprintf( infile, "%s/data/%s", SHSQL_projdir, altname );
	else sprintf( infile, "%s/data/%s", SHSQL_projdir, tablename );

	ifp = fopen( infile, "r" );		/* open the file */
	if( ifp == NULL ) { fprintf( stderr, "buildix error: cannot open file (%s) to read\n", infile ); exit( 1 ); }

	/* get field name header */				/* tagvalue */
	while( fgets( buf, MAXRECORDLEN-1, ifp ) != NULL ) { 
		if( strncmp( buf, "//", 2 )==0 || isdelim( buf[0] ) || buf[0] == '\n' ) continue;	/* datadelim */
		else break;
		}
	TDH_setfmdelim( SHSQL_delim );
	TDH_loadfieldmap( "shsql_header", buf ); 	/* load fieldmap */	/* datadelim */  /* tagvalue */
	TDH_setfmdelim( BLANK );
	for( i = 0; i < nflds; i++ ) {
		fldpos[i] = fieldmap( fieldname[i] );
		if( fldpos[i] < 0 ) { fprintf( stderr, "buildix error: unrecognized field name %s\n", fieldname[i] ); exit(1); }
		}
	firstrecofs = ftell( ifp );
	}


else	{
	
	/* open the index file we just built and get ready to read from it.. */
	if( oflag ) sprintf( infile, "%s.%s.%d", tablename, fieldname[0], indexlevel-1 );
	else if( altname[0] ) sprintf( infile, "%s/indexes/%s.%s.%d", SHSQL_projdir, altname, fieldname[0], indexlevel-1 );
	else sprintf( infile, "%s/indexes/%s.%s.%d", SHSQL_projdir, fakename, fieldname[0], indexlevel-1 );
	ifp = fopen( infile, "r" );
	if( ifp == NULL ) { fprintf( stderr, "buildix error: cannot open input %s\n", infile ); exit(1); }
	}



/* try opening outfile to be sure we can.. */
if( oflag ) sprintf( outfile, "%s.%s.%d", tablename, fieldname[0], indexlevel );
else if( altname[0] ) sprintf( outfile, "%s/indexes/%s.%s.%d", SHSQL_projdir, altname, fieldname[0], indexlevel );
else sprintf( outfile, "%s/indexes/%s.%s.%d", SHSQL_projdir, fakename, fieldname[0], indexlevel );

ofp = fopen( outfile, "w" );
if( ofp == NULL ) { fprintf( stderr, "buildix error: cannot open file (%s) to write\n", outfile ); exit( 1 ); }


/* show brief in-progress message.. */
fprintf( stderr, "%.12sindex%s:%d  ", ixtypestr, (numsort)?"(num)":"", indexlevel );


/* create header.. */
fprintf( ofp, "! index-type: %s sort-type: %s  top-index-is: %d ", ixtypestr, (numsort)?"num":"alpha", highestlevel );

if( indexlevel == 1 || ixtype == DIRECT ) {
	fseek( ifp, 0L, SEEK_END ); /* go to end of file */
	fprintf( ofp, "   datafile-new-area-begins-at: %ld", ftell( ifp ) );
	fseek( ifp, firstrecofs, SEEK_SET ); /* restore to first usable record in file */
	}
fprintf( ofp, "\n" );


/* if we need sorted output, open a sort pipe.. */
if( indexlevel == 1 || ixtype == DIRECT ) {
	fclose( ofp );

	/* output will be processed via 'sort'.. */
	if( !isspace( (int) SHSQL_delim ) ) sprintf( topt, "-t '%c'", SHSQL_delim );
	else strcpy( topt, "" );

	if( numsort ) sprintf( buf, "sort -n +0 -1 %s >> %s", topt, outfile );
	else sprintf( buf, "sort +0 -1 %s >> %s", topt, outfile ); 
	putenv( "LOCALE=C" );						/* locale-collate */
	ofp = popen( buf, "w" );
	if( ofp == NULL ) { fprintf( stderr, "buildix error: cannot open output sort pipe: %s\n", buf ); exit(1); }
	}



/* determine sampling interval.. */
if( ixtype == DIRECT ) isample = PERIOD_D;
else isample = PERIOD;

/* skip over header record in input file, if any.. */
if( indexlevel > 1 || (ixtype == DIRECT && indexlevel > 2 ) ) fgets( buf, MAXRECORDLEN-1, ifp );  /* changed 9/30/02 */


/* start reading.. */
ofs = ftell( ifp ); /* get disk offset for first record to be read.. */
nrec = 0;
nout = 0;
while( fgets( buf, MAXRECORDLEN-1, ifp ) != NULL ) {

	buflen = strlen( buf );
	if( buf[ buflen-1 ] == '\n' ) buf[ buflen-1 ] = '\0';

	/* parse out fields.. */
	if( indexlevel == 1 || ixtype == DIRECT ) {
        	i = 0; col = 0;
        	while( 1 ) {
			SHSQL_getfld( tok, buf, &col );			/* datadelim */		/* tagvalue */
                	if( tok[ 0 ] == '\0' ) break;
                	if( i == 0 && strncmp( tok, "//", 2 )==0 ) goto BOTTOM;
			for( j = 0; j < nflds; j++ ) { /* loop runs once except for combinedword */
				if( i == fldpos[j] ) strcpy( key[j], tok ); 
				}
                	i++;
                	}
        	if( i == 0 ) goto BOTTOM; /* skip blank lines */
		}
	else	{
		col = 0;
		SHSQL_getfld( key[0], buf, &col );
		if( key[0][0] == '\0' || strncmp( key[0], "//", 2 )==0 ) goto BOTTOM;
		SHSQL_getfld( tok, buf, &col );
		}


	/* output an index entry.. */

	/* convert ofs to base64 representation for compactness.. */
	SHSQL_l_to_b64( ofs, b64ofs );


	if( ixtype == WORD && indexlevel == 1 ) {  
	    /* break up key into words (delimited on any space or punct char) and output an index entry for each.. */
	    char *word;
	    int wassep, laststart, foundsomething, common, duplicate, wordlen;

	    ndup = 0;
	    duploc = 0;

	    for( k = 0; k < nflds; k++ ) {  /* loop runs once for word indexes, more than once for combinedword */
	
		if( !numsort ) for( i = 0; key[k][i] != '\0'; i++ ) key[k][i] = tolower( key[k][i] ); /* convert key to lower case */

		wassep = 1; foundsomething = 0; lasttime = 0;
		for( i = 0; ; i++ ) {
			if( key[k][i] == '\0' ) lasttime = 1;
        		if( ( isspace( (int) key[k][i] ) || ispunct( (int) key[k][i] ) ) || lasttime ) { 
                		if( ! wassep ) { /* end of a word */
					key[k][i] = '\0';			

					/* be sure word is not in the common words list.. */
					for( j = 0, common = 0; j < ncw; j++ ) if( stricmp( word, commonword[j] )==0 ) common = 1;

					/* be sure word is not a duplicate ie. has not already been written for this row.. */
					for( j = 0, duplicate = 0; j < ndup; j++ ) if( stricmp( word, dups[j] )==0 ) duplicate = 1;

					if( !common && !duplicate ) {   /* output an index entry.. */
						if( i - laststart > SHSQL_ixtrunclen ) key[k][ laststart + SHSQL_ixtrunclen ] = '\0';
						fprintf( ofp, "%s%c%s\n", word, SHSQL_delim, b64ofs );		/* datadelim */
						nout++;
						foundsomething = 1;

						/* add to duplicate control word list.. */
						wordlen = strlen( word );
						if( duploc + wordlen < (MAXDUPPOOL-5) && ndup < (MAXDUPS-3) ) { 
							dups[ndup++] = &duppool[duploc];
							strcpy( &duppool[duploc], word );
							duploc += strlen( word ) + 1;
							}
						}
					}
                        	wassep = 1;
				if( lasttime ) break;
                        	}
        		else    {
                		if( wassep ) {  /* begining of a word */
                        		word = &key[k][i];
                        		laststart = i;
					}
                		wassep = 0;
				}
                	}
		if( !foundsomething ) {	 /* put out an entry for null */
			fprintf( ofp, "%s%c%s\n", TDH_dbnull, SHSQL_delim, b64ofs );		/* datadelim */
			nout++;
			}
		}
	    }

	else if( indexlevel == 1 || (nrec % isample) == 0 ) {
		for( k = 0; k < nflds; k++ ) {  /* combined can use key[0]..[n]; all other types use only key[0]  - scg 12/6/04 */ 
			key[k][ SHSQL_ixtrunclen ] = '\0'; /* truncate */
			if( !numsort ) for( i = 0; key[k][i] != '\0'; i++ ) key[k][i] = tolower( key[k][i] ); /* convert key to lower case */
			fprintf( ofp, "%s%c%s\n", key[k], SHSQL_delim, b64ofs );		/* datadelim */
			nout++;
			}
		}
	nrec++;

	BOTTOM:

	ofs = ftell( ifp ); /* get disk offset for next record to be read.. */
	}

/* close files */
fclose( ifp );
if( indexlevel == 1 || ixtype == DIRECT ) pclose( ofp );
else fclose( ofp );



/* invoke next higher level, if necessary.. */
if( indexlevel < highestlevel ) {
	if( SHSQL_bin[0] != '\0' ) sprintf( tok, "%s/buildix", SHSQL_bin );
	else strcpy( tok, "buildix" ); /* path lookup */

	/* the index type is passed and used to label index headers, but all higher level indices always 
   	   have a 'standard' structure since they just point to another index */
	sprintf( buf, "%s %s %s %d %s -a %d -t %s %s %s", tok, tablename, fieldname[0], highestlevel, ixtypestr, indexlevel+1, 
		(numsort)?"num":"alpha", (altname[0])?"-n":"", altname );
	/* fprintf( stderr, "\nbuildix is invoking: %s\n", buf );  */
	system( buf );
	}

/* for combined index types, add 'fake' entries into the .0 file for secondary fields.. 
 * necessary so that if these fields are updated, the system will know to append new record at end.. */
if( addfake0 && indexlevel == 1 && !oflag ) { 
	FILE *ffp;
	sprintf( buf, "%s/indexes/%s.0", SHSQL_projdir, fakename );
	ffp = fopen( buf, "a" );
	if( ffp == NULL ) fprintf( stderr, "buildix error: cannot open %s to append fake entries\n", buf ); 
	else 	{
		for( i = 1; i < nflds; i++ ) fprintf( ffp, "%s.%s.2\n", fakename, fieldname[i] );
		fclose( ffp );
		}
	}

fprintf( stderr, "\n" );
exit( 0 );
}
