/* ,file-id archive://[lord]/429/rx/lexer.c/1998-05-18
 */
/*	Copyright (C) 1997 Tom Lord
 * 
 * This program is provided to you under the terms of the Liberty Software
 * License.  You are NOT permitted to redistribute, modify, or use it
 * except in very specific ways described by that license.
 *
 * This software comes with NO WARRANTY.
 * 
 * You should have received a copy of the Liberty Software License
 * along with this software; see the file =LICENSE.  If not, write to
 * the Tom Lord, 1810 Francisco St. #2, Berkeley CA, 94703, USA.  
 */



#include <string.h>

#include "rx/regex.h"
#include "lexer.h"



/* scan_lexeme
 *
 * This function encapsulates a common way to use regular expressions:
 * for lexical analysis.
 *
 * Parameters:
 *
 * llen : (return parameter) length of the match.  Written only when
 *        a lexeme is matched (positive return value); left untouched
 *        otherwise.
 * unfa : (input/output parameter) an NFA for this language.
 * regexp : a 0-terminated regexp describing the lexical language to
 *          match with.
 * buf : text to be searched for a lexeme.
 * len : length of the text to be searched.
 *
 * The string passed for "regexp" defines the lexical language.
 * It should be of a form similar to:
 *
 * 	"if[[:cut 1:]]\|then[[:cut 2:]]..."
 *
 * State labels (the arguments to the "cut" operator) determine the 
 * value returned by "scan_lexeme".  In cases of ambiguity, the longest
 * matching lexeme is returned.  Where two equally long lexemes match,
 * the more positive state label is returned.
 *
 * The value of "*unfa" should be initialized to 0 and the value
 * passed for "unfa" should be the same whenever the value passed
 * for "regexp" is the same.  The result is effective use of 
 * the superstate cache.
 *
 * Upon return, "*unfa" points to an rx_unfa which can be freed
 * by calling "rx_free_unfa" declared in "unfa.h", but which should
 * normally be saved for later calls to scan_lexeme.
 *
 * If the regexp can not be compiled, or no NFA can be built for it,
 * "panic" is called.
 *
 * Return value:
 * 
 * 0 if the end of string is reached before a lexeme is found
 *   (this includes "len" <= 0).
 * -1 if the string is found to not begin with a valid lexeme.
 * <n>, the state label associated with a matched lexeme.
 *
 */

int
scan_lexeme (int * llen,
             struct rx_unfa ** unfa,
             unsigned char * regexp,
             char * buf, int len)
{
  struct rx_dfa ldfa;
  int ate;
  int final_tag;
  int hit_end;

  /* Nothing to scan.  A negative length is treated like an empty
   * buffer rather than being handed to the DFA.
   */
  if (len <= 0)
    return 0;

  /* First call for this language: compile the regexp and cache the
   * resulting NFA in "*unfa" for subsequent calls.
   */
  if (!*unfa)
    {
      int ret;
      struct rx_exp_node * parsed;
      int ign;

      ign = 0;
      ret = rx_parse (&parsed, &ign, regexp,
                      strlen ((const char *)regexp), 1, 1, 1, 256, 0);
      if (ret)
        panic ("unable to compile regexp in scan_lexeme");
      *unfa = rx_unfa (parsed, 256);
      rx_free_rexp (parsed);
      /* "*unfa" is dereferenced below, so refuse to continue
       * without it.
       */
      if (!*unfa)
        panic ("unable to build NFA in scan_lexeme");
    }

  rx_init_dfa_from_rx (&ldfa, (*unfa)->nfa);
  rx_dfa_goto_start_superstate (&ldfa);

  ate = 0;        /* length of the longest lexeme found so far */
  final_tag = 0;  /* state label of that lexeme, 0 if none yet */
  hit_end = 0;    /* set if the input ran out in a non-final state */

  /* Hop from final state to final state, remembering the last one
   * reached, so that the longest matching lexeme wins.
   */
  while (1)
    {
      int remaining;
      int added;

      remaining = len - ate;
      added = rx_dfa_advance_to_final (&ldfa, buf + ate, remaining);

      /* No progress (or an error indication): stop scanning. */
      if (added <= 0)
        break;

      if (!ldfa.final_tag)
        {
          /* The scanner stopped in a non-final state.  If it used up
           * everything that was offered, the input simply ran out;
           * otherwise the text can not be extended to a lexeme.
           *
           * NOTE(review): this relies on rx_dfa_advance_to_final
           * consuming fewer characters than offered when it stops
           * on a dead state -- confirm against its definition.
           */
          hit_end = (added == remaining);
          break;
        }

      final_tag = ldfa.final_tag;
      ate += added;
    }

  rx_free_dfa_storage (&ldfa);

  if (final_tag)
    {
      *llen = ate;
      return final_tag;
    }

  /* No lexeme was found: distinguish "need more input" from
   * "not a lexeme".
   */
  return hit_end ? 0 : -1;
}
