/*************************************************************************
 *
 *  OpenOffice.org - a multi-platform office productivity suite
 *
 *  $RCSfile: SimpleTokenizer.java,v $
 *
 *  $Revision: 1.2 $
 *
 *  last change: $Author: rt $ $Date: 2005/09/09 16:49:32 $
 *
 *  The Contents of this file are made available subject to
 *  the terms of GNU Lesser General Public License Version 2.1.
 *
 *
 *    GNU Lesser General Public License Version 2.1
 *    =============================================
 *    Copyright 2005 by Sun Microsystems, Inc.
 *    901 San Antonio Road, Palo Alto, CA 94303, USA
 *
 *    This library is free software; you can redistribute it and/or
 *    modify it under the terms of the GNU Lesser General Public
 *    License version 2.1, as published by the Free Software Foundation.
 *
 *    This library is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    Lesser General Public License for more details.
 *
 *    You should have received a copy of the GNU Lesser General Public
 *    License along with this library; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *    MA  02111-1307  USA
 *
 ************************************************************************/

package com.sun.xmlsearch.util;

import java.text.*;

/**
 * Tokenizer that cuts text at the boundaries reported by a
 * {@link BreakIterator} word instance and returns only the segments that
 * look like real words.
 *
 * <p>Two kinds of segment are skipped: those whose first character is
 * whitespace, and one-character segments whose character is neither a letter
 * nor a digit (typically punctuation).
 *
 * <p>NOTE(review): {@code _source} is not declared in this class; it is
 * presumably a field inherited from {@code Tokenizer} -- confirm.
 */
public final class SimpleTokenizer extends Tokenizer {
    /** Word-boundary scanner over the current text. */
    private BreakIterator _boundary = BreakIterator.getWordInstance();

    /** Offset at which the next candidate segment begins. */
    private int _start;

    /** Boundary most recently returned by the scanner. */
    private int _end;

    /**
     * Installs {@code text} as the string to tokenize and rewinds scanning to
     * its first boundary.
     *
     * @param text the string to split into tokens
     */
    public void setText(String text) {
        _source = text;
        _boundary.setText(text);
        _start = _boundary.first();
    }

    /**
     * Decides whether a segment should be dropped rather than reported.
     *
     * @param lead   first character of the segment
     * @param length number of characters in the segment
     * @return {@code true} for a lone non-alphanumeric character or for a
     *         segment that begins with whitespace
     */
    private static boolean isNoise(char lead, int length) {
        if (length == 1 && !Character.isLetterOrDigit(lead)) {
            return true;
        }
        return Character.isWhitespace(lead);
    }

    /**
     * Advances to the next word-like segment.
     *
     * @return a {@code Token} built from the segment's text, its start offset
     *         and its end offset, or {@code null} once the scanner reports
     *         {@link BreakIterator#DONE}
     */
    public Token nextToken() {
        for (_end = _boundary.next();
                _end != BreakIterator.DONE;
                _end = _boundary.next()) {
            final int from = _start;
            final int to = _end;
            final char lead = _source.charAt(from);

            // Whatever happens to this segment, the next one starts where it ends.
            _start = to;

            if (!isNoise(lead, to - from)) {
                return new Token(_source.substring(from, to), from, to);
            }
        }
        return null;
    }
}
