#!/bin/sh -

#--
# External parser for HTDIG that parses Word files so they can
# be indexed.
#--
# Written by Richard W.M. Jones <rjones@imcl.com>. Distributed freely
# under the terms of the GNU General Public License (GPL).
#--

#----------------------------------------------------------------------
# Configurable stuff here:

# The program that converts Word files into text. It is run with the
# input file as its argument and its standard output is what gets
# indexed. I use ``catdoc'' by Victor Wagner <vitus@agropc.msk.su>.
# You may wish to just use ``strings'' instead.
CATDOC=/usr/local/bin/catdoc
#CATDOC=strings

# End of configurable stuff.
#----------------------------------------------------------------------

# Arguments are:
#   $1 = input file
#   $2 = content type (ignored)
#   $3 = base URL
#   $4 = HTDIG config file (ignored)
# HTDIG expects us to print out:
#   w WORD LOCATION HEADING    Word at location 0-1000 under heading
#   u URL DESCRIPTION          URL with description
#   t TITLE                    Title of document
#   h HEAD                     Heading
#   a ANCHOR                   Anchor (i.e. like <a name="">)
#   i IMAGE_URL                Image pointer

#----------------------------------------------------------------------

# Reformat standard input so that every whitespace-separated word is
# written on a line of its own.  Blank lines produce no output.
#
# Defined with the POSIX `name ()' syntax rather than the `function'
# keyword, which is a bash/ksh extension that strict /bin/sh
# implementations reject.

wordPerLine () {
    awk '{ for (i = 1; i <= NF; ++i) print $i }'
}

# Replace every character of standard input that is not an ASCII letter,
# an ASCII digit or a newline with a single space.  Newlines are kept so
# that the line structure of the input survives.
#
# The set is written without surrounding brackets: in POSIX tr a
# bracketed set such as '[a-z]' contains the bracket characters
# themselves, which would let `[' and `]' through untouched.  The C
# locale is forced so that the ranges mean exactly a-z, A-Z and 0-9.

removeNonAlNum () {
    LC_ALL=C tr -c 'a-zA-Z0-9\n' ' '
}

#----------------------------------------------------------------------

# We can't find the title from the word document, so make one up.
# printf is used instead of echo because whether echo expands `\t'
# differs between implementations; %s also keeps any backslashes in the
# base URL ($3) from being interpreted.
printf 't\tWord document %s\n' "$3"

# Parse the input file ($1) to a linear list of words and pass them to
# htdig as `w' records.
#
# The location of a word is its position scaled to 0-1000, which needs
# the total number of words.  awk collects the words and emits them from
# its END block, where NR is that total, so no temporary file (and no
# predictable name in /tmp) is required.  An empty document yields no
# `w' records and never divides by zero because the loop does not run.
#
# $CATDOC is deliberately left unquoted so the configured value may
# carry options; the file name is quoted so that paths containing
# spaces or glob characters are passed through intact.
$CATDOC "$1" | removeNonAlNum | wordPerLine |
    awk '
        { word[NR] = $1 }
        END {
            for (i = 1; i <= NR; ++i)
                printf("w\t%s\t%d\t-\t\n", word[i], 1000 * i / NR)
        }'
