/*
 * Insert an index between "<!--begin-index-->" and "<!--end-index-->",
 * or replacing the comment "<!--index-->"
 *
 * The index links to elements with ID attributes as well as with
 * empty <A NAME> elements.
 *
 * Any <A> tags with a class of "bctarget" are not copied, but
 * regenerated. They are assumed to be backwards-compatible versions
 * of ID attributes on their parent elements. But if the option -t or
 * -x is given, those <A> elements are removed.
 *
 * There's a limit of 100000 index terms (10^(MAXIDLEN-1)).
 *
 * Index terms are elements with a class of "index", "index-inst" or
 * "index-def", as well as all <dfn> elements. The contents of the
 * element is the index term, unless the element has a title
 * attribute. The title attribute can contain "|" and "!!":
 *
 * "term"
 * "term1|term2|term3|..."
 * "term!!subterm!!subsubterm!!..."
 * "term1!!subterm1|term2!!subterm2|..."
 * etc.
 *
 * For backward compatibility with an earlier Perl program, "::" is
 * accepted as an alternative for "!!", but it is better not to use
 * both separators in the same project, since the sorting may be
 * adversely affected.
 *
 * Class "index-def" results in a bold entry in the index, "index" in
 * a normal one. "index-inst" is an alias for "index", provided for
 * backward compatibility.
 *
 * Copyright © 1994-2005 World Wide Web Consortium
 * See http://www.w3.org/Consortium/Legal/copyright-software
 *
 * Author: Bert Bos <bert@w3.org>
 * Created: 11 Apr 2000
 * Version: $Id: hxindex.c,v 1.6 2009/03/25 18:46:19 bbos Exp $
 *
 **/
#include <config.h>
#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#if STDC_HEADERS
# include <string.h>
#else
# ifndef HAVE_STRCHR
#  define strchr index
#  define strrchr rindex
# endif
# ifndef HAVE_STRSTR
#  include "strstr.e"
# endif
#endif
#ifdef HAVE_ERRNO_H
#  include <errno.h>
#else
extern int errno;
char *strerror(int errnum);
int strerror_r(int errnum, char *buf, size_t n);
#endif

#ifdef HAVE_ERRNO_H
#  include <errno.h>
#endif

#ifdef HAVE_SEARCH_H
#  include <search.h>
#else
#  include "search-freebsd.h"
#endif
#include "export.h"
#include "types.e"
#include "heap.e"
#include "tree.e"
#include "html.e"
#include "scan.e"
#include "dict.e"
#include "openurl.e"
#include "genid.e"
#include "errexit.e"
#include "class.e"

#undef USE_DATA_ATTRIBUTE	/* Data attributes are a proposal in HTML5 */

#define BEGIN_INDEX "begin-index" /* <!--begin-index--> */
#define END_INDEX "end-index"	/* <!--end-index--> */
#define INDEX "index"		/* <!--index--> */
#define INDEX_INST "index-inst"	/* class="index-inst" */
#define INDEX_DEF "index-def"	/* class="index-def" */
#define TARGET "bctarget"	/* CLASS="...bctarget..." */

#define MAXSTR 2048		/* Max. length of URL + term */
#define MAXSUBS 20		/* Max. depth of subterms */
#define SECNO "secno"		/* Class of elements that define section # */
#define NO_NUM "no-num"		/* Class of elements without a section # */

/* One entry of the index: a term plus the place it links to */
typedef struct _indexterm {
  string term;			/* The term; subterm levels separated by "\1\1" */
  string url;			/* Link target: base + "#" + ID */
  int importance;		/* 1 (low) or 2 (high) */
  string secno;			/* Section number, used as link text with -n */
} *Indexterm;

/* The document tree, built by the parser callbacks */
static Tree tree;

/* Settings, changed by command line options */
static Boolean xml = False;		/* -x: use <empty /> convention */
static string base = NULL;		/* -b: (rel.) URL of output file */
static string indexdb = NULL;		/* -i: persistent store of terms */
static string *userclassnames = NULL;	/* -c: extra classes that mark a term */
static Boolean bctarget = True;		/* Add <A name=> after IDs (off with -t) */
static Boolean use_secno = False;	/* -n: link text is section #, not "#" */
static Boolean final = False;		/* -f: remove used attributes from doc */

/* State for the twalk callbacks, which cannot receive extra arguments */
static FILE *globalfile;		/* File that save_a_term writes to */
static string globalprevious;		/* Term of the previously written item */
static string globalurlprevious;	/* URL of the previously written item */


/* handle_error -- parser callback: print "<lineno>: <message>" on stderr */
static void handle_error(void *clientdata, const string s, int lineno)
{
  (void) clientdata;		/* Not used */
  (void) fprintf(stderr, "%d: %s\n", lineno, s);
}

/* start -- parser callback, invoked before the first event is reported */
static void* start(void)
{
  tree = create();		/* Begin with a new document tree */
  return NULL;			/* The handlers use no client data */
}
  
/* end -- parser callback, invoked after the last event; nothing to do */
static void end(void *clientdata)
{
  (void) clientdata;		/* Not used */
}

/* handle_comment -- parser callback: add a parsed comment to the tree */
static void handle_comment(void *clientdata, string commenttext)
{
  (void) clientdata;		/* Not used */
  tree = append_comment(tree, commenttext);
}

/* handle_text -- parser callback: add a parsed chunk of text to the tree */
static void handle_text(void *clientdata, string text)
{
  (void) clientdata;		/* Not used */
  tree = append_text(tree, text);
}

/* handle_decl -- parser callback: add a parsed declaration to the tree */
static void handle_decl(void *clientdata, string gi,
                        string fpi, string url)
{
  (void) clientdata;		/* Not used */
  tree = append_declaration(tree, gi, fpi, url);
}

/* handle_pi -- parser callback: add a parsed processing instr. to the tree */
static void handle_pi(void *clientdata, string pi_text)
{
  (void) clientdata;		/* Not used */
  tree = append_procins(tree, pi_text);
}

/* handle_starttag -- parser callback: open a new element in the tree */
static void handle_starttag(void *clientdata, string name, pairlist attribs)
{
  conststring idvalue;

  (void) clientdata;		/* Not used */
  tree = html_push(tree, name, attribs);

  /* Remember an existing ID, so that the same ID is not generated later */
  idvalue = pairlist_get(attribs, "id");
  if (idvalue != NULL) storeID(idvalue);
}

/* handle_emptytag -- parser callback: add an empty element to the tree */
static void handle_emptytag(void *clientdata, string name, pairlist attribs)
{
  conststring idvalue;

  (void) clientdata;		/* Not used */
  tree = html_push(tree, name, attribs);

  /* Remember an existing ID, so that the same ID is not generated later */
  idvalue = pairlist_get(attribs, "id");
  if (idvalue != NULL) storeID(idvalue);
}

/* handle_endtag -- parser callback: close an element (name may be "") */
static void handle_endtag(void *clientdata, string name)
{
  (void) clientdata;		/* Not used */
  tree = html_pop(tree, name);
}

/*
 * next -- return pointer to next non-punctuation, non-markup char, or NULL
 *
 * Scans s from its first character and returns a pointer to the first
 * alphanumeric character that is not part of an entity ("&...;") or
 * of a tag ("<...>", where quoted attribute values may contain ">").
 * Returns NULL if the end of the string or stopchar is reached first.
 *
 * The scan always starts in the "plain content" state, so a caller
 * that continues after a returned character must do so outside
 * markup, which is the case for every pointer this function returns.
 */
static conststring next(conststring s, const char stopchar)
{
  Boolean in_ent = False, in_dquote = False, in_squote = False, in_tag = False;

  for (;; s++) {
    if (*s == '\0' || *s == stopchar) return NULL;
    else if (in_ent) in_ent = *s != ';';
    else if (in_dquote) in_dquote = *s != '"';
    else if (in_squote) in_squote = *s != '\'';
    else if (in_tag) {
      if (*s == '"') in_dquote = True;
      else if (*s == '\'') in_squote = True;
      else in_tag = *s != '>';
    } else {
      if (*s == '&') in_ent = True;
      else if (*s == '<') in_tag = True;
      /* The cast avoids undefined behavior for bytes >= 0x80 (UTF-8) */
      else if (isalnum((unsigned char)*s)) return s;
    }
  }
}

/*
 * folding_cmp -- compare two strings, ignoring case and punctuation
 *
 * Only the characters found by next() take part in the comparison:
 * alphanumeric characters outside tags and entities. If stopchar is
 * '\1', the comparison ends at the first '\1' in either string, i.e.,
 * it compares one subterm; with '\0' the full strings are compared.
 *
 * Returns -1, 0 or 1 if a sorts before, equal to or after b.
 */
static int folding_cmp(conststring a, conststring b, const char stopchar)
{
  int ca, cb;

  /* Stopchar determines if we compare full strings or subterms up to '\1' */
  assert(stopchar == '\0' || stopchar == '\1');
  assert(a);
  assert(b);

  for (;;) {
    a = next(a, stopchar);
    b = next(b, stopchar);
    if (!a) return b ? -1 : 0;	/* a is exhausted: a prefix sorts first */
    if (!b) return 1;
    /* <ctype.h> functions need a value in the range of unsigned char */
    ca = tolower((unsigned char)*a);
    cb = tolower((unsigned char)*b);
    if (ca != cb) return ca < cb ? -1 : 1;
    a++, b++;
  }
}

/*
 * trim -- remove leading and trailing white space
 *
 * Modifies s in place. s must be a non-NULL, NUL-terminated string.
 */
static void trim(string s)
{
  size_t first, last;

  /* The casts keep bytes >= 0x80 in the range <ctype.h> accepts */
  for (first = 0; isspace((unsigned char)s[first]); first++) ;
  for (last = strlen(s);
       last > first && isspace((unsigned char)s[last-1]); last--) ;

  /* Regions overlap, so memmove, not memcpy */
  if (first != 0) memmove(s, s + first, last - first);
  s[last - first] = '\0';
}

/* indent -- start a new line on stdout and indent it by n times 2 spaces */
static void indent(int n)
{
  int level;

  putchar('\n');
  for (level = 0; level < n; level++) fputs("  ", stdout);
}

/*
 * print_full_term -- print the full term for use in a TITLE attribute
 *
 * Writes the term to stdout without its tags: everything from "<" to
 * the matching ">" is dropped (a ">" inside a quoted attribute value
 * does not end the tag). Outside tags, each '\1' becomes a space and
 * each '"' becomes "&quot;".
 */
static void print_full_term(const string term)
{
  typedef enum {IN_CONTENT, IN_TAG, IN_DSTRING, IN_SSTRING} Tag_state;
  Tag_state state = IN_CONTENT;
  const char *p;

  for (p = term; *p != '\0'; p++) {
    const char c = *p;

    if (state == IN_CONTENT) {
      if (c == '\1') putchar(' ');
      else if (c == '"') fputs("&quot;", stdout);
      else if (c == '<') state = IN_TAG;
      else putchar(c);
    } else if (state == IN_TAG) {
      if (c == '>') state = IN_CONTENT;
      else if (c == '"') state = IN_DSTRING;
      else if (c == '\'') state = IN_SSTRING;
    } else if (state == IN_DSTRING) {
      if (c == '"') state = IN_TAG;	/* End of "..." attribute value */
    } else {
      assert(state == IN_SSTRING);
      if (c == '\'') state = IN_TAG;	/* End of '...' attribute value */
    }
  }
}

/*
 * split_subterms -- find the start of each subterm of term
 *
 * Subterms are separated by "\1\1". Stores a pointer to the start of
 * each one in sub[] (which has room for MAXSUBS entries) and returns
 * how many there are (at least 1). Exits with an error message if
 * there are more than MAXSUBS levels, instead of writing past sub[].
 */
static int split_subterms(string term, string sub[MAXSUBS])
{
  string p;
  int n = 0;

  sub[n++] = term;
  for (p = strstr(term, "\1\1"); p; p = strstr(p + 2, "\1\1")) {
    if (n == MAXSUBS)
      errexit("Too many levels of subterms in an index term (max. %d)\n",
              MAXSUBS);
    sub[n++] = p + 2;
  }
  return n;
}

/*
 * write_index_item -- write one item in the list of index terms
 *
 * Callback for twalk(). term1 points to an Indexterm. The item is
 * written on the in-order visit (postorder or leaf), so that items
 * appear in sorted order; other visits are ignored. depth is unused.
 *
 * The term is compared with the previously written one
 * (globalprevious): subterm levels that they have in common are not
 * repeated, lists that are no longer needed are closed and new
 * (sub)lists are opened. Then a link to the term's URL is added,
 * preceded by ";" if it repeats the previous term but points into a
 * different document, and by "," otherwise.
 */
static void write_index_item(const void *term1, const VISIT which,
                             const int depth)
{
  string sub[MAXSUBS], oldsub[MAXSUBS];
  string p, e;
  Indexterm term = *(Indexterm*)term1;
  int i, j, n, oldn;
  int listmode = 0;		/* Non-zero: link goes to another document */
  size_t len;

  (void) depth;			/* Not used */
  if (which != postorder && which != leaf) return;

  assert(globalprevious);
  n = split_subterms(term->term, sub);
  oldn = split_subterms(globalprevious, oldsub);

  /* Count how many subterms are equal to the previous entry */
  for (i = 0; i < n && i < oldn
         && folding_cmp(sub[i], oldsub[i], '\1') == 0; i++);

  /* Close sublists of the previous entry that this entry is not part of */
  for (j = oldn - 1; j > i; j--) {indent(j); printf("</ul>");}
  if (n > oldn && oldn == i) {indent(i); printf("<ul>");}

  /* Print new (sub)terms */
  for (j = i; j < n; j++) {
    indent(j); printf("<li>");
    /* A subterm ends at the next "\1\1" or at the end of the string */
    e = j + 1 < n ? sub[j+1] - 2 : sub[j] + strlen(sub[j]);
    for (p = sub[j]; p != e; p++) putchar(*p);
    if (j != n - 1) {indent(j+1); printf("<ul>");}
  }

  /* Same term as before? Then check if the URL is in the same document,
   * i.e., if the parts before the "#" are the same, not just a prefix */
  if (globalurlprevious
      && folding_cmp(globalprevious, term->term, '\0') == 0) {
    len = strcspn(globalurlprevious, "#");
    listmode = strcspn(term->url, "#") != len
      || strncmp(globalurlprevious, term->url, len) != 0;
  }

  /* Print a link */
  switch (term->importance) {
    case 1:
      printf("%s <a href=\"%s\" title=\"", listmode ? ";" : ",", term->url);
      print_full_term(term->term);
      printf("\">%s</a>", use_secno ? term->secno : "#");
      break;
    case 2:
      printf("%s <a href=\"%s\" title=\"", listmode ? ";" : ",", term->url);
      print_full_term(term->term);
      printf("\"><strong>%s</strong></a>", use_secno ? term->secno : "#");
      break;
    default:
      assert(! "Cannot happen\n");
  }

  /* Remember this term */
  globalprevious = term->term;
  globalurlprevious = term->url;
}

/*
 * mkindex -- write out an index
 *
 * Writes all terms as nested <ul> lists on stdout, one item per call
 * of write_index_item(), and then closes the lists that are still
 * open after the last item.
 */
static void mkindex(Indexterm terms)
{
  string level;

  printf("<ul class=\"indexlist\">");
  globalprevious = "zzzzzzzzzzzzzzzzzz"; /* An unlikely alphanumeric string */
  twalk(terms, write_index_item);

  /* One </ul> for each subterm level of the last item that was written */
  level = globalprevious;
  do {
    printf("\n</ul>");
    level = strstr(level, "\1\1");
    if (level) level += 2;
  } while (level);
}

/*
 * expand -- write the tree, add <A NAME> if needed and replace <!--index-->
 *
 * Writes the children of t to stdout, recursively. *write tells if
 * output is currently wanted: it is set to False at <!--begin-index-->
 * and back to True at <!--end-index-->, so that an old index is
 * skipped. The new index is written at <!--index--> or
 * <!--begin-index-->.
 *
 * <a> elements with class "bctarget" are dropped (their content is
 * kept) and, unless -t or -x was given, new ones are generated after
 * the start tag of each mixed-content element that has an ID.
 */
static void expand(Tree t, Boolean *write, Indexterm terms)
{
  conststring val;
  Tree h;
  pairlist a;
  string s;
  Boolean do_tag;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
      case Text:
        if (*write) printf("%s", h->text);
        break;
      case Comment:
        s = newstring(h->text);
        trim(s);
        if (eq(s, INDEX) || eq(s, BEGIN_INDEX)) {
          if (!final) printf("<!--%s-->\n", BEGIN_INDEX);
          mkindex(terms);
          if (!final) printf("<!--%s-->", END_INDEX);
          if (eq(s, BEGIN_INDEX)) *write = False;	/* Skip old index */
        } else if (eq(s, END_INDEX)) {
          *write = True;
        } else if (*write) {
          /* Like text and PIs, comments inside an old index are dropped */
          printf("<!--%s-->", h->text);
        }
        dispose(s);
        break;
      case Declaration:
        printf("<!DOCTYPE %s", h->name);
        if (h->text) printf(" PUBLIC \"%s\"", h->text);
        /* No keyword at all if there is no identifier: <!DOCTYPE html> */
        else if (h->url) printf(" SYSTEM");
        if (h->url) printf(" \"%s\"", h->url);
        printf(">");
        break;
      case Procins:
        if (*write) printf("<?%s>", h->text);
        break;
      case Element:
        if (*write) {
          /* If an <a> was inserted by index itself, remove it */
          do_tag = !eq(h->name, "a") || !has_class(h->attribs, TARGET);
          if (do_tag) {
            printf("<%s", h->name);
            for (a = h->attribs; a != NULL; a = a->next) {
              printf(" %s", a->name);
              if (a->value != NULL) printf("=\"%s\"", a->value);
            }
            assert(! is_empty(h->name) || h->children == NULL);
            fputs(xml && is_empty(h->name) ? " />" : ">", stdout);
            /* Insert an <A NAME> if element has an ID and is not <A> */
            if (bctarget && is_mixed(h->name) && (val = get_attrib(h, "id"))
                && !eq(h->name, "a") && ! xml)
              printf("<a class=\"%s\" name=\"%s\"></a>", TARGET, val);
          }
          expand(h, write, terms);
          /* Decided at the start tag, even if *write changed inside */
          if (do_tag && ! is_empty(h->name)) printf("</%s>", h->name);
        }
        break;
      case Root:
        assert(! "Cannot happen");
        break;
      default:
        assert(! "Cannot happen");
    }
  }
}

/* termcmp -- comparison routine for Indexterms */
static int termcmp(const void *a1, const void *b1)
{
  Indexterm a = (Indexterm)a1, b = (Indexterm)b1;

  assert(a);
  assert(b);
  assert(a->term);
  assert(b->term);

  switch (folding_cmp(a->term, b->term, '\0')) {
  case -1: return -1;
  case 0: return strcmp(a->url, b->url); /* Terms are equal, use URL instead */
  case 1: return 1;
  default: assert(! "Cannot happen!");
  }
}

/*
 * copy_contents -- recursively expand contents of element t into a string
 *
 * Appends the content of t to *s, which must be NULL or a string on
 * the heap; it is reallocated as needed. White space characters in
 * text are replaced by spaces. Comments, declarations and processing
 * instructions are skipped. Of the elements, only a few inline ones
 * are copied with their tags and attributes; of all others only the
 * content is copied. If there is nothing to copy, *s is not changed.
 */
static void copy_contents(Tree t, string *s)
{
  Tree h;
  int i;
  pairlist a;
  string p;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
      case Text:
        i = *s ? strlen(*s) : 0;
        renewarray(*s, i + strlen(h->text) + 1);
        /* Copy, but transform all whitespace to spaces. The cast keeps
         * bytes >= 0x80 in the range that isspace() accepts */
        for (p = h->text; *p; p++, i++)
          (*s)[i] = isspace((unsigned char)*p) ? ' ' : *p;
        (*s)[i] = '\0';
        break;
      case Comment: break;
      case Declaration: break;
      case Procins: break;
      case Element:
        /* Only certain tags are retained */
        if (eq(h->name, "span") || eq(h->name, "code") || eq(h->name, "tt")
            || eq(h->name, "acronym") || eq(h->name, "abbr")
            || eq(h->name, "bdo") || eq(h->name, "kbd") || eq(h->name, "samp")
            || eq(h->name, "sub") || eq(h->name, "sup")
            || eq(h->name, "var")) {
          strapp(s, "<", h->name, NULL);
          for (a = h->attribs; a != NULL; a = a->next) {
            if (! a->value) strapp(s, " ", a->name, NULL);
            else strapp(s, " ", a->name, "=\"", a->value, "\"", NULL);
          }
          assert(! is_empty(h->name) || h->children == NULL);
          if (is_empty(h->name)) {
            strapp(s, xml ? " />" : ">", NULL);
          } else {
            strapp(s, ">", NULL);
            copy_contents(h, s);
            strapp(s, "</", h->name, ">", NULL);
          }
        } else {				/* Ignore tag, copy contents */
          copy_contents(h, s);
        }
        break;
      case Root: assert(! "Cannot happen"); break;
      default: assert(! "Cannot happen");
    }
  }
}

/* copy_to_index -- copy the contents of element h to the index db */
static void copy_to_index(Tree t, Indexterm *terms, int importance,
			  conststring secno)
{
  conststring id, title0;
  string title, h;
  Indexterm term;
  int i, n;

  id = get_attrib(t, "id");
#ifdef USE_DATA_ATTRIBUTE
  if (! (title0 = get_attrib(t, "data-index")))
#endif
    title0 = get_attrib(t, "title");

  /* Get term either from title attribute or contents */
  if (title0) {
    /* Hack: replace !! and :: by \1\1, so that the string sorts better */
    title = newstring(title0);
    for (h = strstr(title, "!!"); h; h = strstr(h + 2, "!!")) h[0] = h[1] = '\1';
    for (h = strstr(title, "::"); h; h = strstr(h + 2, "::")) h[0] = h[1] = '\1';

    i = 0;
    while (title[i]) {
      n = strcspn(title + i, "|");		/* Find | or \0 */
      new(term);
      term->importance = importance;
      term->secno = newstring(secno);
      term->url = NULL;
      strapp(&term->url, base, "#", id, NULL);
      term->term = newnstring(title + i, n);
      if (! tsearch(term, (void**)terms, termcmp))
	errexit("Out of memory while parsing term %s\n", term->term);
      i += n;
      if (title[i]) i++;			/* Skip '|' */
    }
    if (final)					/* Remove used attribute */
#ifdef USE_DATA_ATTRIBUTE
      if (!delete_attrib(t, "data-index"))
#endif
	delete_attrib(t, "title");

  } else {					/* Recursively copy contents */

    new(term);
    term->importance = importance;
    term->secno = newstring(secno);
    term->url = term->term = NULL;
    strapp(&term->url, base, "#", id, NULL);
    copy_contents(t, &term->term);
    if (term->term)				/* Non-empty contents */
      if (! tsearch(term, (void**)terms, termcmp))
	errexit("Out of memory while parsing term %s\n", term->term);

  }
}

/*
 * collect -- collect index terms, add IDs where needed
 *
 * Walks the children of t in document order. *secno holds the
 * current section number: it is replaced by the content of each
 * element with class "secno" and by "#" at each element with class
 * "no-num". An element is an index term if it is a <dfn> or has
 * class "index", "index-inst", "index-def" or one of the classes
 * given with -c. Such an element gets an ID, if it has none, and is
 * added to the terms. Index terms inside index terms are not looked
 * for.
 */
static void collect(Tree t, Indexterm *terms, string *secno)
{
  int importance;
  Tree h;

  for (h = t->children; h != NULL; h = h->sister) {
    switch (h->tp) {
      case Text: case Comment: case Declaration: case Procins: break;
      case Element:
        if (has_class(h->attribs, SECNO)) {
          dispose(*secno);
          copy_contents(h, secno);
          /* An element without content leaves *secno NULL */
          if (*secno) trim(*secno);
          else *secno = newstring("");
        } else if (has_class(h->attribs, NO_NUM)) {
          dispose(*secno);
          *secno = newstring("#");
        }
        if (eq(h->name, "dfn")) importance = 2;
        else if (has_class(h->attribs,INDEX)||has_class(h->attribs,INDEX_INST))
          importance = 1;
        else if (userclassnames && has_class_in_list(h->attribs, userclassnames))
          importance = 1;
        else if (has_class(h->attribs, INDEX_DEF)) importance = 2;
        else importance = 0;
        if (importance != 0) {
          /* Give it an ID, if it doesn't have one */
          if (! get_attrib(h, "id")) set_attrib(h, "id", gen_id(h));
          copy_to_index(h, terms, importance, *secno);
        } else {
          collect(h, terms, secno);
        }
        break;
      case Root: assert(! "Cannot happen"); break;
      default: assert(! "Cannot happen");
    }
  }
}

/*
 * load_index -- read persistent term db from file
 *
 * Each line of the file has four fields, separated by tabs: the
 * term, the importance ('1' or '2'), the URL and the section number.
 * Every line is added to the terms. A file that cannot be opened is
 * silently treated as an empty one; a line with the wrong syntax is
 * a fatal error.
 */
static void load_index(const string indexdb, Indexterm *terms)
{
  char line[MAXSTR];
  char *level, *url;
  size_t termlen, urllen;
  Indexterm entry;
  FILE *db;

  db = fopen(indexdb, "r");
  if (db == NULL) return;			/* Assume file not found... */

  while (fgets(line, sizeof(line), db) != NULL) {
    chomp(line);

    /* Field 1: the term */
    termlen = strcspn(line, "\t");
    if (line[termlen] != '\t') errexit("Illegal syntax in %s\n", indexdb);
    new(entry);
    entry->term = newnstring(line, termlen);

    /* Field 2: the importance, a single digit */
    level = line + termlen + 1;
    if (*level == '1') entry->importance = 1;
    else if (*level == '2') entry->importance = 2;
    else errexit("Error in %s (column 2 must be '1' or '2')\n", indexdb);
    if (level[1] != '\t') errexit("Illegal syntax in %s\n", indexdb);

    /* Field 3: the URL */
    url = level + 2;
    urllen = strcspn(url, "\t");
    if (url[urllen] != '\t') errexit("Illegal syntax in %s\n", indexdb);
    entry->url = newnstring(url, urllen);

    /* Field 4: the section number, the rest of the line */
    entry->secno = newstring(url + urllen + 1);

    if (! tsearch(entry, (void**)terms, termcmp))
      errexit("Out of memory while loading %s\n", indexdb);
  }

  fclose(db);
}

/*
 * save_a_term -- write one term to globalfile
 *
 * Callback for twalk(). Writes the term as a line of four fields
 * separated by tabs, on the last visit to each node (endorder or
 * leaf), so that every term is written exactly once. dp is unused.
 */
static void save_a_term(const void *term1, const VISIT which, const int dp)
{
  Indexterm entry;

  if (which != endorder && which != leaf) return;

  entry = *(Indexterm*)term1;
  fprintf(globalfile, "%s\t%d\t%s\t%s\n",
          entry->term, entry->importance, entry->url, entry->secno);
}

/*
 * save_index -- write terms to file
 *
 * Replaces the contents of file indexdb by the terms, one per line.
 * Exits with an error message if the file cannot be created or not
 * all terms could be written, because an incomplete file would
 * silently lose terms on the next run.
 */
static void save_index(const string indexdb, Indexterm terms)
{
  if (! (globalfile = fopen(indexdb, "w")))
    errexit("%s: %s\n", indexdb, strerror(errno));
  twalk(terms, save_a_term);
  if (ferror(globalfile))
    errexit("%s: write error\n", indexdb);
  /* Buffered data is only written at fclose, so that may fail, too */
  if (fclose(globalfile) != 0)
    errexit("%s: %s\n", indexdb, strerror(errno));
}

/*
 * usage -- print usage message and exit
 *
 * name is the name of the program, as it is to appear in the message.
 * The list of options must be kept in sync with the getopt() call in
 * main().
 */
static void usage(string name)
{
  errexit("Version %s\nUsage: %s [-i indexdb] [-b base] [-x] [-t] [-n] [-f] [-c userclass] [html-file]\n",
          VERSION, name);
}

/*
 * tokenize -- split string s into tokens at each comma, return an array
 *
 * Returns a NULL-terminated array on the heap with copies of the
 * tokens. s itself is not changed. An empty token results from two
 * adjacent commas or a leading comma, but not from a trailing one.
 * s must not be NULL or empty.
 */
static string * tokenize(string s)
{
  string *t = NULL;
  size_t len;
  int n = 0;

  assert(s && s[0]);
  while (*s) {
    len = strcspn(s, ",");
    renewarray(t, n + 1);
    t[n++] = newnstring(s, len);
    s += len;
    /* Only step over a comma, never over the end of the string */
    if (*s == ',') s++;
  }
  renewarray(t, n + 1);		/* Make final item NULL */
  t[n] = NULL;
  return t;
}

/*
 * main -- read a document, write it to stdout with an index inserted
 *
 * Options:
 *   -t          don't write <a name> after each ID
 *   -x          output as XML
 *   -b base     (relative) URL of the output, used in links to terms
 *   -i indexdb  file in which terms are kept between runs
 *   -c classes  comma-separated list of extra classes that mark terms
 *   -n          use section numbers instead of "#" as link text
 *   -f          remove the attributes that were used for the index
 *
 * The document is read from the file or URL given as argument, or
 * from stdin if there is no argument or the argument is "-".
 */
int main(int argc, char *argv[])
{
  Boolean write = True;
  Indexterm termtree = NULL;	/* Sorted tree of terms */
  string secno;
  int c;			/* Not char: must be able to hold -1 */

  /* Bind the parser callback routines to our handlers */
  set_error_handler(handle_error);
  set_start_handler(start);
  set_end_handler(end);
  set_comment_handler(handle_comment);
  set_text_handler(handle_text);
  set_decl_handler(handle_decl);
  set_pi_handler(handle_pi);
  set_starttag_handler(handle_starttag);
  set_emptytag_handler(handle_emptytag);
  set_endtag_handler(handle_endtag);

  yyin = NULL;

  /* Options b, i and c have an argument, hence the colons */
  while ((c = getopt(argc, argv, "txb:i:c:nf")) != -1)
    switch (c) {
    case 't': bctarget = False; break; /* Don't write <a name> after each ID */
    case 'x': xml = True; break;	/* Output as XML */
    case 'b': base = newstring(optarg); break; /* Set base of URL */
    case 'i': indexdb = newstring(optarg); break; /* Set name of index db */
    case 'c': userclassnames = tokenize(optarg); break; /* Set class names */
    case 'n': use_secno = True; break; /* Print section numbers instead of "#" */
    case 'f': final = True; break; /* "Final": remove used attributes */
    default: usage(argv[0]);
    }

  if (optind == argc || eq(argv[optind], "-")) yyin = stdin;
  else yyin = fopenurl(argv[optind], "r");

  if (yyin == NULL) {perror(argv[optind]); exit(1);}

  if (!base) base = newstring("");

  /* Read the index DB into memory */
  if (indexdb) load_index(indexdb, &termtree);

  /* Parse, build tree, collect existing IDs */
  if (yyparse() != 0) exit(3);

  /* Scan for index terms, add them to the tree, add IDs where needed */
  secno = newstring("#");
  collect(get_root(tree), &termtree, &secno);

  /* Write out the document, adding <A NAME> and replacing <!--index--> */
  expand(get_root(tree), &write, termtree);

  /* Store terms to file */
  if (indexdb) save_index(indexdb, termtree);

  fclose(yyin);
#if 0
  tree_delete(tree);				/* Just to test memory mgmt */
#endif
  return 0;
}
