/********************************************************************************************************
 * QRNA - Comparative analysis of biological sequences 
 *         with pair hidden Markov models, pair stochastic context-free
 *        grammars, and probabilistic evolutionary  models.
 *       
 * Version 2.0.0 (JUN 2003)
 *
 * Copyright (C) 2000-2003 Howard Hughes Medical Institute/Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ***********************************************************************************************************/

/* cfgbuild.c
 *
 * Build an RNA SCFG from known structures.
 *
 * ER, Fri Jun 18 15:46:09 CDT 1999
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "funcs.h"
#include "globals.h"
#include "squid.h"
#include "structs.h"
#include "version.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif


/* Option table handed to Getopt() in main().
 * Each entry is { option name, TRUE/FALSE flag, argument type }; in this
 * table the flag is TRUE for the single-dash options and FALSE for the
 * double-dash ones.
 *
 * NOTE(review): "--counts" is accepted here, but main() has no branch that
 * handles it and the usage text does not list it -- confirm whether it
 * should be implemented or dropped.
 */
static struct opt_s OPTIONS[] = {
  { "-h",        TRUE,  sqdARG_NONE   },   /* help                            */
  { "-t",        TRUE,  sqdARG_NONE   },   /* tied distributions              */
  { "--boot",    FALSE, sqdARG_STRING },   /* bootstrap confidence file       */
  { "--counts",  FALSE, sqdARG_STRING },   /* (no handler in main)            */
  { "--lod",     FALSE, sqdARG_STRING },   /* log-odds SCFG file              */
  { "--rdb",     FALSE, sqdARG_STRING },   /* counts RDB file                 */
  { "--summary", FALSE, sqdARG_STRING },   /* by-node summary RDB file        */
  { "-k",        TRUE,  sqdARG_NONE   },   /* allow pseudoknots               */
  { "-o",        TRUE,  sqdARG_STRING },   /* output file                     */
  { "-p",        TRUE,  sqdARG_NONE   },   /* print traceback                 */
  { "-d",        TRUE,  sqdARG_NONE   },   /* debugging traceback             */
};

/* Number of entries in OPTIONS[] */
#define NOPTIONS (sizeof(OPTIONS) / sizeof(OPTIONS[0]))

/* Help text shown for -h and appended to the command-line error message. */
static char usage[] =
  "Usage: cfgbuild [-options] <SCFG outfile> <seqfile in> [<seqfile in> ...]\n"
  "Seqfile(s) in SELEX or SQUID format, with secondary structure annotation.\n"
  "Available options are:\n"
  "   -h               : print short help and usage info\n"
  "   -t               : use tied distributions\n"
  "   --boot <file>    : determine bootstrap confidences; output to <file>\n"
  "   --lod <file>     : save log-odds form SCFG to RDB <file>\n"
  "   --rdb <file>     : save counts to RDB database <file>\n"
  "   --summary <file> : save by-node summary to RDB <file>\n"
  "   -k               : allow pseudoknots\n"
  "   -o <outfile>     : direct score and annotated sequence to <outfile>\n"
  "   -p               : print traceback\n"
  "   -d               : print each individual step of the traceback for debugging\n";

/* One-line program description printed ahead of the version line. */
static char banner[] = "cfgbuild - build RNA SCFG from known structures";

/* Forward declarations of the file-local reporting helpers defined after main(). */

/* Report the nonzero and free parameter counts on fp. */
static void print_free_parameters(FILE *fp,
				  int **enslave,
				  int allow_pseudoknots);

/* Write the bootstrap confidence table to bootfile. */
static void print_bootstrap(char *bootfile,
			    double **cfg,
			    double **pcfg,
			    double *perstate,
			    int **enslave);

/* Function: main()
 *
 * Purpose:  Train an RNA SCFG from structure-annotated sequence files.
 *
 *           The first non-option argument names the SCFG save file; every
 *           remaining argument is a sequence file. Each sequence that
 *           carries a structure (SQINFO_SS) and passes VerifyKHS() is
 *           converted to a trace with KHS2Trace() and counted into the
 *           model with TraceCount(). The counts-form model is then saved,
 *           and a log2 copy, the RDB tables and the bootstrap table are
 *           written when the corresponding options were given.
 *
 * Args:     argc, argv - command line; see the usage[] text.
 *
 * Return:   EXIT_SUCCESS. Failures to open files, to determine a file
 *           format or to save the SCFG are reported through Die().
 */
int
main(int argc, char **argv)
{ 
  char             *seq;        /* example RNA sequence                            */
  SQINFO         sqinfo;        /* info for seq, including structure               */
  struct tracekn_s  *tr;        /* trace of this RNA structure                     */
  char         *seqfile;        /* input sequence file                             */
  SQFILE          *sqfp;        /* open sequence file                              */
  int            format;        /* format of sequence file                         */
  double          **cfg;        /* grammar being learned, counts                   */
  double         **pcfg;        /* copy of cfg passed through Log2ProbSCFG()       */
  char         *cfgfile;        /* save file for SCFG                              */
  char         *outfile;        /* where to send the output                        */
  FILE             *ofp;        /* per-sequence report stream: stdout or outfile   */
  FILE          *savefp;        /* stream for each model/table save file           */
  int             nfile;        /* number of file                                  */
  int              nseq;        /* number of sequence                              */
  double       *basefreq;       /* the joint base frequencies                      */
  char         *rdbfile;        /* option: RDB database of counts                  */
  char        *summfile;        /* option: RDB, by-node count summary              */
  char         *lodfile;        /* option: RDB database of lod model               */
  char        *bootfile;        /* option: save bootstrap confidence data          */
  int         **enslave;        /* if non-NULL, gives parameter tying              */
  double       *perstate;       /* array of count sums per source state            */
  int            nbases;        /* total # of nucleotides trained on               */
  int allow_pseudoknots;        /* TRUE  == allow pseudoknots                      */
 
  char *optname;
  char *optarg; 
  int   optind;
  int   printtrace;             /* TRUE == call GraphicTrace() for each sequence   */
  int   debug;                  /* passed to KHS2Trace()                           */

#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
  orig_size = malloc_size(&histid1);
#endif

  /*********************************************** 
   * Parse command line
   ***********************************************/

  lodfile     = NULL;
  rdbfile     = NULL;
  summfile    = NULL;
  bootfile    = NULL;
  enslave     = NULL;
  outfile     = NULL;
  printtrace  = FALSE;
  debug       = FALSE;


  allow_pseudoknots = FALSE;   /* TRUE  ==  allow pseudoknots */

  /* Options listed in OPTIONS[] that have no branch below are accepted
   * and ignored.
   */
  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
		&optind, &optname, &optarg))
    {
      if      (strcmp(optname, "-t")         == 0) enslave            = EnslaveTrans();
      else if (strcmp(optname, "--boot")     == 0) bootfile           = optarg;
      else if (strcmp(optname, "--lod")      == 0) lodfile            = optarg;
      else if (strcmp(optname, "--rdb")      == 0) rdbfile            = optarg;
      else if (strcmp(optname, "--summary")  == 0) summfile           = optarg;
      else if (strcmp(optname, "-k")         == 0) allow_pseudoknots  = TRUE;
      else if (strcmp(optname, "-o")         == 0) outfile            = optarg;
      else if (strcmp(optname, "-p")         == 0) printtrace         = TRUE;
      else if (strcmp(optname, "-d")         == 0) debug              = TRUE;
      else if (strcmp(optname, "-h")         == 0) 
	{
	  puts(banner);
	  printf("         cfgbuild %s (%s)", RELEASE, RELEASEDATE);
	  printf(" using squid %s (%s)\n", squid_version, squid_date);
	  puts(usage);
	  exit(0);
	}
    }

  /* Need the SCFG save file plus at least one sequence file. */
  if (argc - optind < 2)
    Die("Incorrect number of command line arguments.\n%s\n", usage);
  cfgfile  = argv[optind++]; 

  /*********************************************** 
   * Print banner
   ***********************************************/

  puts(banner);
  printf("         cfgbuild %s (%s)", RELEASE, RELEASEDATE);
  printf(" using squid %s (%s)\n", squid_version, squid_date);
  printf("---------------------------------------------------\n");
  
  /*********************************************** 
   * Open output file.
   ***********************************************/
  ofp = stdout;
  if (outfile != NULL && (ofp = fopen(outfile, "w")) == NULL)
    Die("Failed to open output file %s", outfile);
  
  /*********************************************** 
   * Read structures and count their transitions into a SCFG
   ***********************************************/
  cfg = AllocSCFG();
  nfile  = 0;
  nbases = 0;
  nseq = 0;
  basefreq = AllocBaseFreq();

  for (; optind < argc; optind++)
    {
      seqfile = argv[optind];
      nfile++;
      printf("Database #%d. %s\n", nfile, seqfile);

      if (! SeqfileFormat(seqfile, &format, NULL))
	Die("Failed to determine format of sequence file %s\n", seqfile);
      if ((sqfp = SeqfileOpen(seqfile, format, NULL)) == NULL)
	Die("Failed to open sequence file %s", seqfile);

      while (ReadSeq(sqfp, format, &seq, &sqinfo))
	{
	  s2upper(seq);
	  StripDegeneracy(seq, &sqinfo.len);

	  /* Sequences without a usable structure are skipped with a warning. */
	  if (!(sqinfo.flags & SQINFO_SS))
	    Warn("  Sequence %s has no structure given" , sqinfo.name);
	  else if (!VerifyKHS(sqinfo.name, sqinfo.ss, sqinfo.len, FALSE))
	    Warn("  Sequence %s structure fails validation", sqinfo.name);
	  else 
	    {
	      WriteSeqinfo(ofp, &sqinfo, nfile, nseq, nbases);

	      if (KHS2Trace(ofp, seq, sqinfo.ss, sqinfo.len, &tr, allow_pseudoknots, debug))
		{
		  TraceCount(seq, sqinfo.len, 1.0, tr, cfg);
		  nseq++;
		  nbases += sqinfo.len;

		  BaseCompChar(ofp, seq, sqinfo.len-1, sqinfo.len-1, basefreq);

		  /* current contents of the basefreq accumulator */
		  printf("added counts: A = %f, C = %f, G = %f, U = %f\n", 
			 basefreq[0], basefreq[1], basefreq[2], basefreq[3]);
		}
	      else
		Warn("  Sequence %s structure must be really broken\n", 
		     sqinfo.name);

	      /* NOTE(review): tr is printed and freed even when KHS2Trace()
	       * reported failure; this presumes KHS2Trace() always hands back
	       * a trace through &tr -- confirm.
	       */
	      if (printtrace) GraphicTrace(ofp, tr, seq, sqinfo.len);
	      FreeTracekn(tr);
	    }
	  FreeSequence(seq, &sqinfo);
	}
      putc('\n', stdout);
      fflush(stdout);
      SeqfileClose(sqfp);
    }

  /* The per-sequence report is complete. Close it now: nothing below
   * writes to it, and the save files use their own stream (savefp), so
   * the -o stream is no longer lost by being overwritten.
   */
  if (outfile != NULL) fclose(ofp);

  /* normalize counts  
   */
  BaseCompNorm(nbases, basefreq);
  
  /*********************************************** 
   * Save stuff
   ***********************************************/

  /* Convert counts to probabilities, but keep the old counts-based model
   * so we can do some statistics and tables from it.
   */
  perstate = CountsPerState(cfg);

  if (enslave != NULL) TieCounts(cfg, enslave); 

  pcfg = DupSCFG(cfg);
  
  /* log2 prob         
   */
  Log2ProbSCFG(pcfg);
  
 /* Print useful information
  */
  printf("Structure files:     %d\n", nfile);
  printf("Total # structures:  %d\n", nseq);
  printf("Total # nucleotides: %d\n", nbases);
  printf("Total frequencies: A = %f, C = %f, G = %f, U = %f\n", 
	 basefreq[0], basefreq[1], basefreq[2], basefreq[3]);
  printf("Total # transitions: %.0f\n", DSum(perstate, NDPS));

  print_free_parameters(stdout, enslave, allow_pseudoknots);
  if (bootfile != NULL) print_bootstrap(bootfile, cfg, pcfg, perstate, enslave);


  /* save counts data to RDB database if we're asked
   */ 
  if (rdbfile != NULL)
    {
      if ((savefp = fopen(rdbfile, "w")) == NULL)
	Die("failed to open RDB database %s for write", rdbfile);
      WriteRdbSCFG(savefp, cfg);
      fclose(savefp);
      printf("Counts saved to RDB file:  %s\n", rdbfile);
    }

  /* Save a counts summary if we're asked:
   * transitions summarized by nonterminal node types, instead
   * of individual states. 
   */
  if (summfile != NULL) 
    {
      if ((savefp = fopen(summfile, "w")) == NULL)
	Die("failed to open RDB database %s for write", summfile);
      WriteRdbSummary(savefp, cfg);
      fclose(savefp);
      printf("Summary saved to RDB file: %s\n", summfile);
    }
  
  /* Save the SCFG itself (in counts form).
   */
  if ((savefp = fopen(cfgfile, "w")) == NULL)
    Die("failed to open SCFG save file %s", cfgfile);
  if (!SaveSCFG(savefp, cfg))
    Die("failed to save SCFG");
  fclose(savefp);

  /* Save the log2 form of the model if we're asked.
   */
  if (lodfile != NULL)
    {
      if ((savefp = fopen(lodfile, "w")) == NULL)
	Die("failed to open log odds file %s for write", lodfile);
      WriteRdbLogSCFG(savefp, pcfg);
      fclose(savefp);
      printf("Log2 SCFG in RDB database: %s\n", lodfile); 
    }

  /* NOTE(review): perstate (and enslave, when -t was given) are not
   * released here -- confirm whether CountsPerState() and EnslaveTrans()
   * return memory the caller is expected to free.
   */
  free(basefreq);
  FreeSCFG(cfg);
  FreeSCFG(pcfg);

#ifdef MEMDEBUG
  current_size = malloc_size(&histid2);
  if (current_size != orig_size)
    malloc_list(2, histid1, histid2);
  else
    fprintf(stderr, "[No memory leaks]\n");
#endif
  return EXIT_SUCCESS;
}





/* Function: print_free_parameters()
 *
 * Purpose:  Report the nonzero and free parameter counts obtained from
 *           CountFreeParameters(). Without tying (enslave == NULL) only
 *           the untied counts are printed; with tying, each line shows the
 *           untied count followed by the tied one.
 *
 * Args:     fp                - stream to print to
 *           enslave           - parameter tying, or NULL for none
 *           allow_pseudoknots - passed through to CountFreeParameters()
 */
static void
print_free_parameters(FILE *fp, int **enslave, int allow_pseudoknots)
{
  int free_untied;
  int nonzero_untied;
  int free_tied;
  int nonzero_tied;

  /* The untied counts are needed in both output forms. */
  CountFreeParameters(NULL, &free_untied, &nonzero_untied, allow_pseudoknots);

  if (enslave == NULL)
    {
      fprintf(fp, "Nonzero parameters: %5d\n", nonzero_untied);
      fprintf(fp, "Free parameters:    %5d\n", free_untied);
      return;
    }

  CountFreeParameters(enslave, &free_tied, &nonzero_tied, allow_pseudoknots);
  fprintf(fp, "Nonzero parameters: %5d -- tied --> %5d\n", nonzero_untied, nonzero_tied);
  fprintf(fp, "Free parameters:    %5d -- tied --> %5d\n", free_untied, free_tied);
}

/* Function: print_bootstrap()
 *
 * Purpose:  Write a table of bootstrap confidence bounds, one line per
 *           transition for which Connects(i, Ntype(i,j)) holds. Each line
 *           shows the model value, the high and low bounds returned by
 *           BootstrapConfidence(), half the bound width as a percentage of
 *           the value, the count for the transition and the count sum for
 *           its source state.
 *
 * Args:     bootfile - file to write; if NULL the table goes to stdout
 *           cfg      - counts-form model (printed count column)
 *           pcfg     - model passed to BootstrapConfidence() and printed
 *                      as "val"; main() passes it after Log2ProbSCFG()
 *           perstate - count sums per source state
 *           enslave  - parameter tying, passed to BootstrapConfidence()
 *
 * Return:   (void). Dies if bootfile cannot be opened for writing.
 */
static void
print_bootstrap(char *bootfile, double **cfg, double **pcfg, double *perstate, int **enslave)
{
  FILE    *fp;
  double **high;
  double **low;
  int      i, j;
  
  if (bootfile != NULL)
    {
      /* The message has a %s conversion, so the file name must be passed. */
      if ((fp = fopen(bootfile, "w")) == NULL)
	Die("Failed to open bootfile %s for saving bootstrap", bootfile);
    }
  else fp = stdout;

  /* 1000 and 0.95: presumably the number of resamples and the confidence
   * level -- confirm against BootstrapConfidence().
   */
  BootstrapConfidence(pcfg, perstate, enslave, 1000, 0.95, &high, &low);

  /* NOTE(review): j is a transition index (bounded by Ntrans[i]), yet it
   * is used to index stNAME[] for the second column; confirm that this is
   * the intended label rather than stNAME[Ntype(i,j)].
   */
  for (i = 0; i < NDPS; i++)
    for (j = 0; j < Ntrans[i]; j++)
      if (Connects(i,Ntype(i,j)))
	fprintf(fp, "%4s > %4s  val: %.3f  hi: %.3f lo: %.3f  +-: %3.0f %6.0f %6.0f\n",
		stNAME[i], stNAME[j], pcfg[i][j],
		high[i][j], low[i][j], 
		50. * (high[i][j] - low[i][j]) / pcfg[i][j],
		cfg[i][j], perstate[i]); 

  if (bootfile != NULL) fclose(fp);
  FreeSCFG(high);
  FreeSCFG(low);
}
