/********************************************************************************************************
 * QRNA - Comparative analysis of biological sequences 
 *         with pair hidden Markov models, pair stochastic context-free
 *        grammars, and probabilistic evolutionary  models.
 *       
 * Version 2.0.0 (JUN 2003)
 *
 * Copyright (C) 2000-2003 Howard Hughes Medical Institute/Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ***********************************************************************************************************/

/* qrna_evolve.c
 * 
 * Given a sequence, generates another according to an evolutionary model
 *
 * ER, Wed Mar  1 13:24:00 CST 2000 [ST. Louis]
 * 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <time.h>

#include "funcs.h"
#include "globals.h"
#include "squid.h"
#include "structs.h"
#include "version.h"


#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

/* Command-line option table handed to Getopt().
 *
 * Each entry is { option name, TRUE, argument type }.  Options that take a
 * value are declared sqdARG_STRING; main() converts the numeric ones itself
 * with atoi()/atof().
 *
 * "-D" takes a <codfile> argument (see the usage text, and main() which
 * stores optarg into codonfile), so it must be declared as taking a string.
 */
static struct opt_s OPTIONS[] = {
  { "-a",        TRUE,  sqdARG_NONE},
  { "-C",        TRUE,  sqdARG_NONE},
  { "-c",        TRUE,  sqdARG_STRING},
  { "-D",        TRUE,  sqdARG_STRING},
  { "-h",        TRUE,  sqdARG_NONE},
  { "-i",        TRUE,  sqdARG_STRING},
  { "-l",        TRUE,  sqdARG_STRING},
  { "-m",        TRUE,  sqdARG_STRING},
  { "-O",        TRUE,  sqdARG_NONE},
  { "-o",        TRUE,  sqdARG_STRING},
  { "-p",        TRUE,  sqdARG_STRING},
  { "-q",        TRUE,  sqdARG_STRING},
  { "-R",        TRUE,  sqdARG_NONE},
  { "-t",        TRUE,  sqdARG_NONE},
  { "-v",        TRUE,  sqdARG_NONE},
};

/* Number of entries in OPTIONS, computed so the table can grow freely. */
#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))

/* Help text printed by -h and on command-line errors.
 * The defaults quoted here must match the initial values set in main():
 * tinit = 0, tmax = 12, maximum length = 5000, one alignment.
 */
static char usage[]  = "\
Usage: qrna_evolve [-options] <seqfile in> \n\
where options are:\n\
   -a             : print alignment \n\
   -C             : create a COD-like alignment (default is mut5px)\n\
   -c <cfgfile>   : <cfgfile> to use to train the rna model (default = tRNA+rRNA)\n\
   -D <codfile>   : include a file of coding-coding frequencies for the coding model\n\
   -h             : print short help and usage info\n\
   -i <tinit>     : minimum evolutionary time factor (default tinit=0)\n\
   -l <maxlenhit> : change the maxlenhit parameter (default 5000)\n\
   -m <tmax>      : maximum evolutionary time factor (default tmax=12)\n\
   -O             : create a OTH-like alignment (default is mut5px)\n\
   -o <outfile>   : direct structure-annotated sequence to <outfile>\n\
   -p <pamfile>   : <pamfile> to use (default = BLOSUM62)\n\
   -q <num>       : number of alignments to be created (default = 1)\n\
   -R             : create a RNA-like alignment (default is mut5px)\n\
   -t             : print traceback\n\
   -v             : verbose debugging output\n\
";

/* Program name printed at the top of the -h output. */
static char banner[] = "qrna_evolve";

/* Function: main()
 *
 * Purpose:  Driver for qrna_evolve.  Reads every sequence in <seqfile>, and
 *           for each evolutionary time from tinit to tmax (in steps of tinc)
 *           builds the models at that time and simulates num_align pairwise
 *           alignments with the selected model (-C, -R, -O, or the default
 *           mutation model).  Each alignment is printed to the output file;
 *           a per-time summary of identities/gaps/mutations goes to stdout.
 *
 * Args:     argc, argv - command line; see the usage string.
 *
 * Returns:  EXIT_SUCCESS.  Fatal problems terminate through Die().
 */
int
main(int argc, char **argv)
{
  char    *seqfile = NULL;        /* input sequence file                                    */
  SQFILE  *sqfp;                  /* open sequence file                                     */
  char    *seq;                   /* given sequence                                         */
  int     *iseq;                  /* given sequence, integer form                           */
  int     *isegX, *isegY;         /* generated sequences in integer form (with gaps)        */
  SQINFO   sqinfoX;               /* info structures for seqX                               */
  SQINFO   sqinfoY;               /* info structures for seqY                               */
  int     *ct;                    /* .ct notation for seq's RNA structure (if any)          */
  int      leg;                   /* length of a given alignment                            */
  int      maxlen;                /* length the working buffers were allocated for          */
  int      Lmax;                  /* length of the max alignment                            */
  int      format;                /* format of seq file                                     */

  char    *codonfile;             /* codon-codon joint frequencies                          */
  char    *hexapfile;             /* Hexamer joint frequencies                              */
  char    *pamfile;               /* PAM substitution matrix                                */
  char    *cfgfile;               /* RNA grammar file                                       */
  char    *ribofile;              /* RNA pair-to-pair probs                                 */

  struct   model_s *model;        /* transition + emission probs 3 models + null model      */
  struct   ali_s   *ali;          /* arrays to store the alignment created by full viterbi  */

  int     alignment;              /* TRUE prints alignment                                  */
  int     cod;                    /* TRUE create a COD-like  alignment                      */
  int     oth;                    /* TRUE create a OTH-like  alignment                      */
  int     rna;                    /* TRUE create a RNA-like  alignment                      */
  double  tfactor;                /* evolutionary time factor                               */
  double  tinc, tinit, tmax;      /* time increment, min and max times considered           */
  int     traceback;              /* TRUE to traceback alignment                            */
  int     pedantic;               /* TRUE do some checks for evolutionary models to debug   */
  int     verbose;                /* TRUE to be extremely verbose to debug                  */

  char   *outfile;                /* where to send the output                               */
  FILE   *ofp;                    /* open output file                                       */
  char   *optname;
  char   *optarg;
  char   *string_name;            /* label attached to each simulated alignment             */
  int     num, num_align;
  int     optind;
  int     seed;

  double  id;                     /* percentage identities in the alignments                */
  double  gap;                    /* percentage of gaps in the alignments                   */
  double  mut;                    /* percentage mutations in the alignments                 */

  double  first_pos;
  double  second_pos;
  double  third_pos;

  int     pairs;
  int     comp_pairs;
  int     noncomp_pairs;

  /* re-seed the random number generator.
   */
  seed = (int) time ((time_t *) NULL);
  sre_srandom(seed); /* reinit sre_random each time you shuffle a sequence */

  /* Parse command line
   */
  tinc     =  0.01;    /* time increments (if tinit < tmax)  */
  tinit    =  0.00;    /* default minimum evolutionary time  */
  tmax     = 12.00;    /* default maximum evolutionary time  */

  alignment = FALSE;      /* TRUE  ==  prints alignment                      */
  cod       = FALSE;      /* TRUE  ==  create a COD-like  alignment          */
  oth       = FALSE;      /* TRUE  ==  create a OTH-like  alignment          */
  rna       = FALSE;      /* TRUE  ==  create a RNA-like  alignment          */
  pedantic  = FALSE;      /* TRUE  ==  check your evolutionary models        */
  traceback = FALSE;      /* TRUE  ==  traceback alignment                   */
  verbose   = FALSE;      /* TRUE  ==  for debugging                         */
  Lmax      = 5000;

  cfgfile   = NULL;
  ribofile  = NULL;
  codonfile = NULL;
  hexapfile = NULL;       /* no option sets it; it must not reach ConstructModels() uninitialized */
  pamfile   = "BLOSUM62";
  outfile   = NULL;
  leg       = Lmax;
  num_align = 1;

  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
                &optind, &optname, &optarg))
    {
      if      (strcmp(optname, "-a") == 0)  alignment = TRUE;
      else if (strcmp(optname, "-C") == 0)  cod       = TRUE;
      else if (strcmp(optname, "-c") == 0)  cfgfile   = optarg;
      else if (strcmp(optname, "-D") == 0)  codonfile = optarg;
      else if (strcmp(optname, "-i") == 0)  tinit     = atof(optarg);
      else if (strcmp(optname, "-l") == 0)  leg       = atoi(optarg);
      else if (strcmp(optname, "-m") == 0)  tmax      = atof(optarg);
      else if (strcmp(optname, "-O") == 0)  oth       = TRUE;
      else if (strcmp(optname, "-o") == 0)  outfile   = optarg;
      else if (strcmp(optname, "-p") == 0)  pamfile   = optarg;
      else if (strcmp(optname, "-q") == 0)  num_align = atoi(optarg);
      else if (strcmp(optname, "-R") == 0)  rna       = TRUE;
      else if (strcmp(optname, "-t") == 0)  traceback = TRUE;
      else if (strcmp(optname, "-v") == 0)  verbose   = TRUE;
      else if (strcmp(optname, "-h") == 0)
        {
          puts(banner);
          printf("          %s (%s)", RELEASE, RELEASEDATE);
          printf(" using squid %s (%s)\n", squid_version, squid_date);
          puts(usage);
          exit(0);
        }
    }

  /* The averages printed below divide by num_align, and every working
   * buffer is sized from the -l value, so both must be positive.
   */
  if (num_align < 1)
    Die("The number of alignments (-q) must be at least 1, got %d\n", num_align);
  if (leg < 1)
    Die("The maximum length (-l) must be at least 1, got %d\n", leg);
  maxlen = leg;

  if (argc - optind == 1) {
    seqfile  = argv[optind];
  }
  else
    Die("Incorrect number of command line arguments.\n%s\n", usage);

  if (cfgfile == NULL) /* SCFG is not provided */
    cfgfile = FileConcat("", "mix_tied_linux.cfg");

  if (ribofile == NULL) /* RIBOPROB is not provided */
    ribofile = FileConcat("", "RIBOPROB85-60.mat");

  if (seqfile == NULL) {
    puts(usage);
    exit(0);
  }

  /* Open output file
   */
  ofp = stdout;
  if (outfile != NULL && (ofp = fopen(outfile, "w")) == NULL)
    Die("Failed to open output file %s", outfile);

  /* Open sequence file(s)
   */
  if (! SeqfileFormat(seqfile, &format, NULL))
    Die("Failed to determine format of sequence file %s\n", seqfile);
  if ((sqfp = SeqfileOpen(seqfile, format, NULL)) == NULL)
    Die("Failed to open sequence file %s", seqfile);

  /* Allocate space for the sequences
   */
  AllocCharSeq(maxlen, &seq);
  AllocIntSeq(maxlen, &iseq);
  AllocIntSeqs(maxlen, &isegX, &isegY);
  AllocCt(maxlen, &ct);
  AllocAli(maxlen, &ali);

  /* One label buffer is reused for every alignment.  It starts out as an
   * empty string so it is never read while uninitialized.
   */
  string_name = (char *) MallocOrDie(sizeof(char)*80);
  string_name[0] = '\0';

  /* Read seqX (sequence may have gaps)
   */
  while (ReadGapSeq(sqfp, format, &seq, &sqinfoX)) {
    leg = sqinfoX.len;

    /* The integer buffers were requested with maxlen positions (presumably
     * that is their capacity -- confirm against the Alloc* helpers), so a
     * longer sequence must not be written into them.
     */
    if (leg > maxlen)
      Die("Sequence of length %d exceeds the maximum length %d; raise it with -l\n", leg, maxlen);

    IntizeGapAsequence(seq, 0, leg, iseq, verbose);
    /*RemoveGaps(iseq, leg, &leg);*/

    /* Every sequence is evolved over the whole [tinit, tmax] range; without
     * this reset only the first sequence in the file would be processed.
     */
    tfactor = tinit;

    while (tfactor <= tmax) {
      /* Construct the models
       *    at a given evolutionary time "tfactor"
       */
      ConstructModels(ofp, codonfile, hexapfile, pamfile, cfgfile, ribofile, &model, leg, tfactor, FALSE, FALSE, pedantic, verbose);

      /* Statistics are reported per time step as averages over num_align
       * alignments, so they start from zero at each step.
       * NOTE(review): this assumes AlignStat() accumulates into id/gap/mut
       * (the division by num_align below suggests so) -- confirm.
       */
      id  = 0.0;
      gap = 0.0;
      mut = 0.0;

      for (num = 0; num < num_align; num++) {
        if (cod) {
          /* provisional label for the simulator; the final one needs its outputs */
          snprintf (string_name,  80, "%s", "cod");
          SimulateCODSequence(ofp, iseq, &sqinfoX, isegX, &sqinfoY, isegY, 0, &leg, model->cod,
                              model->null, ali, traceback, alignment, string_name, &first_pos, &second_pos, &third_pos);
          snprintf (string_name,  80, "%s_[%.2f/%.2f/%.2f]", "cod", first_pos, second_pos, third_pos);
        }
        else if (rna) {
          /* reuse the label buffer: allocating a new one per alignment leaked the old one */
          snprintf (string_name,  80, "%s", "rna");
          SimulateRNASequence(ofp, iseq, &sqinfoX, isegX, &sqinfoY, isegY, 0, leg, model->rna,
                              ali, ct, traceback, alignment, string_name, &pairs, &comp_pairs, &noncomp_pairs);
          snprintf (string_name,  80, "%s_[%d/%d]", "rna", pairs, comp_pairs);
        }
        else if (oth) {
          snprintf (string_name,  80, "%s", "oth");
          SimulateOTHSequence(ofp, iseq, &sqinfoX, isegX, &sqinfoY, isegY, 0, &leg, model->oth, ali,
                              traceback, alignment, string_name);
        }
        else {/* default: mutation model, driven by model->oth->mem */
          snprintf (string_name,  80, "%s", "mut");
          RemoveGaps(iseq, leg, &leg, verbose);
          SimulateMutSequence(ofp, iseq, &sqinfoX, isegX, &sqinfoY, isegY, 0, leg, model->oth->mem, ali,
                              alignment, string_name);
        }

        AlignStat(ofp, &sqinfoX, isegX, &sqinfoY, isegY, leg, &id, &gap, &mut);
        PrintAlignSequences(ofp, &sqinfoX, &sqinfoY, string_name, tfactor, 0, leg, isegX, isegY, ali);

        /* the simulators may change leg; restore the input length for the next alignment */
        leg = sqinfoX.len;
      }

      /* The model belongs to this time step only, and this is the only place it is released. */
      FreeModels(model);

      /* Report the summary under the time it was computed for, i.e. before
       * tfactor moves on to the next step.
       */
      printf("# align %d\n", num_align);
      printf("# time  #id    #gap   #mut   #(id+mut)\n");
      printf("%.2f \t %.4f \t %.4f \t %.4f \t %.4f \n",
             tfactor, id/num_align, gap/num_align, mut/num_align, (id+mut)/num_align);

      tfactor += tinc; /* increment the evolutionary time */
    }
  }

  /* Cleanup.  The output stream is closed exactly once, and the models are
   * not freed here because each one was already released in the time loop.
   */
  if (outfile != NULL) fclose(ofp);
  SeqfileClose(sqfp);
  free(seq);
  free(iseq);
  free(isegX);
  free(isegY);
  free(ct);
  FreeAli(ali);

  free (string_name);

  return EXIT_SUCCESS;
}









