/********************************************************************************************************
 * QRNA - Comparative analysis of biological sequences 
 *         with pair hidden Markov models, pair stochastic context-free
 *        grammars, and probabilistic evolutionary  models.
 *       
 * Version 2.0.0 (JUN 2003)
 *
 * Copyright (C) 2000-2003 Howard Hughes Medical Institute/Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ***********************************************************************************************************/

/*
 * rnamat.c
 *
 * Routines for dealing with RNA substitution/transition matrices.
 *
 * Elena Rivas, 
 * Tue Jan 21 16:11:37 CST 2003
 *
 * modified from :
 *
 * rnamat.c (part of rsearch)
 * Robert J. Klein
 * February 25, 2002
 */

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <float.h>
#include "squid.h"
#include "msa.h"
#include "structs.h"
#include "rnamat.h"
#include "nstack.h"        /* Sean's stack routines */

/* File-local helpers; all four are called from CountMatrixER() and are
 * defined at the end of this file.
 */
static void  count_background_nts(char *segX, char *segY, int alen, double *background_nt, double wgt);
static void  count_paired_cols  (char *segX, char *segY, int alen, matrix_t *pairedmat,   int *ctX, int *ctY, double wgt);
static void  count_unpaired_cols(char *segX, char *segY, int alen, matrix_t *unpairedmat, int *ctX, int *ctY, double wgt);
static float simple_identity(char *s1, char *s2);

/* Function: AllocFullMatrix()
 * Date:     ER, Wed Jan 22 14:20:31 CST 2003 [St. Louis]
 *           
 *
 * Purpose:  allocate fullmat structure
 *
 * Args:     
 *
 * Returns:  void
 *           fullmat is allocated here
 */
fullmat_t *
AllocFullMatrix (int L) 
{
  fullmat_t *fullmat;
  
  fullmat = (fullmat_t *) MallocOrDie(sizeof(fullmat_t));

  fullmat->name = (char *) MallocOrDie(sizeof(char)*40);       /* More than enough */

  fullmat->unpaired = AllocMatrix (L);
  fullmat->paired   = AllocMatrix (L*L);

  return fullmat;
} 

fullcondmat_t *
AllocFullCondMatrix (int L) {

  fullcondmat_t *fullcondmat;

  fullcondmat = (fullcondmat_t *) MallocOrDie(sizeof(fullcondmat_t));
 
  fullcondmat->name = (char *) MallocOrDie(sizeof(char)*40);       /* More than enough */

  fullcondmat->marg = AllocCondMatrix (L);
  fullcondmat->cond = AllocCondMatrix (L*L);

  return fullcondmat;
} 

/* Function: AllocMatrix()
 *
 * Purpose:  Allocate a matrix_t with edge size L. Storage is sized for
 *           L*(L+1)/2 entries (one triangle including the diagonal).
 *           All entries, and the E and H fields, are set to 0.0.
 *
 * Returns:  pointer to the newly allocated matrix_t.
 */
matrix_t *
AllocMatrix (int L)
{
  matrix_t *m = (matrix_t *) MallocOrDie(sizeof(matrix_t));
  int       k;

  m->edge_size = L;
  m->full_size = L*(L+1)/2;
  m->E         = 0.0;
  m->H         = 0.0;

  m->matrix = (double *) MallocOrDie (sizeof(double) * m->full_size);
  for (k = m->full_size - 1; k >= 0; k--)
    m->matrix[k] = 0.0;

  return m;
}

/* Function: AllocCondMatrix()
 *
 * Purpose:  Allocate a square condmatrix_t of size L, i.e. L*L entries
 *           stored contiguously, all initialized to 0.0.
 *
 * Returns:  pointer to the newly allocated condmatrix_t.
 */
condmatrix_t *
AllocCondMatrix (int L)
{
  condmatrix_t *cm = (condmatrix_t *) MallocOrDie(sizeof(condmatrix_t));
  int           ncells;
  int           k;

  cm->size   = L;
  cm->matrix = (double *) MallocOrDie (sizeof(double) * cm->size * cm->size);

  ncells = cm->size * cm->size;
  for (k = 0; k < ncells; k++)
    cm->matrix[k] = 0.0;

  return cm;
}

/* Function: CalculateJoint()
 * Date:     ER, Thu Jan 23 17:29:56 CST 2003 [St. Louis]
 *
 * Purpose:  Print the unpaired and paired matrices of fullmat to fp.
 *           If log2odds is nonzero, both matrices are first passed
 *           through Log2toOddsMatrix() using background_nt.
 *           The work is done on a private copy, so fullmat itself is
 *           not modified.
 *
 * Args:     fp            - output stream
 *           fullmat       - matrices to print
 *           background_nt - background values handed to Log2toOddsMatrix()
 *           log2odds      - nonzero to convert to log-odds before printing
 *
 * Returns:  void
 */
void
CalculateJoint (FILE *fp, fullmat_t *fullmat, double *background_nt, int log2odds)
{
  fullmat_t *work = CopyFullMatrix (fullmat);

  if (log2odds)
    {
      Log2toOddsMatrix (work->unpaired, background_nt);
      Log2toOddsMatrix (work->paired,   background_nt);
    }

  PrintFullMatrix (fp, work);
  FreeFullMatrix (work);
}

/* Function: CalculateConditionals()
 * Date:     ER, Thu Jan 23 17:29:56 CST 2003 [St. Louis]
 *           
 *
 * Purpose:  Starting from the log2 values stored in fullmat->paired,
 *           build and print two conditional tables:
 *             - "CONDbyMUT":  indexed by (lmut, rmut), where
 *                             lmut = idx(xl,yl) and rmut = idx(xr,yr);
 *             - "CONDbyPAIR": indexed by (xpair, ypair), where
 *                             xpair = idx(xl,xr) and ypair = idx(yl,yr).
 *           For each table the marginal is accumulated in probability
 *           space, converted to log2, and subtracted from the joint
 *           log2 value to obtain the conditional.
 *           The idx() and matrix_index() macros are defined elsewhere.
 *
 * Args:     fp      - output stream for the printed tables
 *           fullmat - source matrices; only read here
 *
 * Returns:  void
 *           All temporaries are freed before returning.
 */
void
CalculateConditionals (FILE *fp, fullmat_t *fullmat)
{
  fullcondmat_t *cond_mut1;
  fullcondmat_t *cond_mut2;
  fullcondmat_t *cond_pair;
  condmatrix_t  *dep_mut;
  condmatrix_t  *dep_pair;
  double     val, log2val;
  double     valm, log2valm;
  double     val1, val2;
  int        L;
  int        L2;
  int        xl, yl;
  int        xr, yr;
  int        xpair, ypair;
  int        xpairm, ypairm;
  int        lmut, rmut;
  int        idx;
  int        idxm;
  int        i, j;
  int        islog2;
  
  L = fullmat->unpaired->edge_size;
  L2 = L*L;
  
  /* marg matrices hold L*L = L2 entries; cond matrices hold L2*L2 entries */
  cond_mut1 = AllocFullCondMatrix(L);
  cond_mut2 = AllocFullCondMatrix(L);
  cond_pair = AllocFullCondMatrix(L);

  dep_mut  = AllocCondMatrix(L2);
  dep_pair = AllocCondMatrix(L2);

  /* names are truncated to fit the 40-char buffers from AllocFullCondMatrix() */
  snprintf (cond_mut1->name, 40, "%s%s", fullmat->name, "-CONDbyMUT");
  snprintf (cond_mut2->name, 40, "%s%s", fullmat->name, "-CONDbyMUT");
  snprintf (cond_pair->name, 40, "%s%s", fullmat->name, "-CONDbyPAIR");
  
  /* Pass 1: copy the joint log2 values into the cond tables and
   * accumulate the marginals (as probabilities, via EXP2).
   * cond_mut2 uses the mirrored pairs (xr,xl) and (yr,yl).
   */
  for (xl = 0; xl < L; xl++) 
    for (xr = 0; xr < L; xr++) 
      for (yl = 0; yl < L; yl++) 
	for (yr = 0; yr < L; yr++) {
	  
	  xpair = idx(xl,xr);
	  ypair = idx(yl,yr);

	  xpairm = idx(xr,xl);
	  ypairm = idx(yr,yl);

	  lmut = idx(xl,yl);
	  rmut = idx(xr,yr);
	  
	  idx  = xpair * L2 + ypair;
	  idxm = lmut  * L2 + rmut;

	  log2val  = fullmat->paired->matrix[matrix_index(xpair,ypair)];
	  log2valm = fullmat->paired->matrix[matrix_index(xpairm,ypairm)];

	  val  = EXP2(log2val);
	  valm = EXP2(log2valm);
	 
	  cond_mut1->cond->matrix[idxm] = log2val;
	  cond_mut2->cond->matrix[idxm] = log2valm;
	  cond_pair->cond->matrix[idx]  = log2val;
  
	  cond_mut1->marg->matrix[lmut]  += val;
	  cond_mut2->marg->matrix[lmut]  += valm;
	  cond_pair->marg->matrix[xpair] += val;
	}

  /* check the marginal probabilities */
  CheckSingleProb(cond_mut1->marg->matrix,  L2);
  CheckSingleProb(cond_mut2->marg->matrix,  L2);
  CheckSingleProb(cond_pair->marg->matrix, L2);
  
  /* convert to log2 */
  DLog2(cond_mut1->marg->matrix, L2);
  DLog2(cond_mut2->marg->matrix, L2);
  DLog2(cond_pair->marg->matrix, L2);
  
  /* Pass 2: conditional = joint - marginal (all in log2 space) */
  for (xl = 0; xl < L; xl++) 
    for (xr = 0; xr < L; xr++) 
      for (yl = 0; yl < L; yl++) 
	for (yr = 0; yr < L; yr++) {
	  
	  xpair = idx(xl,xr);
	  ypair = idx(yl,yr);

	  lmut = idx(xl,yl);
	  rmut = idx(xr,yr);
	  
	  idx  = xpair * L2 + ypair;
	  idxm = lmut  * L2 + rmut;
	  
	  cond_mut1->cond->matrix[idxm] -= cond_mut1->marg->matrix[lmut]; 
	  cond_mut2->cond->matrix[idxm] -= cond_mut2->marg->matrix[lmut]; 
	  cond_pair->cond->matrix[idx]  -= cond_pair->marg->matrix[xpair];
	  
	}

 /* merge both conditional for the mutation case into cond_mut1:
  * each entry becomes LOG2(val1+val2) - 1.0, i.e. the log2 of the
  * average of the two probabilities. cond_mut2 is left unchanged.
   */
  for (i = 0; i < L2; i++) {
    
    val1 = EXP2(cond_mut1->marg->matrix[i]);
    val2 = EXP2(cond_mut2->marg->matrix[i]);
    
    cond_mut1->marg->matrix[i] = LOG2(val1+val2) -1.0;
    
    for (j = 0; j < L2; j++) {
      idx = i * L2 + j;
      
      val1 = EXP2(cond_mut1->cond->matrix[idx]);
      val2 = EXP2(cond_mut2->cond->matrix[idx]);

      cond_mut1->cond->matrix[idx] = LOG2(val1+val2) -1.0;
    }
  }

    /* check the conditional probabilities, one row of L2 entries at a time */
  for (i = 0; i < L2; i++) 
    CheckSingleLog2Prob(cond_pair->cond->matrix + i*L2, L2);  
  for (i = 0; i < L2; i++) 
    CheckSingleLog2Prob(cond_mut1->cond->matrix + i*L2, L2);
  for (i = 0; i < L2; i++) 
    CheckSingleLog2Prob(cond_mut2->cond->matrix + i*L2, L2);

     

  /* Check of Independence: conditional minus the marginal of the
   * conditioned-on index (rmut / ypair), in log2 space.
   * The results are only printed by the disabled block below.
   */
  for (xl = 0; xl < L; xl++) 
    for (xr = 0; xr < L; xr++) 
      for (yl = 0; yl < L; yl++) 
	for (yr = 0; yr < L; yr++) { 

	  
	  xpair = idx(xl,xr);
	  ypair = idx(yl,yr);

	  lmut = idx(xl,yl);
	  rmut = idx(xr,yr);
	  
	  idx  = xpair * L2 + ypair;
	  idxm = lmut  * L2 + rmut;

	  dep_mut->matrix[idxm] = cond_mut1->cond->matrix[idxm] - cond_mut1->marg->matrix[rmut];
	  dep_pair->matrix[idx] = cond_pair->cond->matrix[idx]  - cond_pair->marg->matrix[ypair];
	}
	 
  /* print the merged mutation table and the pair table, as log2 values */
  islog2 = TRUE;
  PrintFullCondMatrix(fp, cond_mut1, FALSE, islog2);
  PrintFullCondMatrix(fp, cond_pair, TRUE,  islog2);

  /* disabled debugging output of the independence check */
  if (0) {
    PrintCondMatrix    (fp, dep_mut,   FALSE, TRUE, cond_mut1->name);
    PrintCondMatrix    (fp, dep_pair,  TRUE,  TRUE, cond_pair->name);
  }
  
  
  FreeFullCondMatrix(cond_mut1);
  FreeFullCondMatrix(cond_mut2);
  FreeFullCondMatrix(cond_pair);

  FreeCondMatrix(dep_mut);
  FreeCondMatrix(dep_pair);
}

/* Function: CheckSingleProb()
 *
 * Purpose:  Sanity-check a vector of probabilities (not logs).
 *           Dies if any entry is below -MARGIN, or if the sum of all
 *           entries falls outside [accuracy, 2 - accuracy].
 *           MARGIN and accuracy are defined elsewhere in the project.
 *
 * Args:     psingle - vector of probabilities
 *           size    - number of entries in psingle
 *
 * Returns:  void; does not return on failure (calls Die()).
 */
void
CheckSingleProb(double *psingle, int size)
{
  int    x;
  double prob;
  double sum = 0.0;

  for (x = 0; x < size; x++) {
    prob = psingle[x];
    if (prob < -MARGIN) 
      Die ("CheckSingleProb(): probabilities are getting too small here. P = %f", prob);
    sum += prob;
  }

  if (sum > 2. - accuracy || sum < accuracy) Die ("sum_x P(x) is %f\n", sum);
}

/* Function: CheckSingleLog2Prob()
 *
 * Purpose:  Sanity-check a vector of log2 probabilities. Each entry is
 *           exponentiated with EXP2(); dies if a resulting value is
 *           below -MARGIN, or if the values do not sum to within
 *           [accuracy, 2 - accuracy].
 *           MARGIN and accuracy are defined elsewhere in the project.
 *
 * Args:     psingle - vector of log2 probabilities
 *           size    - number of entries in psingle
 *
 * Returns:  void; does not return on failure (calls Die()).
 */
void
CheckSingleLog2Prob(double *psingle, int size)
{
  double total = 0.0;
  double p;
  int    k;

  for (k = 0; k < size; k++)
    {
      p = EXP2(psingle[k]);
      if (p < -MARGIN)
	Die ("CheckSingleLog2Prob(): probabilities are getting too small here. P = %f", p);
      total += p;
    }

  if (total < accuracy || total > 2. - accuracy)
    Die ("sum_x P(x) is %f\n", total);
}

/* Function: CopyFullMatrix()
 * Date:     ER, Mon Jan 27 12:34:27 CST 2003 [St. Louis]
 *
 * Purpose:  Make a deep copy of a fullmat_t: the name (truncated to
 *           fit the 40-char buffer) and both the unpaired and paired
 *           matrices. Note that CopyMatrix() does not copy E and H.
 *
 * Args:     fullmat - structure to copy; not modified
 *
 * Returns:  pointer to the copy, allocated here with AllocFullMatrix().
 */
fullmat_t *
CopyFullMatrix (fullmat_t *fullmat)
{
  fullmat_t *dup = AllocFullMatrix (fullmat->unpaired->edge_size);

  snprintf (dup->name, 40, "%s", fullmat->name);

  CopyMatrix(dup->unpaired, fullmat->unpaired);
  CopyMatrix(dup->paired,   fullmat->paired);

  return dup;
}


/* Function: CopyMatrix()
 * Date:     ER, Mon Jan 27 12:24:42 CST 2003 [St. Louis]
 *
 * Purpose:  Copy the sizes and the entries of mat into matcopy.
 *           matcopy->matrix must already have room for mat->full_size
 *           entries. The E and H fields are not copied.
 *
 * Args:     matcopy - destination (already allocated)
 *           mat     - source
 *
 * Returns:  void
 */
void
CopyMatrix (matrix_t *matcopy, matrix_t *mat)
{
  int n = mat->full_size;
  int k;

  matcopy->edge_size = mat->edge_size;
  matcopy->full_size = n;

  for (k = 0; k < n; k++)
    matcopy->matrix[k] = mat->matrix[k];
}

/*
 * count_matrix
 *
 * Given an MSA and two matrices, counts pairs of column(s) from two sequences
 * at a time into the matrices using the BLOSUM algorithm.
 *
 * Fills in paired matrix (for basepairs), unpaired, background nt count in
 * aligned regions
 *
 * Each nucleotide at each position can be:
 *    one of gap, unknown character (not ACGTU), known character
 *    one of left bp, right bp, not base paired
 * If both characters are gaps:
 *   Skip
 * If both characters are known:
 *   If both are left bps and match to same right bps
 *     continue
 *   If both are right bps and match to same left bps
 *     add to pairedmat
 *   Otherwise
 *     Add to unpairedmat
 *
 */

/* Function: CountMatrixER()
 * Date:     ER, Wed Jan 22 13:51:23 CST 2003 [St. Louis]
 *           based on RJK count_matrix() from rsearch
 *
 * Purpose:  Accumulate weighted counts from every pair of sequences in
 *           a multiple alignment whose fractional identity (see
 *           simple_identity()) is at least cutoff_perc percent.
 *           Each accepted pair (i,j) is counted with weight
 *           0.5 * (wgt[i] + wgt[j]).
 *
 * Args:     msa           - alignment; its consensus structure (ss_cons)
 *                           is used for all sequences if present,
 *                           otherwise the per-sequence structures (ss[i])
 *           fullmat       - counts are added to fullmat->paired and
 *                           fullmat->unpaired
 *           background_nt - single-nucleotide counts are added here
 *           cutoff_perc   - identity cutoff, in percent
 *
 * Returns:  void
 *           Dies if no structure annotation is available or if a
 *           structure string cannot be converted by rjk_KHS2ct().
 */
void 
CountMatrixER (MSA *msa, fullmat_t *fullmat, double *background_nt, int cutoff_perc) {
  double cur_wgt;
  int i, j;            /* Seqs we're checking */
  int **ct;            /* ct matrix for all the seqs */
  int shared_ct;       /* nonzero if every ct[i] aliases the consensus ct[0] */

  /* With no sequences there is nothing to count, and ct[0] would not exist */
  if (msa->nseq < 1) return;

  shared_ct = (msa->ss_cons != NULL);
  if (!shared_ct && msa->ss == NULL)
    Die ("CountMatrixER(): alignment has no secondary structure annotation\n");
  
  /*****************************************
   * 1.  Allocate and fill in ct array
   *****************************************/
  ct = MallocOrDie (sizeof(int *) * msa->nseq);
  if (shared_ct) {
    ct[0] = rjk_KHS2ct (msa->ss_cons, msa->alen);
    for (i = 1; i < msa->nseq; i++)
      ct[i] = ct[0];
  } else {
    for (i = 0; i < msa->nseq; i++) {
      if (msa->ss[i] == NULL)
	Die ("CountMatrixER(): sequence %d has no secondary structure annotation\n", i);
      ct[i] = rjk_KHS2ct (msa->ss[i], msa->alen);
    }
  }
  for (i = 0; i < msa->nseq; i++) {
    if (ct[i] == NULL) {
      Die ("CT string %d is NULL\n", i);
    }
  }
  
  /**********************************
   * 2.  Count
   **********************************/
  for (i = 0; i < msa->nseq; i++) {
    for (j = 0; j < i; j++) {
      
      /* First, make sure it's above the cutoff */
      if (simple_identity(msa->aseq[i], msa->aseq[j]) < 0.01*(float)cutoff_perc)
	continue;       /* Not above cutoff */
      
      cur_wgt = 0.5 * (msa->wgt[i] + msa->wgt[j]);
      
      count_background_nts(msa->aseq[i], msa->aseq[j], msa->alen, background_nt, cur_wgt);
      
      count_paired_cols  (msa->aseq[i], msa->aseq[j], msa->alen, fullmat->paired,   ct[i], ct[j], cur_wgt);
      count_unpaired_cols(msa->aseq[i], msa->aseq[j], msa->alen, fullmat->unpaired, ct[i], ct[j], cur_wgt);
      
    }
  }
  
  /* Free ct arrays. Decide from how they were built rather than by
   * comparing ct[0] with ct[1], which does not exist when nseq == 1.
   */
  if (shared_ct) {
    free (ct[0]);
  } else {
    for (i = 0; i < msa->nseq; i++) {
      free (ct[i]);
    }
  }
  free (ct);
}

/* Function: DLog2()
 *
 * Purpose:  Replace, in place, each of the n entries of vec by its
 *           LOG2(). Entries that are not strictly positive are set to
 *           -DBL_MAX instead.
 */
void
DLog2(double *vec, int n)
{
  int k;

  for (k = 0; k < n; k++)
    vec[k] = (vec[k] > 0.) ? LOG2(vec[k]) : -DBL_MAX;
}

/* Function: FreeFullMatrix()
 * Date:     ER, Wed Jan 22 14:07:26 CST 2003 [St. Louis]
 *
 * Purpose:  Release a fullmat_t: its two matrices, its name buffer,
 *           and finally the structure itself.
 *
 * Args:     fullmat - structure to free; must not be used afterwards
 *
 * Returns:  void
 */
void
FreeFullMatrix (fullmat_t *fullmat)
{
  FreeMatrix(fullmat->paired);
  FreeMatrix(fullmat->unpaired);
  free (fullmat->name);

  free(fullmat);
}

/* Function: FreeFullCondMatrix()
 *
 * Purpose:  Release a fullcondmat_t: its conditional and marginal
 *           matrices, its name buffer, and the structure itself.
 */
void
FreeFullCondMatrix (fullcondmat_t *fullcondmat)
{
  FreeCondMatrix(fullcondmat->cond);
  FreeCondMatrix(fullcondmat->marg);
  free (fullcondmat->name);

  free(fullcondmat);
}

/* Function: FreeMatrix()
 *
 * Purpose:  Release the entry array of a matrix_t and then the
 *           structure itself.
 */
void
FreeMatrix (matrix_t *mat)
{
  double *cells = mat->matrix;

  free(cells);
  free(mat);
}

/* Function: FreeCondMatrix()
 *
 * Purpose:  Release the entry array of a condmatrix_t and then the
 *           structure itself.
 */
void
FreeCondMatrix (condmatrix_t *mat)
{
  double *cells = mat->matrix;

  free(cells);
  free(mat);
}

/* Function: Log2Matrix()
 * Date:     ER, Thu Jan 23 17:29:56 CST 2003 [St. Louis]
 *
 * Purpose:  Convert every stored entry of mat to log2, in place.
 *           Off-diagonal entries additionally have 1.0 subtracted
 *           (a factor of 1/2 in probability space); diagonal entries
 *           do not.
 *
 * Args:     mat - matrix to convert; entries are addressed through
 *                 matrix_index(), which is defined elsewhere
 *
 * Returns:  void
 */
void
Log2Matrix (matrix_t *mat)
{
  int row, col;
  int cell;

  for (row = 0; row < mat->edge_size; row++)
    for (col = 0; col <= row; col++)
      {
	cell = matrix_index(col,row);

	mat->matrix[cell] = LOG2(mat->matrix[cell]);
	if (col != row)
	  mat->matrix[cell] -= 1.0;
      }
}

/* Function: Log2toOddsMatrix()
 *
 * Purpose:  Turn the log2 entries of mat into log-odds in place by
 *           subtracting a background term built from background_nt,
 *           and accumulate mat->H and mat->E along the way.
 *
 *           The background values are added together and passed to
 *           EXP2(), i.e. they are treated as log2 values. When the
 *           edge size equals RNA_ALPHABET_SIZE the term is
 *           bg[i] + bg[j]; otherwise each index is split into two
 *           nucleotides (i / RNA_ALPHABET_SIZE, i % RNA_ALPHABET_SIZE)
 *           and four background values are summed.
 *
 *           H accumulates EXP2(q) * logodds, where q is the stored
 *           entry with the off-diagonal 1.0 added back; E accumulates
 *           EXP2(background) * logodds. Neither is reset here.
 *
 * Args:     mat           - matrix to convert (modified)
 *           background_nt - per-nucleotide background values
 *
 * Returns:  void
 */
void
Log2toOddsMatrix (matrix_t *mat, double *background_nt)
{
  double log2_q;     /* entry with the off-diagonal correction undone */
  double log2_p;     /* background term for this cell                 */
  int    i, j;
  int    cell;

  for (i = 0; i < mat->edge_size; i++)
    {
      for (j = 0; j <= i; j++)
	{
	  cell = matrix_index(j,i);

	  log2_q = mat->matrix[cell] + ((i==j) ? 0.0 : 1.0);

	  if (mat->edge_size == RNA_ALPHABET_SIZE)
	    {
	      log2_p = background_nt[i] + background_nt[j];
	    }
	  else
	    {
	      log2_p =
		+ background_nt[i/RNA_ALPHABET_SIZE]
		+ background_nt[i%RNA_ALPHABET_SIZE]
		+ background_nt[j/RNA_ALPHABET_SIZE]
		+ background_nt[j%RNA_ALPHABET_SIZE];
	    }

	  mat->matrix[cell] -= log2_p;

	  mat->H += EXP2(log2_q) * mat->matrix[cell];
	  mat->E += EXP2(log2_p) * mat->matrix[cell];
	}
    }
}

/*
 * matfile_try_open
 *
 * Helper for MatFileOpen(): builds "<dir>/<name><suffix>" (or just
 * "<name><suffix>" when dir is NULL) and tries to open it for reading.
 *
 * Returns NULL if name is NULL, if the path does not fit in the local
 * buffer (a truncated path is never opened), or if fopen() fails.
 */
static FILE *
matfile_try_open (const char *dir, const char *name, const char *suffix)
{
  char buf[1024];
  int  n;

  if (name == NULL) return (NULL);

  if (dir != NULL) n = snprintf (buf, sizeof(buf), "%s/%s%s", dir, name, suffix);
  else             n = snprintf (buf, sizeof(buf), "%s%s",    name, suffix);

  if (n < 0 || (size_t) n >= sizeof(buf)) return (NULL);

  return (fopen (buf, "r"));
}

/*
 * MatFileOpen
 *
 * Given three strings, tries combinations to open the matrix file
 * as follows:
 *
 * deflt   = default matrix name
 * matdir  = matrix directory (e.g. from an environment variable);
 *           may be NULL, in which case steps 3-6 are skipped
 * matfile = filename/matrix name override; may be NULL or empty,
 *           in which case steps 1-4 and 7-8 are skipped
 *
 * The fallback directory is the literal relative path "QRNADB".
 *
 * Order to test:
 * 1.  matfile
 * 2.  matfile.mat
 * 3.  matdir/matfile
 * 4.  matdir/matfile.mat
 * 5.  matdir/default
 * 6.  matdir/default.mat
 * 7.  QRNADB/matfile
 * 8.  QRNADB/matfile.mat
 * 9.  QRNADB/default
 * 10. QRNADB/default.mat
 *
 * Returns the first stream that opens (the caller closes it), or NULL
 * if none of the candidates can be opened.
 */
FILE *
MatFileOpen (char *deflt, char *matdir, char *matfile) {
     static const char *suffixes[2] = { "", ".mat" };
     const char        *dirs[2];
     int                has_matfile;
     int                d, s;
     FILE              *fp;

     has_matfile = (matfile != NULL && matfile[0] != '\0');

     if (has_matfile) {
       /* 1.  matfile, exactly as given (no length limit applies) */
       fp = fopen (matfile, "r");
       if (fp != NULL) return (fp);

       /* 2.  matfile.mat */
       fp = matfile_try_open (NULL, matfile, ".mat");
       if (fp != NULL) return (fp);
     }

     dirs[0] = matdir;       /* 3-6  */
     dirs[1] = "QRNADB";     /* 7-10 */

     for (d = 0; d < 2; d++) {
       if (dirs[d] == NULL) continue;

       /* the explicit name takes precedence over the default in each directory */
       if (has_matfile) {
	 for (s = 0; s < 2; s++) {
	   fp = matfile_try_open (dirs[d], matfile, suffixes[s]);
	   if (fp != NULL) return (fp);
	 }
       }

       for (s = 0; s < 2; s++) {
	 fp = matfile_try_open (dirs[d], deflt, suffixes[s]);
	 if (fp != NULL) return (fp);
       }
     }

     return (NULL);
}

/* Robbie's function 
 *
 * Maps a nucleotide character to its index, ignoring case:
 * A->0
 * C->1
 * G->2
 * T->3
 * U->3
 * anything else -> -1
 */
int
numbered_nucleotide (char c) {
  if (c == 'A' || c == 'a') return (0);
  if (c == 'C' || c == 'c') return (1);
  if (c == 'G' || c == 'g') return (2);
  if (c == 'T' || c == 't' || c == 'U' || c == 'u') return (3);
  return (-1);
}

/* Robbie's function 
 *
 * Maps base pair c,d to 4*index(c) + index(d), using the indices from
 * numbered_nucleotide():
 *
 * AA -> 0
 * AC -> 1
 * ....
 * TG -> 14
 * TT -> 15 (T==U)
 * Returns -1 if either character is not a known nucleotide.
 */
int 
numbered_basepair (char c, char d) {
  int first  = numbered_nucleotide (c);
  int second = numbered_nucleotide (d);

  if (first < 0 || second < 0)
    return (-1);

  return (first * 4 + second);
}

/* Robbie's function -- it only works for ungapped and symmetric matrices
 *
 * PrintFullMatrix
 *
 * Writes the matrix name, then the unpaired matrix and the paired
 * matrix, each as a lower triangle with a header row of labels taken
 * from RNA_ALPHABET (unpaired) or RNAPAIR_ALPHABET/RNAPAIR_ALPHABET2
 * (paired). The H and E values follow each matrix unless the name
 * contains "RIBOPROB".
 */
void 
PrintFullMatrix (FILE *fp, fullmat_t *fullmat) {
  int n_nt   = (int) (sizeof(RNA_ALPHABET)-1);
  int n_pair = (int) (sizeof(RNAPAIR_ALPHABET)-1);
  int is_prob;           /* nonzero for probability matrices: no H/E lines */
  int row, col;
  int r, c;

  is_prob = (strstr (fullmat->name, "RIBOPROB") != NULL);

  fprintf (fp, "%s\n\n", fullmat->name);

  /* ---- unpaired matrix ---- */
  fprintf (fp, "    ");
  for (col = 0; col < n_nt; col++)
    fprintf (fp, "%c         ", RNA_ALPHABET[col]);
  fprintf (fp, "\n");

  for (row = 0; row < n_nt; row++)
    {
      fprintf (fp, "%c   ", RNA_ALPHABET[row]);
      r = numbered_nucleotide(RNA_ALPHABET[row]);
      for (col = 0; col <= row; col++)
	{
	  c = numbered_nucleotide(RNA_ALPHABET[col]);
	  fprintf (fp, "%-9.2f ", fullmat->unpaired->matrix[matrix_index(r, c)]);
	}
      fprintf (fp, "\n");
    }

  if (!is_prob)
    fprintf (fp, "H: %.4f\nE: %.4f\n", fullmat->unpaired->H, fullmat->unpaired->E);

  /* ---- paired matrix ---- */
  fprintf (fp, "\n    ");
  for (col = 0; col < n_pair; col++)
    fprintf (fp, "%c%c        ", RNAPAIR_ALPHABET[col], RNAPAIR_ALPHABET2[col]);
  fprintf (fp, "\n");

  for (row = 0; row < n_pair; row++)
    {
      fprintf (fp, "%c%c  ", RNAPAIR_ALPHABET[row], RNAPAIR_ALPHABET2[row]);
      r = numbered_basepair(RNAPAIR_ALPHABET[row], RNAPAIR_ALPHABET2[row]);
      for (col = 0; col <= row; col++)
	{
	  c = numbered_basepair (RNAPAIR_ALPHABET[col], RNAPAIR_ALPHABET2[col]);
	  fprintf (fp, "%-9.2f ", fullmat->paired->matrix[matrix_index(r, c)]);
	}
      fprintf (fp, "\n");
    }

  if (!is_prob)
    fprintf (fp, "H: %.4f\nE: %.4f\n", fullmat->paired->H, fullmat->paired->E);
  fprintf (fp, "\n");
}

/* Function: PrintFullCondMatrix()
 * Date:     ER, Thu Jan 30 11:13:09 CST 2003 [St. Louis]
 *
 * Purpose:  Print a fullcondmat_t: its name, the marginal matrix
 *           (size x size, labelled with RNA_ALPHABET), and the
 *           conditional matrix (size^2 x size^2).
 *           Matrices have to be ungapped (4x4 or 16x16).
 *
 * Args:     fp          - output stream
 *           fullcondmat - structure to print
 *           ispaircond  - nonzero: label rows/columns of the
 *                         conditional as base pairs; zero: label them
 *                         as two stacked nucleotides
 *           islog2      - nonzero: print stored values as they are;
 *                         zero: print EXP2() of the stored values
 *
 * Returns:  void
 */
void 
PrintFullCondMatrix (FILE *fp, fullcondmat_t *fullcondmat, int ispaircond, int islog2) 
{
  int    L  = fullcondmat->marg->size;
  int    L2 = L*L;
  int    row, col;
  double v;

  fprintf (fp, "\n%s\n\n", fullcondmat->name);

  /* marginal matrix */
  fprintf (fp, "    ");
  for (col = 0; col < L; col++)
    fprintf (fp, "%c         ", RNA_ALPHABET[col]);
  fprintf (fp, "\n");

  for (row = 0; row < L; row++)
    {
      fprintf (fp, "%c   ", RNA_ALPHABET[row]);
      for (col = 0; col < L; col++)
	{
	  v = fullcondmat->marg->matrix[row*L+col];
	  if (!islog2) v = EXP2(v);
	  fprintf (fp, "%-9.4f ", v);
	}
      fprintf (fp, "\n");
    }

  /* header of the conditional matrix */
  fprintf (fp, "\n    ");
  if (!ispaircond)
    {
      for (row = 0; row < L; row++)
	for (col = 0; col < L; col++)
	  fprintf (fp, "%c         ", RNA_ALPHABET[row]);
      fprintf (fp, "\n    ");
      for (row = 0; row < L; row++)
	for (col = 0; col < L; col++)
	  fprintf (fp, "%c         ", RNA_ALPHABET[col]);
    }
  else
    {
      for (col = 0; col < L2; col++)
	fprintf (fp, "%c%c        ", RNAPAIR_ALPHABET[col], RNAPAIR_ALPHABET2[col]);
    }
  fprintf (fp, "\n");

  /* body of the conditional matrix */
  for (row = 0; row < L2; row++)
    {
      if (ispaircond)
	{
	  fprintf (fp, "%c%c  ", RNAPAIR_ALPHABET[row], RNAPAIR_ALPHABET2[row]);
	}
      else
	{
	  fprintf (fp, "%c\n", RNAPAIR_ALPHABET[row]);
	  fprintf (fp, "%c  ", RNAPAIR_ALPHABET2[row]);
	}

      for (col = 0; col < L2; col++)
	{
	  v = fullcondmat->cond->matrix[row*L2+col];
	  if (!islog2) v = EXP2(v);
	  fprintf (fp, "%-9.4f ", v);
	}
      fprintf (fp, "\n");
    }
}

/* Function: PrintCondMatrix()
 * Date:     ER, Thu Jan 30 11:11:54 CST 2003 [St. Louis]
 *
 * Purpose:  Print a single conditional ungapped matrix
 *           (size x size) under the given title.
 *
 * Args:     fp         - output stream
 *           condmat    - matrix to print
 *           ispaircond - nonzero: label rows/columns as base pairs;
 *                        zero: label them as two stacked nucleotides,
 *                        using (int)sqrt(size) letters of RNA_ALPHABET
 *           islog2     - nonzero: print stored values as they are;
 *                        zero: print EXP2() of the stored values
 *           title      - heading printed before the matrix
 *
 * Returns:  void
 */
void 
PrintCondMatrix (FILE *fp, condmatrix_t *condmat, int ispaircond, int islog2, char *title) 
{
  int    L     = condmat->size;
  int    Lhalf = (int)sqrt(L);
  int    row, col;
  double v;

  fprintf (fp, "\n%s\n\n", title);

  /* header */
  fprintf (fp, "\n    ");
  if (!ispaircond)
    {
      for (row = 0; row < Lhalf; row++)
	for (col = 0; col < Lhalf; col++)
	  fprintf (fp, "%c         ", RNA_ALPHABET[row]);
      fprintf (fp, "\n    ");
      for (row = 0; row < Lhalf; row++)
	for (col = 0; col < Lhalf; col++)
	  fprintf (fp, "%c         ", RNA_ALPHABET[col]);
    }
  else
    {
      for (col = 0; col < L; col++)
	fprintf (fp, "%c%c        ", RNAPAIR_ALPHABET[col], RNAPAIR_ALPHABET2[col]);
    }
  fprintf (fp, "\n");

  /* body */
  for (row = 0; row < L; row++)
    {
      if (ispaircond)
	{
	  fprintf (fp, "%c%c  ", RNAPAIR_ALPHABET[row], RNAPAIR_ALPHABET2[row]);
	}
      else
	{
	  fprintf (fp, "%c\n", RNAPAIR_ALPHABET[row]);
	  fprintf (fp, "%c  ", RNAPAIR_ALPHABET2[row]);
	}

      for (col = 0; col < L; col++)
	{
	  v = condmat->matrix[row*L+col];
	  if (!islog2) v = EXP2(v);
	  fprintf (fp, "%-9.4f ", v);
	}
      fprintf (fp, "\n");
    }
}

/*
 * Read the matrix from a file
 */
fullmat_t *
ReadMatrix(FILE *matfp) {
  char linebuf[256];
  char fullbuf[16384];
  int fullbuf_used = 0;
  fullmat_t *fullmat;
  int i;
  char *cp, *end_mat_pos;

  fullmat = AllocFullMatrix (RNA_ALPHABET_SIZE);

  while (fgets (linebuf, 255, matfp)) {
    strncpy (fullbuf+fullbuf_used, linebuf, 16384-fullbuf_used-1);
    fullbuf_used += strlen(linebuf);
    if (fullbuf_used >= 16384) {
      Die ("ERROR: Matrix file bigger than 16kb\n");
    }
  }

  /* First, find RIBO, and copy matrix name to fullmat->name */
  cp = strstr (fullbuf, "RIBO");
  for (i = 0; cp[i] && !isspace(cp[i]); i++);   /* Find space after RIBO */
  fullmat->name = MallocOrDie(sizeof(char)*(i+1));
  strncpy (fullmat->name, cp, i);
  fullmat->name[i] = '\0';
  cp = cp + i;

  /* Now, find the first A */
  cp = strchr (cp, 'A');
  fullmat->unpaired->edge_size = 0;
  /* And count how edge size of the matrix */
  while (*cp != '\n' && cp-fullbuf < fullbuf_used) {
    if (!isspace (cp[0]) && isspace (cp[1])) {
      fullmat->unpaired->edge_size++;
    }
    cp++;
  }

  /* Find next A */
  while (*cp != 'A' && (cp-fullbuf) < fullbuf_used) cp++;
  
  /* Take numbers until we hit the H: */
  end_mat_pos = strstr (cp, "H:");
  for (i=0; cp - fullbuf < end_mat_pos-fullbuf; i++) {
    while (!isdigit(*cp) && *cp != '-' && *cp != '.' && \
	   cp-fullbuf < fullbuf_used && cp != end_mat_pos) { 
	cp++;
    }
    if (cp == end_mat_pos)
      break;
    if (cp-fullbuf < fullbuf_used) {
      fullmat->unpaired->matrix[i] = atof(cp);
      while ((isdigit (*cp) || *cp == '-' || *cp == '.') &&\
	     (cp-fullbuf <fullbuf_used)) {
	cp++;
      }
    }
  }
  fullmat->unpaired->full_size = i;

  /* Skip the H: */
  cp += 2;
  fullmat->unpaired->H = atof(cp);

  /* Now, go past the E: */
  cp = strstr (cp, "E:") + 2;
  fullmat->unpaired->E = atof(cp);

  /********* PAIRED MATRIX ************/
  /* Now, find the first A */
  cp = strchr (cp, 'A');
  fullmat->paired->edge_size = 0;
  /* And count how edge size of the matrix */
  while (*cp != '\n') {
    if (!isspace (cp[0]) && isspace (cp[1])) {
      fullmat->paired->edge_size++;
    }
    cp++;
  }

  /* Find next A */
  while (*cp != 'A' && (cp-fullbuf) < fullbuf_used) cp++;

  /* Take numbers until we hit the H: */
  end_mat_pos = strstr (cp, "H:");
  for (i=0; cp - fullbuf < end_mat_pos-fullbuf; i++) {
    while (!isdigit(*cp) && *cp != '-' && *cp != '.' && \
	   cp-fullbuf < fullbuf_used && cp != end_mat_pos) { 
	cp++;
    }
    if (cp == end_mat_pos)
      break;
    if (cp-fullbuf < fullbuf_used) {
      fullmat->paired->matrix[i] = atof(cp);
      while ((isdigit (*cp) || *cp == '-' || *cp == '.') &&\
	     (cp-fullbuf <fullbuf_used)) {
	cp++;
      }
    }
  }
  fullmat->paired->full_size = i;

  /* Skip the H: */
  cp += 2;
  fullmat->paired->H = atof(cp);

  /* Now, go past the E: */
  cp = strstr (cp, "E:") + 2;
  fullmat->paired->E = atof(cp);

  return (fullmat);
}

/* Function: rjk_KHS2ct()
 * Incept:   SRE 29 Feb 2000 [Seattle]; from COVE 1.0 code
 * Modified: RJK 27 Feb 2002 [St. Louis]; from Infernal code (rna_ops.c)
 * Purpose:  Convert a secondary structure string (0..len-1) to an array of 
 *           integers representing what position each position is base-paired
 *           to (0..len-1) or -1 if none.  This is a change from what Sean
 *           did in the Infernal code back towards the original way it was
 *           done in the Squid code (compstruct_main.c).  In this case, the
 *           numbering scheme does not match Zuker's .ct files, but does
 *           match the way the MSA is stored using the SQUID library
 *           functions.
 *           
 *           This version does not allow pseudoknots.  Thus ">" and "<" are
 *           used for base pairs (">" opens a pair, "<" closes it), and all
 *           other printable characters, including white space, are taken
 *           to mean unpaired nucleotides.
 *
 * Args:     ss  - structure string; at least len characters are read
 *           len - number of positions to convert
 *
 * Return:   ct is allocated here and must be free'd by caller.
 *           Returns NULL if ss is NULL or somehow inconsistent: it
 *           contains a non-printable character within the first len
 *           positions, a "<" without a matching ">", or an unclosed ">".
 */
int *
rjk_KHS2ct(char *ss, int len) {
  Nstack_t *pda;                 
  int      *ct;
  int       pos, pair;

  if (ss == NULL) return (NULL);

  pda = CreateNstack();

  ct = MallocOrDie (len * sizeof(int));
  for (pos = 0; pos < len; pos++)
    ct[pos] = -1;

  for (pos = 0; pos < len; pos++) {
    /* armor against garbage strings; the cast keeps isprint() defined
     * for characters with the high bit set */
    if (!isprint((unsigned char) ss[pos])) goto FAILURE;

    if (ss[pos] == '>') {         /* left side of a pair: push onto stack */
      PushNstack(pda, pos);
    } else if (ss[pos] == '<') {  /* right side of a pair; resolve pair */
      if (! PopNstack(pda, &pair)) goto FAILURE;
      ct[pos]  = pair;
      ct[pair] = pos;
    }
  }
                                /* nothing should be left on stacks */
  if (! NstackIsEmpty(pda)) goto FAILURE;

  FreeNstack(pda);
  return (ct);

 FAILURE:
  free (ct);
  FreeNstack(pda);
  return (NULL);
}

  


/* Function: count_background_nts()
 * Date:     ER, Thu Jan 23 10:57:03 CST 2003 [St. Louis]
 *
 * Purpose:  Accumulate single-nucleotide counts from a pairwise
 *           alignment. Every column in which both sequences carry an
 *           RNA nucleotide adds wgt to the count of each of the two
 *           nucleotides; all other columns are ignored.
 *
 * Args:     segX, segY    - the two aligned sequences
 *           alen          - number of alignment columns
 *           background_nt - counts indexed by numbered_nucleotide()
 *           wgt           - weight of this sequence pair
 *
 * Returns:  void
 *           background_nt is updated here.
 */
void
count_background_nts(char *segX, char *segY, int alen, double *background_nt, double wgt)
{
  int col;

  for (col = 0; col < alen; col++)
    {
      if (!is_rna_nucleotide(segX[col]) || !is_rna_nucleotide(segY[col]))
	continue;

      background_nt[numbered_nucleotide(segX[col])] += wgt;
      background_nt[numbered_nucleotide(segY[col])] += wgt;
    }
}

/* Function: count_paired_cols()
 * Date:     ER, Thu Jan 23 10:18:29 CST 2003 [St. Louis]
 *
 * Purpose:  Accumulate base-pair substitution counts from a pairwise
 *           alignment. A column is counted when
 *             - both sequences carry an RNA nucleotide in it,
 *             - both ct arrays pair it with the same partner column,
 *             - both sequences carry an RNA nucleotide in the partner
 *               column, and
 *             - it is the right-hand side of the pair (so each pair is
 *               counted once).
 *           The base pair of each sequence is encoded with
 *           numbered_basepair(partner nucleotide, this nucleotide).
 *
 * Args:     segX, segY - the two aligned sequences
 *           alen       - number of alignment columns
 *           pairedmat  - matrix receiving the counts
 *           ctX, ctY   - partner column of each column, or -1
 *           wgt        - weight of this sequence pair
 *
 * Returns:  void
 *           pairedmat is filled here.
 */
void
count_paired_cols(char *segX, char *segY, int alen, matrix_t *pairedmat, int *ctX, int *ctY, double wgt)
{
  int col;
  int partner;
  int xpair, ypair;

  /* step over leading columns that are gaps in both sequences */
  col = 0;
  while (col < alen && isgap (segX[col]) && isgap (segY[col]))
    col++;

  for (; col < alen; col++)
    {
      if (!is_rna_nucleotide(segX[col]) || !is_rna_nucleotide(segY[col]))
	continue;

      /* both sequences must pair this column with the same partner */
      partner = ctX[col];
      if (partner < 0 || ctY[col] != partner)
	continue;

      if (!is_rna_nucleotide(segX[partner]) || !is_rna_nucleotide(segY[partner]))
	continue;

      /* left side of the pair: it is counted when we reach the right side */
      if (col < partner)
	continue;

      xpair = numbered_basepair(segX[partner], segX[col]);
      ypair = numbered_basepair(segY[partner], segY[col]);

      pairedmat->matrix[matrix_index(xpair, ypair)] += wgt;
    }
}

/* Function: count_unpaired_cols()
 * Date:     ER, Thu Jan 23 11:08:13 CST 2003 [St. Louis]
 *
 * Purpose:  Accumulate single-nucleotide substitution counts from a
 *           pairwise alignment. Every column in which both sequences
 *           carry an RNA nucleotide is counted, except those that
 *           count_paired_cols() treats as a shared base pair (same
 *           partner column in both ct arrays, with RNA nucleotides in
 *           the partner column of both sequences).
 *
 * Args:     segX, segY  - the two aligned sequences
 *           alen        - number of alignment columns
 *           unpairedmat - matrix receiving the counts
 *           ctX, ctY    - partner column of each column, or -1
 *           wgt         - weight of this sequence pair
 *
 * Returns:  void
 *           unpairedmat is filled here.
 */
void
count_unpaired_cols(char *segX, char *segY, int alen, matrix_t *unpairedmat, int *ctX, int *ctY, double wgt)
{
  int col;
  int partner;
  int x, y;

  /* step over leading columns that are gaps in both sequences */
  col = 0;
  while (col < alen && isgap (segX[col]) && isgap (segY[col]))
    col++;

  for (; col < alen; col++)
    {
      if (!is_rna_nucleotide(segX[col]) || !is_rna_nucleotide(segY[col]))
	continue;

      /* columns forming a base pair shared by both sequences belong to the paired counts */
      partner = ctX[col];
      if (partner >= 0 && ctY[col] == partner &&
	  is_rna_nucleotide(segX[partner]) && is_rna_nucleotide(segY[partner]))
	continue;

      x = numbered_nucleotide(segX[col]);
      y = numbered_nucleotide(segY[col]);

      unpairedmat->matrix[matrix_index(x,y)] += wgt;
    }
}

/* TAKEN FROM SQUID's weight.c's simple_distance, but rewritten to
 *  be simple_identity
 * Function: simple_identity()
 * 
 * Purpose:  For two identical-length null-terminated strings, return
 *           the fractional identity between them. (0..1)
 *           Columns where either string has a gap are ignored
 *           entirely; the comparison of the remaining characters is
 *           exact (case-sensitive). Returns 0 when no column can be
 *           compared.
 */
float
simple_identity(char *s1, char *s2)
{
  int same   = 0;      /* compared columns with identical characters */
  int scored = 0;      /* columns without a gap in either string     */
  int k;

  for (k = 0; s1[k] != '\0'; k++)
    {
      if (!isgap(s1[k]) && !isgap(s2[k]))
	{
	  if (s1[k] == s2[k]) same++;
	  scored++;
	}
    }

  if (scored <= 0) return 0.0;
  return ((float) same / (float) scored);
}
    
