static char rcsid[] = "$Id: samread.c 224757 2021-12-13 00:43:16Z twu $";
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>		/* For strcpy */
#include <strings.h>		/* For rindex */
#include <ctype.h>

#include "except.h"
#include "mem.h"
#include "assert.h"
#include "bool.h"
#include "complement.h"

#include "samread.h"


#ifdef DEBUG
#define debug(x) x
#else
#define debug(x)
#endif

/* Prints location of parsing error for Unexpected output type */
#ifdef DEBUG1
#define debug1(x) x
#else
#define debug1(x)
#endif


/* Complement lookup table, indexed by character code 0..127.  Its
   contents come from the COMPLEMENT_LC macro. */
static char complCode[128] = COMPLEMENT_LC;

/* Reverse-complements sequence[0..length-1] in place: the order of the
   characters is reversed and each one is replaced by its complCode
   entry.  A byte whose code lies outside the table (>= 128) keeps its
   value instead of indexing out of bounds; this matters in particular
   where plain char is signed and such a byte would otherwise become a
   negative index.  The buffer is not required to be NUL-terminated,
   since only the first length bytes are touched. */
static void
make_complement_inplace (char *sequence, unsigned int length) {
  unsigned int lo, hi;
  unsigned char left, right;

  if (length == 0) {
    /* Nothing to do; also avoids computing length-1 on an unsigned zero */
    return;
  }

  for (lo = 0, hi = length - 1; lo < hi; lo++, hi--) {
    left = (unsigned char) sequence[lo];
    right = (unsigned char) sequence[hi];
    sequence[lo] = (right < sizeof(complCode)) ? complCode[right] : (char) right;
    sequence[hi] = (left < sizeof(complCode)) ? complCode[left] : (char) left;
  }

  if (lo == hi) {
    /* Odd length: the middle character is complemented but stays put */
    left = (unsigned char) sequence[lo];
    if (left < sizeof(complCode)) {
      sequence[lo] = complCode[left];
    }
  }

  return;
}


/* Computes the read length implied by a CIGAR string.
 *
 * cigar is a NUL-terminated CIGAR string, or a string starting with '*'
 * when no alignment is available.
 *
 * Operations S, M, I, = and X add their count to the read length.  H
 * (hard clip) also adds its count, so the result is the length of the
 * original, unclipped read.  D, N and P do not contribute.
 *
 * *hardclip_low receives the count of an H that is the very first
 * operation; *hardclip_high receives the count of an H found at any
 * later position.  Both are 0 when there is no such operation.
 *
 * Returns 0 for a '*' cigar.  Prints a message to stderr and calls
 * exit(9) when a number is not followed by an operation letter or when
 * the letter is not one of those listed above.
 */
int
Samread_cigar_string_readlength (int *hardclip_low, int *hardclip_high, char *cigar) {
  int readlength = 0;
  unsigned int npos;
  char *p, type;
  bool firstp = true;

  *hardclip_low = *hardclip_high = 0;
  if (cigar[0] == '*') {
    return 0;
  }

  p = cigar;
  while (*p != '\0') {
    /* Count.  The cast keeps the isdigit argument in its defined range. */
    npos = 0;
    while (isdigit((unsigned char) *p)) {
      npos = 10*npos + (unsigned int) (*p - '0');
      p++;
    }
    if (*p == '\0') {
      fprintf(stderr,"Unable to parse cigar %s.  No letter after number %u\n",cigar,npos);
      exit(9);
    }
    type = *p++;

    switch (type) {
    case 'S':
    case 'M':
    case 'I':
    case '=':			/* Sequence match: consumes query like M */
    case 'X':			/* Sequence mismatch: consumes query like M */
      readlength += (int) npos;
      break;

    case 'H':
      if (firstp == true) {
        *hardclip_low = (int) npos;
      } else {
        *hardclip_high = (int) npos;
      }
      readlength += (int) npos;
      break;

    case 'D':
    case 'N':
    case 'P':
      /* Ignore: these do not consume read bases */
      break;

    default:
      fprintf(stderr,"samread.c cannot parse type %c\n",type);
      exit(9);
    }

    firstp = false;
  }

  return readlength;
}


/* Returns the length of the soft clip that begins a CIGAR string.
 *
 * The result is the leading count when the first operation is 'S', and
 * 0 when the first operation is anything else (including 'H') or when
 * cigar starts with '*'.  Prints a message to stderr and calls exit(9)
 * if the string ends before any operation letter is seen.
 */
int
Samread_cigar_string_initial_softclip (char *cigar) {
  int npos = 0;
  char *p = cigar;

  if (cigar[0] == '*') {
    return 0;
  }

  /* The cast keeps the isdigit argument in its defined range */
  while (isdigit((unsigned char) *p)) {
    npos = 10*npos + (*p - '0');
    p++;
  }

  if (*p == '\0') {
    /* npos is an int, so it is printed with %d */
    fprintf(stderr,"Unable to parse cigar %s.  No letter after number %d\n",cigar,npos);
    exit(9);
  }

  return (*p == 'S') ? npos : 0;
}


#define HITI_MAXDIGITS 10

/* Scans the tab-separated fields of one line for the XO and HI tags.
 *
 * line must point at the first character of a field (i.e., just past a
 * '\t'), and the text must end with '\n': scanning stops only after a
 * newline has been consumed.  Each field is taken to be a two-letter
 * tag followed by three characters (":type:") and then the value.
 *
 * *hiti receives a newly MALLOC'ed buffer of HITI_MAXDIGITS + 1 chars
 * holding the value of the HI field, or "" if the line has none.  A
 * value longer than HITI_MAXDIGITS chars is truncated so that the copy
 * never runs past the buffer.  This function does not free the buffer.
 *
 * Returns the output type named by the XO field.  The result is
 * OUTPUT_NONE when there is no XO field or its two-letter code is not
 * recognized; in the latter case a message is printed to stderr.
 */
SAM_split_output_type
Samread_parse_XO_and_HI (char **hiti, char *line) {
  SAM_split_output_type split_output = OUTPUT_NONE;
  char *p, c = '\t', c0, c1;
  char abbrev0, abbrev1;
  int ndigits;

  *hiti = MALLOC((HITI_MAXDIGITS + 1) * sizeof(char));
  (*hiti)[0] = '\0';

  while (c != '\n') {
    c0 = *line++;
    c1 = *line++;

    if (c0 == 'H' && c1 == 'I') {
      line += 3;		/* :type: */

      /* Consume the whole value, but store at most HITI_MAXDIGITS chars */
      p = *hiti;
      ndigits = 0;
      while ((c = *line++) != '\n' && c != '\t') {
        if (ndigits < HITI_MAXDIGITS) {
          *p++ = c;
          ndigits++;
        }
      }
      *p = '\0';		/* terminating char */

    } else if (c0 == 'X' && c1 == 'O') {
      line += 3;		/* :type: */
      abbrev0 = *line++;
      abbrev1 = *line++;
      switch (abbrev0) {
      case 'N':
        if (abbrev1 == 'M') {
          split_output = OUTPUT_NM;
        } else {
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 1: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          split_output = OUTPUT_NONE;
        }
        break;
      case 'C':
        switch (abbrev1) {
        case 'U': split_output = OUTPUT_CU; break;
        case 'C': split_output = OUTPUT_CC; break;
        case 'T': split_output = OUTPUT_CT; break;
        case 'M': split_output = OUTPUT_CM; break;
        case 'X': split_output = OUTPUT_CX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 2: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          split_output = OUTPUT_NONE;
        }
        break;
      case 'H':
        switch (abbrev1) {
        case 'U': split_output = OUTPUT_HU; break;
        case 'C': split_output = OUTPUT_HC; break;
        case 'T': split_output = OUTPUT_HT; break;
        case 'M': split_output = OUTPUT_HM; break;
        case 'X': split_output = OUTPUT_HX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 3: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          split_output = OUTPUT_NONE;
        }
        break;
      case 'U':
        switch (abbrev1) {
        case 'U': split_output = OUTPUT_UU; break;
        case 'C': split_output = OUTPUT_UC; break;
        case 'T': split_output = OUTPUT_UT; break;
        case 'M': split_output = OUTPUT_UM; break;
        case 'X': split_output = OUTPUT_UX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 4: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          split_output = OUTPUT_NONE;
        }
        break;
      case 'P':
        switch (abbrev1) {
        case 'C': split_output = OUTPUT_PC; break;
        case 'I': split_output = OUTPUT_PI; break;
        case 'S': split_output = OUTPUT_PS; break;
        case 'L': split_output = OUTPUT_PL; break;
        case 'M': split_output = OUTPUT_PM; break;
        case 'X': split_output = OUTPUT_PX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 5: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          split_output = OUTPUT_NONE;
        }
        break;
      default:
        debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 6: "));
        fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
        split_output = OUTPUT_NONE;
      }

      /* Skip whatever remains of the XO field */
      while ((c = *line++) != '\n' && c != '\t') ;

    } else {
      /* Some other field: skip to the next delimiter */
      while ((c = *line++) != '\n' && c != '\t') ;
    }
  }

  return split_output;
}


/* Scans the tab-separated fields of one line for the XH, XO and HI tags.
 *
 * line must point at the first character of a field (i.e., just past a
 * '\t'), and the text must end with '\n': scanning stops only after a
 * newline has been consumed.  Each field is taken to be a two-letter
 * tag followed by three characters (":type:") and then the value.
 *
 * *split_output receives the output type named by the XO field, or
 * OUTPUT_NONE when there is no XO field or its two-letter code is not
 * recognized (in which case a message is printed to stderr).
 *
 * *hiti receives a newly MALLOC'ed buffer of HITI_MAXDIGITS + 1 chars
 * holding the value of the HI field, or "" if the line has none.  A
 * value longer than HITI_MAXDIGITS chars is truncated so that the copy
 * never runs past the buffer.
 *
 * Returns the value of the XH field in a newly MALLOC'ed buffer of
 * linelength + 1 chars, or NULL if the line has no XH field.  This
 * function frees neither buffer.
 */
char *
Samread_parse_XH_XO_and_HI (SAM_split_output_type *split_output, char **hiti, char *line, int linelength) {
  char *hardclip_string = NULL;
  char *p, c = '\t', c0, c1;
  char abbrev0, abbrev1;
  int ndigits;

  *split_output = OUTPUT_NONE;

  *hiti = MALLOC((HITI_MAXDIGITS + 1) * sizeof(char));
  (*hiti)[0] = '\0';

  while (c != '\n') {
    c0 = *line++;
    c1 = *line++;

    if (c0 == 'X' && c1 == 'H') {
      line += 3;		/* :type: */
      p = hardclip_string = MALLOC((linelength+1) * sizeof(char));
      while ((c = *p++ = *line++) != '\n' && c != '\t') ;
      *--p = '\0';			/* terminating char */

    } else if (c0 == 'H' && c1 == 'I') {
      line += 3;		/* :type: */

      /* Consume the whole value, but store at most HITI_MAXDIGITS chars */
      p = *hiti;
      ndigits = 0;
      while ((c = *line++) != '\n' && c != '\t') {
        if (ndigits < HITI_MAXDIGITS) {
          *p++ = c;
          ndigits++;
        }
      }
      *p = '\0';		/* terminating char */

    } else if (c0 == 'X' && c1 == 'O') {
      line += 3;		/* :type: */
      abbrev0 = *line++;
      abbrev1 = *line++;
      switch (abbrev0) {
      case 'N':
        if (abbrev1 == 'M') {
          *split_output = OUTPUT_NM;
        } else {
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 1: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          *split_output = OUTPUT_NONE;
        }
        break;
      case 'C':
        switch (abbrev1) {
        case 'U': *split_output = OUTPUT_CU; break;
        case 'C': *split_output = OUTPUT_CC; break;
        case 'T': *split_output = OUTPUT_CT; break;
        case 'M': *split_output = OUTPUT_CM; break;
        case 'X': *split_output = OUTPUT_CX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 2: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          *split_output = OUTPUT_NONE;
        }
        break;
      case 'H':
        switch (abbrev1) {
        case 'U': *split_output = OUTPUT_HU; break;
        case 'C': *split_output = OUTPUT_HC; break;
        case 'T': *split_output = OUTPUT_HT; break;
        case 'M': *split_output = OUTPUT_HM; break;
        case 'X': *split_output = OUTPUT_HX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 3: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          *split_output = OUTPUT_NONE;
        }
        break;
      case 'U':
        switch (abbrev1) {
        case 'U': *split_output = OUTPUT_UU; break;
        case 'C': *split_output = OUTPUT_UC; break;
        case 'T': *split_output = OUTPUT_UT; break;
        case 'M': *split_output = OUTPUT_UM; break;
        case 'X': *split_output = OUTPUT_UX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 4: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          *split_output = OUTPUT_NONE;
        }
        break;
      case 'P':
        switch (abbrev1) {
        case 'C': *split_output = OUTPUT_PC; break;
        case 'I': *split_output = OUTPUT_PI; break;
        case 'S': *split_output = OUTPUT_PS; break;
        case 'L': *split_output = OUTPUT_PL; break;
        case 'M': *split_output = OUTPUT_PM; break;
        case 'X': *split_output = OUTPUT_PX; break;
        default:
          debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 5: "));
          fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
          *split_output = OUTPUT_NONE;
        }
        break;
      default:
        debug1(fprintf(stderr,"parse_XO_and_HI_fromfile 6: "));
        fprintf(stderr,"Unexpected output type %c%c\n",abbrev0,abbrev1);
        *split_output = OUTPUT_NONE;
      }

      /* Skip whatever remains of the XO field */
      while ((c = *line++) != '\n' && c != '\t') ;

    } else {
      /* Some other field: skip to the next delimiter */
      while ((c = *line++) != '\n' && c != '\t') ;
    }
  }

  return hardclip_string;
}


/* Looks for the XH field among the tab-separated fields of a line.
 *
 * line must point at the first character of a field (i.e., just past a
 * '\t'), and the text must end with '\n'.  Each field is examined by
 * its first two characters; for XH, the next three characters
 * (":type:") are skipped and the value that follows, up to the next
 * '\t' or '\n', is copied out.
 *
 * Returns the value in a newly MALLOC'ed buffer of linelength + 1
 * chars, or NULL if the newline is reached without seeing XH.
 */
static char *
parse_XH (char *line, int linelength) {
  char *value, *dst;
  char tag0, tag1, ch;

  for (;;) {
    tag0 = line[0];
    tag1 = line[1];
    line += 2;

    if (tag0 == 'X' && tag1 == 'H') {
      line += 3;		/* :type: */
      value = MALLOC((linelength+1) * sizeof(char));
      dst = value;
      do {
        ch = *line++;
        *dst++ = ch;
      } while (ch != '\n' && ch != '\t');
      dst[-1] = '\0';		/* replace the copied delimiter */

      return value;
    }

    /* Not XH: advance past the delimiter that ends this field */
    do {
      ch = *line++;
    } while (ch != '\n' && ch != '\t');

    if (ch == '\n') {
      /* End of line without an XH field */
      return (char *) NULL;
    }
  }
}


/* Scans the tab-separated fields of one line for the XH and HI tags.
 *
 * line must point at the first character of a field (i.e., just past a
 * '\t'), and the text must end with '\n': scanning stops only after a
 * newline has been consumed.  Each field is taken to be a two-letter
 * tag followed by three characters (":type:") and then the value.
 *
 * *hiti receives a newly MALLOC'ed buffer of HITI_MAXDIGITS + 1 chars
 * holding the value of the HI field, or "" if the line has none.  A
 * value longer than HITI_MAXDIGITS chars is truncated so that the copy
 * never runs past the buffer.
 *
 * Returns the value of the XH field in a newly MALLOC'ed buffer of
 * linelength + 1 chars, or NULL if the line has no XH field.
 */
static char *
parse_XH_and_HI (char **hiti, char *line, int linelength) {
  char *hardclip_string = NULL;
  char *p, c = '\t', c0, c1;
  int ndigits;

  *hiti = MALLOC((HITI_MAXDIGITS + 1) * sizeof(char));
  (*hiti)[0] = '\0';

  while (c != '\n') {
    c0 = *line++;
    c1 = *line++;

    if (c0 == 'X' && c1 == 'H') {
      line += 3;		/* :type: */
      p = hardclip_string = MALLOC((linelength+1) * sizeof(char));
      while ((c = *p++ = *line++) != '\n' && c != '\t') ;
      *--p = '\0';			/* terminating char */

    } else if (c0 == 'H' && c1 == 'I') {
      line += 3;		/* :type: */

      /* Consume the whole value, but store at most HITI_MAXDIGITS chars */
      p = *hiti;
      ndigits = 0;
      while ((c = *line++) != '\n' && c != '\t') {
        if (ndigits < HITI_MAXDIGITS) {
          *p++ = c;
          ndigits++;
        }
      }
      *p = '\0';		/* terminating char */

    } else {
      /* Some other field: skip to the next delimiter */
      while ((c = *line++) != '\n' && c != '\t') ;
    }
  }

  return hardclip_string;
}


/* ILLUMINA-A1CCE9_0004:1:1:1103:6310#0	0	20	33639850	255	55M21S	*	0	0	AAAAATTGTATACCGCAGATTCAGGCATGGATTCCGTGAAGGAACAACACCTAAANCCAAAGNTCGGAAGANCGGN	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCBCCBDBDCBCCDCDC@CCC&AAAA################	NM:i:2 */

#if 0
char *
Samread_parse_line (char **acc, unsigned int *flag, int *mapq, char **chr, Chrpos_T *chrpos, char **cigar,
		    char **mate_chr, Chrpos_T *mate_chrpos_low, int *insertlength,
		    int *readlength, char **read, char **quality_string, char *line) {
  char *p, *q;
  int length, i;

  debug(printf("Entering Samread_parse_line with %s\n",line));

  p = line;
  while (!isspace(*p)) p++;
  length = (p - line)/sizeof(char);
  *acc = (char *) MALLOC((length+1)*sizeof(char));
  strncpy(*acc,line,length);
  (*acc)[length] = '\0';

  if (*p != '\0') {		/* Skip over tab */
    p++;
  }

  if (sscanf(p,"%u",&(*flag)) != 1) {
    fprintf(stderr,"Unable to find flag in %s\n",p);
    abort();
  } else {
    debug(printf("  flag = %u\n",*flag));
  }

  while (!isspace(*p)) p++;	/* Skip over flag */
  if (*p == '\0') {
    fprintf(stderr,"Can't parse chr part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }
  q = p;
  while (!isspace(*q)) q++;
  length = (q - p)/sizeof(char);
  *chr = (char *) MALLOC((length+1)*sizeof(char));
  strncpy(*chr,p,length);
  (*chr)[length] = '\0';

  debug(printf("  chr = %s\n",*chr));
  if (*q != '\0') {
    q++;
  }


  p = q;
  if (sscanf(p,"%u",&(*chrpos)) != 1) {
    fprintf(stderr,"Unable to find chrpos in %s\n",p);
    abort();
  } else {
    debug(printf("  chrpos = %u\n",*chrpos));
  }


  while (!isspace(*p)) p++;	/* Skip over chrpos */
  if (*p == '\0') {
    fprintf(stderr,"Can't parse chrpos part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }

  /* Read mapping quality */
  if (sscanf(p,"%d",&(*mapq)) != 1) {
    fprintf(stderr,"Unable to find mapq in %s\n",p);
    abort();
  } else {
    debug(printf("  mapq = %d\n",*mapq));
  }

  /* Skip past mapping quality */
  while (!isspace(*p)) p++;


  if (*p == '\0') {
    fprintf(stderr,"Can't parse cigar part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }
  q = p;
  while (!isspace(*q)) q++;
  length = (q - p)/sizeof(char);
  *cigar = (char *) MALLOC((length+1)*sizeof(char));
  strncpy(*cigar,p,length);
  (*cigar)[length] = '\0';

  debug(printf("  cigar = %s\n",*cigar));
  

  /* mate chr */
  p = q;
  if (*p != '\0') {
    p++;			/* Should be a tab */
  }
  q = p;
  while (!isspace(*q)) q++;
  length = (q - p)/sizeof(char);
  *mate_chr = (char *) MALLOC((length+1)*sizeof(char));
  strncpy(*mate_chr,p,length);
  (*mate_chr)[length] = '\0';

  debug(printf("  mate_chr = %s\n",*mate_chr));
  if (*q == '\0') {
    fprintf(stderr,"Can't parse mate chr part of %s\n",line);
    abort();
  } else {
    q++;
  }

  /* mate chrpos low */
  p = q;
  if (sscanf(p,"%u",&(*mate_chrpos_low)) != 1) {
    fprintf(stderr,"Unable to find mate_chrpos_low in %s\n",p);
    abort();
  } else {
    debug(printf("  mate_chrpos_low = %u\n",*mate_chrpos_low));
  }

  while (!isspace(*p)) p++;	/* Skip over mate_chrpos */
  if (*p == '\0') {
    fprintf(stderr,"Can't parse mate chrpos part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }


  /* insert size */
  p = q;
  if (sscanf(p,"%d",&(*insertlength)) != 1) {
    fprintf(stderr,"Unable to find insertlength in %s\n",p);
    abort();
  } else {
    debug(printf("  insertlength = %d\n",*insertlength));
  }

  while (!isspace(*p)) p++;	/* Skip over insertlength */
  if (*p == '\0') {
    fprintf(stderr,"Can't parse insertlength part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }


  /* read */
  q = p;
  while (!isspace(*q)) q++;
  *readlength = (q - p)/sizeof(char);
  if (*q == '\t') q++;
  debug(printf("  readlength = %d\n",*readlength));

  *read = (char *) MALLOC(((*readlength)+1)*sizeof(char));
  strncpy(*read,p,*readlength);
  (*read)[*readlength] = '\0';

  debug(printf("  read = %s\n",*read));

  p = q;
  while (!isspace(*q)) q++;
  length = (q - p)/sizeof(char);
  *quality_string = (char *) MALLOC(((*readlength)+1)*sizeof(char));
  if (length == *readlength) {
    strncpy(*quality_string,p,length);
    (*quality_string)[*readlength] = '\0';

  } else {
    for (i = 0; i < *readlength; i++) {
      (*quality_string)[i] = ' ';
    }
    (*quality_string)[*readlength] = '\0';
  }

  if (*q == '\t') q++;

  return q;
}
#endif


/* Helper for Samread_auxinfo.  Advances p to the end of the current
   whitespace-delimited field and returns the position just past the
   delimiter.  If the string ends first, prints "Can't parse <part>
   part of <line>" to stderr and aborts.  The explicit NUL test is
   needed because isspace('\0') is false, so a scan without it would
   run past the end of the string. */
static char *
auxinfo_skip_field (char *p, const char *part, char *line) {
  while (*p != '\0' && !isspace((unsigned char) *p)) {
    p++;
  }
  if (*p == '\0') {
    fprintf(stderr,"Can't parse %s part of %s\n",part,line);
    abort();
  }
  return p + 1;			/* Skip over tab */
}


/* Walks the first eleven whitespace-delimited fields of a SAM line.
 *
 * *flag receives the second field, read as an unsigned integer.
 * *fragment_length receives the ninth field (insert length) when the
 * flag has PAIRED_READ set; for a single-end read it receives the
 * length of the tenth field (the read) instead.
 *
 * Returns a pointer to the character just past the delimiter that
 * follows the eleventh field (quality string).  Prints a message to
 * stderr and aborts if the flag or insert length cannot be read, or if
 * the line ends before the eleventh field has been delimited.
 */
char *
Samread_auxinfo (unsigned int *flag, int *fragment_length, char *line) {
  char *p, *q;
  int readlength, insertlength;

  debug(printf("Entering Samread_auxinfo with %s\n",line));

  /* acc: a missing flag is reported by the sscanf check below */
  p = line;
  while (*p != '\0' && !isspace((unsigned char) *p)) {
    p++;
  }
  if (*p != '\0') {		/* Skip over tab */
    p++;
  }

  if (sscanf(p,"%u",flag) != 1) {
    fprintf(stderr,"Unable to find flag in %s\n",p);
    abort();
  } else {
    debug(printf("  flag = %u\n",*flag));
  }

  p = auxinfo_skip_field(p,"chr",line);		/* Skip over flag */
  p = auxinfo_skip_field(p,"chr",line);		/* chr */
  p = auxinfo_skip_field(p,"chrpos",line);	/* chrpos */
  p = auxinfo_skip_field(p,"mapq",line);	/* mapq */
  p = auxinfo_skip_field(p,"cigar",line);	/* cigar */
  p = auxinfo_skip_field(p,"mate_chr",line);	/* mate_chr */
  p = auxinfo_skip_field(p,"mate_chrpos",line);	/* mate_chrpos */

  /* insert size */
  if (sscanf(p,"%d",&insertlength) != 1) {
    fprintf(stderr,"Unable to find insertlength in %s\n",p);
    abort();
  } else {
    debug(printf("  insertlength = %d\n",insertlength));
  }

  p = auxinfo_skip_field(p,"insertlength",line); /* Skip over insertlength */

  /* read */
  q = p;
  while (*q != '\0' && !isspace((unsigned char) *q)) {
    q++;
  }
  readlength = (int) (q - p);
  if (*q == '\0') {
    fprintf(stderr,"Can't parse read part of %s\n",line);
    abort();
  }
  p = q + 1;			/* Skip over tab */
  debug(printf("  readlength = %d\n",readlength));

  p = auxinfo_skip_field(p,"quality string",line); /* quality string */

  if ((*flag) & PAIRED_READ) {
    *fragment_length = insertlength;
  } else {
    *fragment_length = readlength;
  }

  return p;
}



#if 0
/* Assume we have just read in \t */
/* Can call Samread_parse_aux_fromfile only once per line */
char *
parse_aux_fromfile (FILE *fp, char *auxfield, int linelength) {
  char *value, *p, c = '\t', c0, c1;

  while (c != EOF && c != '\n') {
    c0 = fgetc(fp);
    c1 = fgetc(fp);

    if (c0 == auxfield[0] && c1 == auxfield[1]) {
      fgetc(fp);		/* : */
      fgetc(fp);		/* type */
      fgetc(fp);		/* : */
      p = value = MALLOC((linelength+1) * sizeof(char));
      while ((c = *p++ = fgetc(fp)) != EOF && c != '\n' && c != '\t') ;
      *--p = '\0';			/* terminating char */
      return value;
    }
    while ((c = fgetc(fp)) != EOF && c != '\n' && c != '\t') ;
  }

  return (char *) NULL;
}
#endif


/* Extracts the read sequence (field 10) from a tab-delimited SAM line.
 *
 * The CIGAR string (field 6) is used to size the result.  Three cases:
 *   - the CIGAR yields length 0 (e.g., '*'): the length is measured
 *     from the SEQ field itself;
 *   - no hard clipping: SEQ is copied as is;
 *   - hard clipping: SEQ is placed at offset hardclip_low and the value
 *     of the XH field is copied in at the start of the buffer (when
 *     hardclip_low > 0) or at readlength - hardclip_high (otherwise).
 *     The function aborts if the line has no XH field.
 *
 * If flag has QUERY_MINUSP set, the result is reverse-complemented in
 * place.  linelength sizes the temporary CIGAR buffer and the XH
 * buffer.  Returns a newly MALLOC'ed, NUL-terminated string.
 *
 * Field scans stop only at '\t', so the line must contain all the
 * fields that are walked.
 */
char *
Samread_parse_read (char *line, int linelength, unsigned int flag) {
  char *read;
  char *cigar, *clipped;
  char *dst, *src, *seq_start;
  int readlength, hardclip_low, hardclip_high;
  int nfields, i;

  cigar = MALLOCA((linelength + 1) * sizeof(char));

  /* Fields 1-5 (QNAME, FLAG, RNAME, POS, MAPQ).  Skip */
  for (nfields = 0; nfields < 5; nfields++) {
    while (*line++ != '\t') ;
  }

  /* 6. CIGAR.  Copy up to the tab, then put the terminator in its place */
  dst = cigar;
  while ((*dst = *line++) != '\t') {
    dst++;
  }
  *dst = '\0';

  /* For a nomapper, this readlength is incorrect */
  readlength = Samread_cigar_string_readlength(&hardclip_low,&hardclip_high,cigar);

  /* Fields 7-9 (MRNM, MPOS, ISIZE).  Skip */
  for (nfields = 0; nfields < 3; nfields++) {
    while (*line++ != '\t') ;
  }

  /* 10. SEQ: queryseq */
  if (readlength == 0) {
    /* No-mapper, with CIGAR string of '*'.  Measure SEQ, then copy it */
    seq_start = line;
    while (*line != '\t') {
      line++;
    }
    readlength = (int) (line - seq_start);

    read = (char *) MALLOC((readlength + 1) * sizeof(char));
    for (i = 0; i < readlength; i++) {
      read[i] = seq_start[i];
    }

  } else if (hardclip_low == 0 && hardclip_high == 0) {
    /* CIGAR string gives correct readlength */
    read = (char *) MALLOC((readlength + 1) * sizeof(char));
    dst = read;
    while ((*dst = *line++) != '\t') {
      dst++;
    }
    *dst = '\0';

  } else {
    /* Hard-clipped.  CIGAR string gives correct readlength.  The clipped part comes from the XH field */
    read = (char *) MALLOC((readlength + 1) * sizeof(char));

    /* The tab that ends SEQ is copied too; it is overwritten further down */
    dst = &(read[hardclip_low]);
    while ((*dst++ = *line++) != '\t') ;

    /* 11. QUAL: quality scores.  Skip. */
    while (*line++ != '\t') ;

    /* Can call parse_XH only once per line */
    clipped = parse_XH(line,linelength);
    if (clipped == NULL) {
      /* The XH field is expected to be present for a hard-clipped read */
      abort();
    }

    if (hardclip_low > 0) {
      dst = &(read[0]);
    } else {
      dst = &(read[readlength - hardclip_high]);
    }
    for (src = clipped; *src != '\0'; src++) {
      *dst++ = *src;
    }
    FREE(clipped);
  }

  read[readlength] = '\0';

  if (flag & QUERY_MINUSP) {
    make_complement_inplace(read,readlength);
  }

  FREEA(cigar);

  return read;
}


/* Extracts the read sequence (field 10) from a tab-delimited SAM line
 * and, for a hard-clipped read, the value of the HI field.
 *
 * The CIGAR string (field 6) is used to size the result.  Three cases:
 *   - the CIGAR yields length 0 (e.g., '*'): the length is measured
 *     from the SEQ field itself;
 *   - no hard clipping: SEQ is copied as is;
 *   - hard clipping: SEQ is placed at offset hardclip_low and the value
 *     of the XH field is copied in at the start of the buffer (when
 *     hardclip_low > 0) or at readlength - hardclip_high (otherwise).
 *     The function aborts if the line has no XH field.
 *
 * *hiti is written only in the hard-clipped case, by parse_XH_and_HI;
 * in the other two cases this function leaves *hiti untouched.
 * NOTE(review): confirm that callers do not rely on *hiti being set
 * for reads that are not hard-clipped.
 *
 * If flag has QUERY_MINUSP set, the result is reverse-complemented in
 * place.  linelength sizes the temporary CIGAR buffer and the XH
 * buffer.  Returns a newly MALLOC'ed, NUL-terminated string.
 *
 * Field scans stop only at '\t', so the line must contain all the
 * fields that are walked.
 */
char *
Samread_parse_read_and_hiti (char **hiti, char *line, int linelength, unsigned int flag) {
  char *read;
  char *cigar, *clipped;
  char *dst, *src, *seq_start;
  int readlength, hardclip_low, hardclip_high;
  int nfields, i;

  cigar = MALLOCA((linelength + 1) * sizeof(char));

  /* Fields 1-5 (QNAME, FLAG, RNAME, POS, MAPQ).  Skip */
  for (nfields = 0; nfields < 5; nfields++) {
    while (*line++ != '\t') ;
  }

  /* 6. CIGAR.  Copy up to the tab, then put the terminator in its place */
  dst = cigar;
  while ((*dst = *line++) != '\t') {
    dst++;
  }
  *dst = '\0';

  /* For a nomapper, this readlength is incorrect */
  readlength = Samread_cigar_string_readlength(&hardclip_low,&hardclip_high,cigar);

  /* Fields 7-9 (MRNM, MPOS, ISIZE).  Skip */
  for (nfields = 0; nfields < 3; nfields++) {
    while (*line++ != '\t') ;
  }

  /* 10. SEQ: queryseq */
  if (readlength == 0) {
    /* No-mapper, with CIGAR string of '*'.  Measure SEQ, then copy it */
    seq_start = line;
    while (*line != '\t') {
      line++;
    }
    readlength = (int) (line - seq_start);

    read = (char *) MALLOC((readlength + 1) * sizeof(char));
    for (i = 0; i < readlength; i++) {
      read[i] = seq_start[i];
    }

  } else if (hardclip_low == 0 && hardclip_high == 0) {
    /* CIGAR string gives correct readlength */
    read = (char *) MALLOC((readlength + 1) * sizeof(char));
    dst = read;
    while ((*dst = *line++) != '\t') {
      dst++;
    }
    *dst = '\0';

  } else {
    /* Hard-clipped.  CIGAR string gives correct readlength.  The clipped part comes from the XH field */
    read = (char *) MALLOC((readlength + 1) * sizeof(char));

    /* The tab that ends SEQ is copied too; it is overwritten further down */
    dst = &(read[hardclip_low]);
    while ((*dst++ = *line++) != '\t') ;

    /* 11. QUAL: quality scores.  Skip. */
    while (*line++ != '\t') ;

    /* The remaining fields can be scanned only once per line */
    clipped = parse_XH_and_HI(hiti,line,linelength);
    if (clipped == NULL) {
      /* The XH field is expected to be present for a hard-clipped read */
      abort();
    }

    if (hardclip_low > 0) {
      dst = &(read[0]);
    } else {
      dst = &(read[readlength - hardclip_high]);
    }
    for (src = clipped; *src != '\0'; src++) {
      *dst++ = *src;
    }
    FREE(clipped);
  }

  read[readlength] = '\0';

  if (flag & QUERY_MINUSP) {
    make_complement_inplace(read,readlength);
  }

  FREEA(cigar);

  return read;
}


#if 0
char *
Samread_chrinfo (Chrpos_T *chrpos, char **cigar, char *line) {
  char *chr;
  unsigned int flag;
  int mapq;

  char *p, *q;
  int length;

  debug(printf("Entering Samread_chrinfo with %s\n",line));

  p = line;
  while (!isspace(*p)) p++;
  length = (p - line)/sizeof(char);
#if 0
  *acc = (char *) MALLOC((length+1)*sizeof(char));
  strncpy(*acc,line,length);
  (*acc)[length] = '\0';
#endif

  if (*p != '\0') {		/* Skip over tab */
    p++;
  }

  if (sscanf(p,"%u",&flag) != 1) {
    fprintf(stderr,"Unable to find flag in %s\n",p);
    abort();
  } else {
    debug(printf("  flag = %u\n",*flag));
  }

  while (!isspace(*p)) p++;	/* Skip over flag */
  if (*p == '\0') {
    fprintf(stderr,"Can't parse chr part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }
  q = p;
  while (!isspace(*q)) q++;
  length = (q - p)/sizeof(char);
  chr = (char *) MALLOC((length+1)*sizeof(char));
  strncpy(chr,p,length);
  chr[length] = '\0';

  debug(printf("  chr = %s\n",chr));
  if (*q != '\0') {
    q++;
  }


  p = q;
  if (sscanf(p,"%u",&(*chrpos)) != 1) {
    fprintf(stderr,"Unable to find chrpos in %s\n",p);
    abort();
  } else {
    debug(printf("  chrpos = %u\n",*chrpos));
  }

  while (!isspace(*p)) p++;	/* Skip over chrpos */
  if (*p == '\0') {
    fprintf(stderr,"Can't parse chrpos part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }

  /* Read mapping quality */
  if (sscanf(p,"%d",&mapq) != 1) {
    fprintf(stderr,"Unable to find mapq in %s\n",p);
    abort();
  } else {
    debug(printf("  mapq = %d\n",mapq));
  }

  /* Skip past mapping quality */
  while (!isspace(*p)) p++;


  if (*p == '\0') {
    fprintf(stderr,"Can't parse cigar part of %s\n",line);
    abort();
  } else {
    p++;			/* Skip over tab */
  }
  q = p;
  while (!isspace(*q)) q++;
  length = (q - p)/sizeof(char);
  *cigar = (char *) MALLOC((length+1)*sizeof(char));
  strncpy(*cigar,p,length);
  (*cigar)[length] = '\0';

  debug(printf("  cigar = %s\n",*cigar));

  return chr;
}
#endif



#define FLAGLEN 10

/* Returns a copy of a SAM line in which the FLAG field (field 2) is
 * replaced by the decimal form of flag | DUPLICATE_READ.
 *
 * line must be NUL-terminated and must contain a tab after QNAME.  The
 * old FLAG text is skipped by advancing over as many characters as the
 * decimal form of flag occupies, so flag must be the value that
 * appears in the line, written without padding.  linelength is used
 * to size the result; presumably it is strlen(line) -- TODO confirm
 * against callers.
 *
 * The result is newly MALLOC'ed; this function does not free it.
 */
char *
Samread_flag_duplicate (char *line, int linelength, unsigned int flag) {
  char *newline, *p;
  /* 3 decimal digits per byte is an upper bound for any unsigned int,
     so these cannot overflow (FLAGLEN leaves no room for a 10-digit
     value plus its terminator) */
  char flag_string[3*sizeof(unsigned int) + 1], newflag_string[3*sizeof(unsigned int) + 1];
  int newflag_strlen, flag_strlen;

  /* snprintf returns the number of characters written, i.e., the string length */
  flag_strlen = snprintf(flag_string,sizeof(flag_string),"%u",flag);
  newflag_strlen = snprintf(newflag_string,sizeof(newflag_string),"%u",flag | DUPLICATE_READ);

  p = newline = (char *) MALLOC((linelength + newflag_strlen - flag_strlen + 1)*sizeof(char));

  /* 1. QNAME, including the tab that ends it */
  while ((*p++ = *line++) != '\t') ;

  /* 2. FLAG */
  strcpy(p,newflag_string);
  p += newflag_strlen;
  line += flag_strlen;

  /* 3... Rest, starting at the tab that follows the old flag */
  strcpy(p,line);

  return newline;
}



#if 0
char
Samread_splice_strand (char *auxinfo) {
  char *p;
  char tag1, tag2;

  debug(printf("Entering Samread_splice_strand with %s\n",auxinfo));

  p = auxinfo;
  while (*p != '\0' && *p != '\n') {
    tag1 = p[0];
    tag2 = p[1];

    if (tag1 == 'X' && tag2 == 'S') {
      debug(printf("Found tag XS\n"));
      /* XS:A: */
      p += 5;

      if (*p == '+') {
	return '+';
      } else if (*p == '-') {
	return '-';
      } else if (*p == '?') {
	return '?';
      } else {
	fprintf(stderr,"Cannot parse strand %c after XS tag\n",*p);
	return ' ';
      }
    } else {
      while (*p != '\0' && *p != '\t') {
	p++;
      }
      if (*p == '\t') {
	p++;
      }
    }
  }

  return ' ';
}
#endif


#if 0
Intlist_T
Samread_parse_cigar (Uintlist_T *npositions, int *readlength, char *cigar) {
  Intlist_T types = NULL;
  unsigned int npos;
  char *p, type;

  *npositions = (Uintlist_T) NULL;
  *readlength = 0;

  if (cigar[0] == '*') {
    return (Intlist_T) NULL;
  }

  p = cigar;
  while (*p != '\0') {
    if (sscanf(p,"%u",&npos) != 1) {
      fprintf(stderr,"Unable to parse cigar %s.  No number in %s\n",cigar,p);
      abort();
    } else {
      *npositions = Uintlist_push(*npositions,npos);
    }

    while (*p != '\0' && isdigit(*p)) {
      p++;
    }
    if (*p == '\0') {
      fprintf(stderr,"Unable to parse cigar %s.  No letter after number %u\n",cigar,npos);
      exit(9);
    } else {
      type = *p++;
      types = Intlist_push(types,(int) type);
    }

    if (type == 'S' || type == 'M' || type == 'I') {
      *readlength += npos;
    } else if (type == 'H') {
      *readlength += npos;
    } else if (type == 'D' || type == 'N') {
      /* Ignore */
    } else {
      fprintf(stderr,"Unable to parse cigar %s.  Do not recognize letter %c\n",cigar,type);
      exit(9);
    }
  }

  *npositions = Uintlist_reverse(*npositions);
  return Intlist_reverse(types);
}
#endif


#if 0
void
Samread_print_cigar (Intlist_T types, Uintlist_T npositions) {
  Intlist_T p;
  Uintlist_T q;

  for (p = types, q = npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
    printf("%u%c",Uintlist_head(q),Intlist_head(p));
  }
  return;
}
#endif


#if 0
Chrpos_T
Samread_chrpos_high (Intlist_T types, Uintlist_T npositions, Chrpos_T chrpos_low) {
  Intlist_T p;
  Uintlist_T q;
  Chrpos_T chrpos_high;
  int type;

  chrpos_high = chrpos_low;
  for (p = types, q = npositions; p != NULL; p = Intlist_next(p), q = Uintlist_next(q)) {
    if ((type = Intlist_head(p)) == 'S') {
      /* Ignore */

    } else if (type == 'H') {
      /* Ignore */

    } else if (type == 'M') {
      chrpos_high += Uintlist_head(q);

    } else if (type == 'N') {
      chrpos_high += Uintlist_head(q);

    } else if (type == 'I') {
      /* Do nothing */

    } else if (type == 'D') {
      /* CHECK */
      chrpos_high += Uintlist_head(q);

    } else {
      fprintf(stderr,"Cannot parse type %c\n",type);
      exit(9);
    }
    debug(printf("  type = %c, chrpos = %u\n",type,chrpos_high));
  }

  return chrpos_high - 1U;
}
#endif


#if 0
int
Samread_get_query_coordinates (int *query5, int *query3, Intlist_T types, Uintlist_T npositions,
			       int readlength, char *cigar) {
  int validlength;
  Intlist_T p;
  Uintlist_T q;
  int type;

  *query5 = 1;			/* 1-based */
  *query3 = readlength;
  validlength = 0;

  p = types;
  q = npositions;
  while (p != NULL) {
    if ((type = Intlist_head(p)) == 'S') {
      if (p == types) {
	*query5 = Uintlist_head(q) + 1; /* 1-based */
      } else if (Intlist_next(p) == NULL) {
	*query3 = readlength - Uintlist_head(q);
      } else {
	fprintf(stderr,"Cannot parse cigar %s.  Type S occurs in middle\n",cigar);
	exit(9);
      }
    } else if (type == 'H') {
      /* Do nothing */
    } else if (type == 'M') {
      validlength += Uintlist_head(q);
    } else if (type == 'N') {
      /* Do nothing */
    } else if (type == 'I') {
      validlength += Uintlist_head(q);
    } else if (type == 'D') {
      /* Do nothing */
    }
    p = Intlist_next(p);
    q = Uintlist_next(q);
  }

  debug(printf("Got query %d to %d, with length %d\n",*query5,*query3,validlength));
  if (validlength != (*query3) - (*query5) + 1) {
    fprintf(stderr,"Validlength %d from cigar != %d - %d + 1\n",validlength,*query3,*query5);
    abort();
  }

  return validlength;
}
#endif



#if 0
/* Splits the alignment described by cigar, starting at genomic
   position chrpos_low, into the substrings separated by N operations.
   On return, *query_starts, *genomic_starts and *genomic_ends are
   arrays with one entry per substring and *querylength is the sum of
   the S, M and I lengths.  Returns the number of substrings; for a
   cigar of "*" returns 0 with NULL arrays.  Exits or aborts on a cigar
   that cannot be parsed. */
int
get_substrings (int *querylength, int **query_starts, Chrpos_T **genomic_starts, Chrpos_T **genomic_ends,
                char *cigar, Chrpos_T chrpos_low) {
  int nsubstrings = 0;
  unsigned int oplength;
  char *cursor, op;

  int qpos = 0;
  Chrpos_T gpos = chrpos_low;
  Intlist_T qstarts = NULL;
  Uintlist_T gstarts = NULL, gends = NULL;

  if (cigar[0] == '*') {
    /* Unaligned read: nothing to report */
    *querylength = 0;
    *query_starts = (int *) NULL;
    *genomic_starts = (Chrpos_T *) NULL;
    *genomic_ends = (Chrpos_T *) NULL;
    return 0;
  }

  /* The first substring opens at the start of the alignment */
  qstarts = Intlist_push(NULL,qpos);
  gstarts = Uintlist_push(NULL,gpos);

  for (cursor = cigar; *cursor != '\0'; ) {
    /* Each operation is a decimal length followed by one type letter */
    if (sscanf(cursor,"%u",&oplength) != 1) {
      fprintf(stderr,"Unable to parse cigar %s in get_substrings.  No number in %s\n",cigar,cursor);
      abort();
    }

    while (*cursor != '\0' && isdigit(*cursor)) {
      cursor++;
    }
    if (*cursor == '\0') {
      fprintf(stderr,"Unable to parse cigar %s.  No letter after number %u\n",cigar,oplength);
      exit(9);
    }
    op = *cursor++;

    switch (op) {
    case 'S':
    case 'I':
      /* Query only */
      qpos += oplength;
      break;

    case 'M':
      /* Query and genome */
      qpos += oplength;
      gpos += oplength;
      break;

    case 'H':
      /* ? qpos += oplength; */
      break;

    case 'D':
      /* Genome only */
      gpos += oplength;
      break;

    case 'N':
      /* Close the current substring, jump the gap, and open the next one */
      gends = Uintlist_push(gends,gpos);
      /* nsubstrings++; */

      gpos += oplength;

      qstarts = Intlist_push(qstarts,qpos);
      gstarts = Uintlist_push(gstarts,gpos);
      break;

    default:
      fprintf(stderr,"Unable to parse cigar %s.  Do not recognize letter %c\n",cigar,op);
      exit(9);
    }
  }

  /* Close the final substring */
  *querylength = qpos;
  gends = Uintlist_push(gends,gpos);
  /* nsubstrings++; */


  /* Lists were built by pushing, so reverse each before converting to an array */
  qstarts = Intlist_reverse(qstarts);
  *query_starts = Intlist_to_array(&nsubstrings,qstarts);
  Intlist_free(&qstarts);

  gstarts = Uintlist_reverse(gstarts);
  *genomic_starts = Uintlist_to_array_n(&nsubstrings,gstarts);
  Uintlist_free(&gstarts);

  gends = Uintlist_reverse(gends);
  *genomic_ends = Uintlist_to_array_n(&nsubstrings,gends);
  Uintlist_free(&gends);

  return nsubstrings;
}
#endif



#if 0
/* Computes an insert length for a pair of alignments given their
   cigars and low genomic positions, and stores each read's query
   length in *querylength5 and *querylength3.  If either cigar is "*",
   returns 0 without setting the query lengths.  When some substring of
   one end overlaps a substring of the other, the result is the
   distance between the two projected read starts for the first such
   pair found; otherwise it is the gap between the ends plus both query
   lengths, or 0 when neither end lies wholly before the other. */
int
Samread_compute_insert_length (int *querylength5, int *querylength3,
                               char *cigar5, Chrpos_T chrpos_low_5, char *cigar3, Chrpos_T chrpos_low_3) {
  int insert_length = 0;
  int n5, n3, i, j;
  int overlap_found = 0;
  int *qstarts5, *qstarts3;
  Chrpos_T *gstarts5, *gends5, *gstarts3, *gends3;
  Chrpos_T origin5, origin3;

  if (cigar5[0] == '*' || cigar3[0] == '*') {
    return 0;
  }

  n5 = get_substrings(querylength5,&qstarts5,&gstarts5,&gends5,cigar5,chrpos_low_5);
  n3 = get_substrings(querylength3,&qstarts3,&gstarts3,&gends3,cigar3,chrpos_low_3);

  /* Search for the first pair of substrings that overlap on the genome */
  for (i = 0; i < n5 && !overlap_found; i++) {
    for (j = 0; j < n3 && !overlap_found; j++) {
      if (gends5[i] < gstarts3[j] || gstarts5[i] > gends3[j]) {
        /* No overlap */
        continue;
      }

      /* Project each read start onto the genome through the overlapping substring */
      origin5 = gstarts5[i] - qstarts5[i];
      origin3 = gstarts3[j] - qstarts3[j];
      insert_length = (origin5 > origin3) ? (int) (origin5 - origin3) : (int) (origin3 - origin5);
      overlap_found = 1;
    }
  }

  if (!overlap_found) {
    if (gends5[n5-1] < gstarts3[0]) {
      /* 5' end lies entirely before the 3' end */
      insert_length = gstarts3[0] - gends5[n5-1] + (*querylength5) + (*querylength3);
    } else if (gends3[n3-1] < gstarts5[0]) {
      /* 3' end lies entirely before the 5' end */
      insert_length = gstarts5[0] - gends3[n3-1] + (*querylength5) + (*querylength3);
    } else {
      insert_length = 0;
    }
  }

  /* Single release point for the arrays returned by get_substrings */
  FREE(qstarts5);
  FREE(gstarts5);
  FREE(gends5);

  FREE(qstarts3);
  FREE(gstarts3);
  FREE(gends3);

  return insert_length;
}
#endif



/* Searches the whitespace-separated auxiliary fields in auxinfo for
   the first one whose two leading characters equal tags[0] and
   tags[1], and returns a copy of the text that follows its 5-character
   "XX:Z:" prefix, up to the next whitespace or the end of the string.
   Allocates memory for the returned string with MALLOC.  Returns NULL
   when no field matches.  The scan stops at the terminating '\0', so
   auxinfo need not end with whitespace. */
char *
Samread_get_aux_string (const char *tags, char *auxinfo) {
  char *string;
  char desired_tag1, desired_tag2, *p, *pstart;
  int length, i;

  desired_tag1 = tags[0];
  desired_tag2 = tags[1];

  p = auxinfo;
  while (*p != '\0') {
    if (p[0] == desired_tag1 && p[1] == desired_tag2) {
      debug(printf("Found tag\n"));
      /* Skip XX:Z:, without stepping over the terminator of a truncated field */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      pstart = p;

      /* Cast to unsigned char: isspace is undefined for negative char values */
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }
      length = (int) (p - pstart);
      string = (char *) MALLOC((length+1)*sizeof(char));
      memcpy(string,pstart,length);
      string[length] = '\0';
      return string;

    } else {
      /* Advance to the end of this field, then over the one whitespace
         character that follows it (if any) */
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }
      if (*p != '\0') {
        p++;
      }
    }
  }

  return (char *) NULL;
}


/* Searches the whitespace-separated auxiliary fields in auxinfo for
   one whose two leading characters equal tags[0] and tags[1], and
   returns the integer parsed by sscanf from the text that follows its
   5-character "XX:i:" prefix.  A matching field whose value does not
   parse is passed over and the search continues.  Returns 0 when no
   value is found, so a missing tag cannot be told apart from a stored
   0.  The scan stops at the terminating '\0', so auxinfo need not end
   with whitespace. */
int
Samread_get_aux_int (const char *tags, char *auxinfo) {
  int value, i;
  char desired_tag1, desired_tag2, *p;

  desired_tag1 = tags[0];
  desired_tag2 = tags[1];

  p = auxinfo;
  while (*p != '\0') {
    if (p[0] == desired_tag1 && p[1] == desired_tag2) {
      debug(printf("Found tag\n"));
      /* Skip XX:i:, without stepping over the terminator of a truncated field */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      if (sscanf(p,"%d",&value) > 0) {
        return value;
      }

    } else {
      /* Advance to the end of this field, then over the one whitespace
         character that follows it (if any).  Cast to unsigned char:
         isspace is undefined for negative char values. */
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }
      if (*p != '\0') {
        p++;
      }
    }
  }

  return 0;
}


/* Looks for CB, UR, XX, and XY fields among the whitespace-separated
   auxiliary fields in auxinfo. */
/* Allocates memory for the cell_name and umi_barcode strings.  The
   cell name (CB value) is the return value and the UMI barcode (UR
   value) is stored in *umi_barcode.  If either of the two is missing,
   the other is freed and NULL is returned.  If a CB or UR tag occurs
   more than once, the last occurrence is kept. */
/* *transcripts_consistent (XX) and *transcripts_inconsistent (XY) are
   not copied: they point into auxinfo at the start of the value, which
   ends at the next whitespace or the end of the string.  They are NULL
   when the field is absent, and are set regardless of the return
   value. */
char *
Samread_single_cell_info (char **umi_barcode, char **transcripts_consistent,
                          char **transcripts_inconsistent, char *auxinfo) {
  char *cell_name;
  char *p, *pstart;
  int length, i;

  cell_name = (char *) NULL;
  *umi_barcode = (char *) NULL;
  *transcripts_consistent = (char *) NULL;
  *transcripts_inconsistent = (char *) NULL;

  p = auxinfo;
  while (*p != '\0') {
    if (p[0] == 'C' && p[1] == 'B') {
      /* Skip CB:Z:, without stepping over the terminator of a truncated field */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      pstart = p;

      /* Cast to unsigned char: isspace is undefined for negative char values */
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }
      length = (int) (p - pstart);
      /* Release the copy from an earlier CB field, if any.  FREE is
         already applied to a possibly-NULL pointer at the end of this
         function. */
      FREE(cell_name);
      cell_name = (char *) MALLOC((length+1)*sizeof(char));
      memcpy(cell_name,pstart,length);
      cell_name[length] = '\0';

    } else if (p[0] == 'U' && p[1] == 'R') {
      /* Skip UR:Z: */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      pstart = p;

      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }
      length = (int) (p - pstart);
      /* Release the copy from an earlier UR field, if any */
      FREE(*umi_barcode);
      *umi_barcode = (char *) MALLOC((length+1)*sizeof(char));
      memcpy(*umi_barcode,pstart,length);
      (*umi_barcode)[length] = '\0';

    } else if (p[0] == 'X' && p[1] == 'X') {
      /* Skip XX:Z: */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      *transcripts_consistent = p;
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }

    } else if (p[0] == 'X' && p[1] == 'Y') {
      /* Skip XY:Z: */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      *transcripts_inconsistent = p;
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }

    } else {
      /* Advance to the end of this field, then over the one whitespace
         character that follows it (if any) */
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }
      if (*p != '\0') {
        p++;
      }
    }
  }

  if (cell_name == NULL) {
    FREE(*umi_barcode);
    return (char *) NULL;

  } else if (*umi_barcode == NULL) {
    FREE(cell_name);
    return (char *) NULL;

  } else {
    return cell_name;
  }
}


/* Pushes onto *transcripts_consistent_list the value of every XX field
   and onto *transcripts_inconsistent_list the value of every XY field
   found in the auxiliary fields that Samread_auxinfo returns for
   line. */
/* Does not allocate transcript strings: each pushed pointer refers to
   the start of the value inside the string returned by
   Samread_auxinfo, and the value ends at the next whitespace or the
   end of that string.  The scan stops at the terminating '\0', so the
   string need not end with whitespace. */
void
Samread_get_transcripts (List_T *transcripts_consistent_list, List_T *transcripts_inconsistent_list,
                         char *line) {
  char *transcripts, *p;
  unsigned int flag;
  int fragment_length;
  int i;

  /* flag and fragment_length are filled in by Samread_auxinfo but not used here */
  p = /* auxinfo = */ Samread_auxinfo(&flag,&fragment_length,line);

  while (*p != '\0') {
    if (p[0] == 'X' && p[1] == 'X') {
      /* Skip XX:Z:, without stepping over the terminator of a truncated field */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      transcripts = p;
      *transcripts_consistent_list = List_push(*transcripts_consistent_list,(void *) transcripts);
      /* Cast to unsigned char: isspace is undefined for negative char values */
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }

    } else if (p[0] == 'X' && p[1] == 'Y') {
      /* Skip XY:Z: */
      for (i = 0; i < 5 && *p != '\0'; i++) {
        p++;
      }
      transcripts = p;
      *transcripts_inconsistent_list = List_push(*transcripts_inconsistent_list,(void *) transcripts);
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }

    } else {
      /* Advance to the end of this field, then over the one whitespace
         character that follows it (if any) */
      while (*p != '\0' && !isspace((unsigned char) *p)) {
        p++;
      }
      if (*p != '\0') {
        p++;
      }
    }
  }

  return;
}

