/*******************************************************
                        PFTOOLS
 *******************************************************
  Oct 3, 2011 pfsearch.c
 *******************************************************
 (C) 2011 Swiss Institute of Bioinformatics
     Thierry Schuepbach (thierry.schuepbach@isb-sib.ch)
 *******************************************************/
#define _GNU_SOURCE
#include <features.h>
#include <stdlib.h>
#include <stdio.h>
#include <mm_malloc.h>
#include <sys/time.h>
#include <pthread.h>
#include <alloca.h>
#include <inttypes.h>
#include <getopt.h>
#ifdef __USE_AFFINITY__
# include <unistd.h>
# include <sched.h>
#endif
#ifdef __USE_MMAP__
#include <sys/mman.h>
#endif

/* NOTE(review): NALI is not referenced in the visible part of this file;
 * its exact meaning should be confirmed against the included headers. */
#define NALI 256
/*
 * Banner written to stderr by main() when verbose output is enabled.
 * The build date and time are baked in at compile time through the
 * __DATE__ and __TIME__ predefined macros.
 */
#define HEADER "%----------------------------------------------------------------------------%\n"\
	       "|                              PFSEARCH v3.0                                 |\n"\
	       "%----------------------------------------------------------------------------%\n"\
	       "| Built on " __DATE__ " at " __TIME__ ".                                          |\n"

#include "profile.h"
#include "sequence.h"
#include "system.h"

/*
 * Pointer to the transposed match matrix handed to the heuristic threads.
 * main() fills the integer view (i) from TransposeAndConvertMatchMatrix() on
 * the SSE 4.1 path and the float view (f) from
 * TransposeAndConvertToFloatMatchMatrix() on the SSE 2 path; the buffer is
 * released through _mm_free() on the f member.
 */
typedef union TransposeMatrix { const int * i; float * f;} TransposeMatrix;
/*
 * One 32-bit slot of per-sequence data. main() allocates an array with one
 * such slot per FASTA sequence (YesNoID) and shares it between all threads.
 * NOTE(review): which member is live at which stage is decided by code outside
 * the visible part of this file; the member names suggest a to-do sequence
 * id, or a score stored as signed, unsigned or float -- confirm before
 * relying on it.
 */
typedef union __32bitData { 
   unsigned int ToDoID;
   int          SignedScores;
   unsigned int UnsignedScores;
   float        FloatScores;
} __32bitData;

/*
 * Argument block passed to every worker thread through pthread_create().
 * main() keeps one instance per thread and fills the shared members
 * (prf, FASTA, Array, sequence source, threadId) before each phase sets the
 * per-thread work range.
 */
struct ThreadData {
  /* Profile being searched; main() points every thread at the same object. */
  const struct Profile * prf;
  /* Index of the FASTA database; shared by all threads. */
  const FASTAStructure * FASTA;
#ifdef __USE_MMAP__ 
  /* Read-only memory mapping of the whole database file. */
  const char * SequenceFileMap;
  /* NOTE(review): this member only exists under MMAP_DEBUG, but main() assigns
   * it under NUMA_DEBUG -- the two guards look inconsistent, confirm. */
#ifdef MMAP_DEBUG
  size_t * maplength;
#endif
#else
  /* Path of the database file (main() stores the DB command-line argument). */
  char * SequenceFileName;
#endif
  /* NOTE(review): not assigned in the visible part of main(); presumably a
   * buffer of sequences to dump -- confirm against the thread routines. */
  char * Sequences;
  /* Shared per-sequence array (YesNoID in main()), one slot per sequence. */
  __32bitData * Array;
  /* NOTE(review): not assigned in the visible part of main(); presumably the
   * destination for filter scores -- confirm. */
  int * FilterScores;
  /* Transposed match matrix used by the heuristic. */
  TransposeMatrix TransposeMatch;
  /* NOTE(review): not assigned in the visible part of main() -- confirm use. */
  FILE ** TempOutputFile;
  /* Work range taken from consecutive entries of the shares[] table, which
   * holds sequence counts; presumably half-open [start, stop) -- confirm. */
  size_t start;
  size_t stop;
  unsigned int counter; /* WARNING: in filter phase counter == 0 means stop when cutoff reached, 1 -> compute real filter value */ 
  /* Zero-based index of the thread within the threads_arg array. */
  size_t threadId;
};

/* Description of the host (CPU extensions, core count, NUMA capability),
 * filled in by getSystemInfo() at the start of main(). */
static SystemInfo System;
#ifdef __NUMA__
/* Set to 1 by the -n/--no-numa option when the system is NUMA capable;
 * main() takes the non-NUMA code path when this is non-zero. */
static int noNUMA = 0;
#endif
/* NOTE(review): not referenced in the visible part of this file -- confirm
 * whether it is still needed. */
static int indexDatabase = 0;
/*
 * Short-option specification handed to getopt_long() in main().
 * A letter followed by ':' takes a mandatory argument.
 *
 * Three variants exist, chosen at compile time:
 *   - NUMA build: adds -n, -N <arg> and -S <arg>;
 *   - affinity build (no NUMA): adds -M <arg>, -b, -k and -f;
 *   - plain build: common options only.
 *
 * The two non-NUMA strings used to contain a stray "0:" (a typo for "o:",
 * which had later been appended at the end). It made getopt_long() accept an
 * undocumented "-0 <arg>" option, so it has been removed.
 */
#ifdef __NUMA__
static const char opt_to_test[] = "C:c:i:t:T:vhAandDN:S:sl:m:o:W:";
#else
# ifdef __USE_AFFINITY__
static const char opt_to_test[] = "C:c:i:t:T:vhAasdDl:m:W:M:bkfo:";
# else
static const char opt_to_test[] = "C:c:i:t:T:vhAasdDl:m:W:o:";
# endif
#endif
/*
 * Long-option table handed to getopt_long() in main().
 *
 * No entry uses the `flag` pointer, so getopt_long() returns the character
 * stored in the last column and main() handles it in the same switch case as
 * the equivalent short option.
 *
 * "dump-heuristic-sequences" (-D) and "dump-filter-scores" (-a) are the
 * spellings printed by Usage(); they were missing from this table and were
 * therefore rejected as unknown options. They are added here, and the
 * historical "dump-alignment-sequences" spelling of -D is kept so existing
 * command lines continue to work. Abbreviated forms such as "--dump-h" or
 * "--dump-f" now match more than one entry, so they must be written out far
 * enough to be unique.
 */
static const struct option long_options[] =
{
	{"help",			no_argument,		0,	'h'},
	{"sse2",			no_argument,		0,	's'},
#ifdef __NUMA__
	/* NUMA */
	{"no-numa",			no_argument,		0,	'n'},
	{"nnodes",			required_argument,	0,	'N'},
	{"max-heuristic-nnodes",	required_argument,	0,	'S'},
#endif
	/* Heuristic */
	{"dump-heuristic-scores",	no_argument,		0,	'A'},
	{"dump-heuristic-sequences",	no_argument,		0,	'D'},
	/* Filter */
	{"dump-filter-scores",		no_argument,		0,	'a'},
	{"dump-filter-sequences",	no_argument,		0,	'd'},
	/* Historical spelling of -D, kept for backward compatibility */
	{"dump-alignment-sequences",	no_argument,		0,	'D'},
	/* Database indexing options */
	{"create-index-database",	required_argument,	0,	'c'},
	{"use-index-database",		required_argument,	0,	'i'},
	/* Others */
	{"cutoff",			required_argument,	0,	'C'},
	{"level",			required_argument,	0,	'l'},
	{"mode",			required_argument,	0,	'm'},
	/* SMP options */
	{"nthreads",			required_argument,	0,	't'},
	{"max-heuristic-nthreads",	required_argument,	0,	'T'},
#ifdef __USE_AFFINITY__
	{"no-affinity",			no_argument,		0,	'f'},
	{"split",			no_argument,		0,	'b'},
	{"thread-affinity",		required_argument,	0,	'M'},
	{"no-shared-core",		no_argument,		0,	'k'},
#endif
	/* Print output methods */
	{"output-method",		required_argument,	0,	'o'},
	{"output-length",		required_argument,	0,	'W'},
	{"verbose",			no_argument,		0,	'v'},
	/* Terminator required by getopt_long() */
	{0, 0, 0, 0}
};

/*
 * Alignment kernels selected at run time. main() binds each pointer to the
 * *_sse41 implementation when the CPU reports MM_SSE41 and to the *_sse2
 * implementation otherwise; the -s/--sse2 option forces the *_sse2 variants.
 */
static int (*xali1_ptr)(const struct Profile * const restrict, const unsigned char * const restrict,
			int * const, const size_t, const size_t, const int, const _Bool);
static int (*xalip_ptr)( const struct Profile * const restrict, const unsigned char * const restrict,
           union lScores * const restrict, union Positions * const restrict,
           union Positions * const restrict, const size_t, const size_t,
           struct Alignment * const restrict, _Bool * const restrict, const size_t, const size_t, const _Bool,
           const int, const size_t);
static int (*xalit_ptr)(const struct Profile * const restrict, const size_t, const size_t, const size_t, const size_t,
          const unsigned char * const restrict, char * const restrict, union lScores * const restrict,
          struct Alignment * const restrict, const _Bool * const restrict);

/* pthread start routine of the heuristic phase; bound by main() to
 * thread_heuristic_sse41 or thread_heuristic_sse2 like the kernels above. */
static void* (*thread_heuristic_ptr)(void*);

/* Only assigned when __NUMA__ is defined (numa_node_sse41 / numa_node_sse2). */
static void* (*numa_node_ptr)(void*);

/*
 * Program-wide settings with external linkage, configured by main().
 * NOTE(review): presumably read by the printing and normalization code that
 * lives outside the visible part of this file -- confirm.
 */
/* Normalization entry chosen by main() from prf.NormalizationData.Values. */
SNormalizationItem * Normalization = 0;
/* Conversion pair matching the chosen normalization mode (N2R_n / R2N_n). */
NormalizedToRawFunctionPtr NormalizedToRawFunction = 0;
RawToNormalizedFunctionPtr RawToNormalizedFunction = 0;
/* Passed to InitAverage() by main() for the third normalization function. */
SAverage * Average = 0;
/* Sequence output width, overridden by -W/--output-length. */
unsigned int OutputPrintWidth = 60;
/* Enables progress and diagnostic messages on stderr (-v/--verbose). */
bool OutputVerbose = false;
/* Cutoff level number given with -l/--level. */
int SearchLevel = 0;

/* Output routine; -o/--output-method selects PrintDefault, PrintSimple,
 * PrintInterpro, PrintPfscan or PrintTSV. */
static PrintFunctionPtr PrintFunction = &PrintDefault;
/* NOTE(review): initialisation and use are outside the visible part of this
 * file; presumably serialises output between threads -- confirm. */
static pthread_mutex_t PrintLock;

/*
 * NUMA builds include numa_threads.h once per SIMD flavour. Before each
 * inclusion SIMD(x) is defined to paste the suffix _sse2 (SIMD_VER 2) or
 * _sse41 (SIMD_VER 41) onto x, and both macros are undefined again afterwards.
 * The SSE 4.1 pass only happens when __SSE_4_1__ is defined.
 * NOTE(review): presumably this is what generates numa_node_sse2 and
 * numa_node_sse41 referenced by main() -- confirm in numa_threads.h.
 */
#ifdef __NUMA__
# define SIMD(x) x ## _sse2
# define SIMD_VER 2
# include "numa_threads.h"
# undef SIMD
# undef SIMD_VER
# ifdef __SSE_4_1__
#  define SIMD(x) x ## _sse41
#  define SIMD_VER 41
#  include "numa_threads.h"
#  undef SIMD
#  undef SIMD_VER
# endif
#endif

#include "threads.h"
#include "../include/system.h"

/*
 * Print the command-line help to `stream` and terminate the process.
 *
 * stream: destination of the help text. main() passes stdout for -h/--help
 *         and for unrecognised options, and stderr when the positional
 *         arguments are missing.
 *
 * Never returns. The exit status is EXIT_FAILURE when the help is written to
 * stderr (i.e. it is being shown because of a command-line error) and
 * EXIT_SUCCESS otherwise; previously the status was always 0, so a failed
 * invocation looked successful to the calling shell or script.
 *
 * The help text lives in a static array rather than directly in the fputs()
 * argument list: the text is assembled with #ifdef blocks, and preprocessing
 * directives inside the arguments of a call are undefined when the callee is
 * implemented as a macro, which the C library is allowed to do.
 */
static void __attribute__((noreturn)) Usage(FILE * stream)
{
  static const char help_text[] =
	" pfsearch [options] profile database\n\n"
	" Options:\n"
	"  Profile\n"
	"   --level                    [-l] : level to use for cutoff\n"
	"   --mode                     [-m] : mode to use for normalization\n\n"
	"  Database\n"
	"   --create-index-database    [-c] : output indices to given file\n"
	"   --use-index-database       [-i] : use indices stored in given file\n\n"
	"  Heuristic\n"
	"   --cutoff                   [-C] : heuristic cutoff value\n"
	"   --dump-heuristic-scores    [-A] : only print heuristic scores to stdout\n"
	"   --dump-heuristic-sequences [-D] : dump passed heuristic sequences\n\n"
	"  Filter\n"
	"   --dump-filter-scores       [-a] : only print filter scores to stdout\n"
	"   --dump-filter-sequences    [-d] : dump passed heuristic and filter\n"
	"                                     sequences\n\n"
	"  Optimizations\n"
	"   --sse2                     [-s] : enforces SSE 2 only instruction set\n"
	"   --nthreads                 [-t] : max number of threads to use\n"
	"   --max-heuristic-nthreads   [-T] : max number of threads to use for\n"
	"                                     heuristic phase only. (IO bounds)\n"
#ifdef __USE_AFFINITY__
	"   --no-affinity              [-f] : disable CPU affinity file\n"
	"   --thread-affinity          [-M] : file containing thread mask,\n"
	"                                     one row for one thread\n"
	"   --no-shared-core           [-k] : Prevent core resource sharing\n"
	"   --split                    [-b] : if both SSE 2 & 4.1 are available,\n"
	"                                     split half-half using linked resources\n"
#endif
#ifdef __NUMA__
	"   --no-numa                  [-n] : NUMA will be disabled\n"
	"   --nnodes                   [-N] : NUMA nodes number (maximum is NUMA API)\n"
	"   --max-heuristic-nnodes     [-S] : NUMA max number of threads per node\n\n"
#endif
	"  Printing output\n"
	"   --output-method            [-o] : printing output method\n"
	"                                     == 0 replicates the pfsearch output without\n"
	"                                          options (DEFAULT)\n"
	"                                     == 1 simple output\n"
	"                                     == 2 replicates pfsearch -lxz output\n"
	"                                     == 3 replicates pfscan -lxz output\n"
	"                                     == 4 tsv output (single line tab delimited)\n"
	"   --output-length            [-W] : maximum number of column for sequence\n"
	"                                     output printing\n"
	"  Other\n"
	"   --verbose                  [-v] : verbose on stderr\n"
	"   --help                     [-h] : output command help\n";

  fputs(help_text, stream);

  /* Help requested on stdout is a success; help forced onto stderr is an error. */
  exit(stream == stderr ? EXIT_FAILURE : EXIT_SUCCESS);
}

int main (int argc, char *argv[])
{
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // LOCAL STRUCTURES
  ////////////////////////////////////////////////////////////////////////////////////////////////
  
  struct Profile prf; 			/* Profile */
  FASTAStructure FASTA;			/* Sequence Database File */
  Sequence SeqData;			/* Sequence data to work on */
  struct timeval _t0, _t1;		/* Timing structures */
  
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // LOCAL DATA
  ////////////////////////////////////////////////////////////////////////////////////////////////
  
  PFSequence * PFSeq;			/* Pointer to translated alphabet sequence */
  size_t nCPUs=0;			/* number of threads */
  size_t nCPUsHeuristic=0;              /* maximum number of threads for heuristic phase */
#ifdef __NUMA__
  size_t nNodes=0;			/* number of NUMA nodes (max is NUMA API dictated) */
  size_t nThreadsPerNodes=0;		/* maximum number of threads per NUMA node */
#endif
  size_t HeuristicCounter = 0;		/* number of sequences passing heuristic */ 
  size_t FilterCounter = 0;		/* number of sequences passing filter */ 
  int res, Score;
  int HeuristicCutOff = -1;		/* Default heuristic cutoff from command line, if not zero then enforces that value*/
  _Bool levelGiven = false;		/* True if --level is given */
  int Level = 0;			/* Default level used from command line, if not zero then enforces that value */
  int Mode = 0;				/* Default Mode for normalization from command line, if not zero then enforces that value */
  _Bool SSE41 = true;
  _Bool ExportIndices = false;		/* Does export indices after analyzing fasta file */
  char *ExportFileName = NULL;		/* If so this is the file name */
  _Bool ImportIndices = false;		/* Does import indices from file */
  char *ImportFileName = NULL;		/* If so this is the file name */
  _Bool DumpHeuristicOnly = false;	/* If set only dump heuritic scores for each sequence */
  _Bool DumpFilterOnly = false;         /* If set only dump filter scores for each sequence */
  _Bool DumpFilterSequences = false;	/* Dump sequence that passed the heuristic */
  _Bool DumpAlignmentSequences = false;	/* Dump sequence that passed the heuristic and the filter */
  _Bool DumpBoth            = false;	/* Dump both heuristic and filter scores */
  int * restrict FilterScores = NULL;   /* Array of filter scores */
  unsigned int * restrict HeuristicScores = NULL; /* Array of heuristic scores used when dumping both filter and heuristic scores */
  char * ProfileFile;			/* Profile file */
  char *DB;				/* FASTA sequence file */
  
  size_t * shares = 0;
  __32bitData * restrict YesNoID = NULL;/* Allocate memory for sequence YesNo or ID to be done */
  struct ThreadData *threads_arg = NULL;/* Allocate stack memory for posix thread structures */
  pthread_t *threads = 0;
#ifdef __USE_AFFINITY__
  cpu_set_t * Thread_masks[2] = {0,0};	/* Define variables to hold thread affinity mask */
  unsigned int Thread_count[2] = {0,0}; 
  pthread_attr_t * restrict threads_attr = NULL;	
  char buffer[128] __attribute__((aligned(16))); /* buffer to read affinity file mask */
  _Bool noAffinity = false;		/* disable use of cpu affinity */
  _Bool split = false;
  _Bool noSharedCore = false;		/* Prevent hyperthreading or AMD compute unit to share resources */
  _Bool GivenAffinityFile = false;	/* File holding a mask for each thread */
  char * AffinityMaskFileName;		/* Name of affinity mask file provided by option m */
#endif
#ifdef __USE_MMAP__
  int fd;
  size_t length;
  char * restrict SequenceFileMap;
#endif
  
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // SYSTEM ARCHITECTURE ANALYSIS
  ////////////////////////////////////////////////////////////////////////////////////////////////
  getSystemInfo(&System);
   
#ifdef USE_AFFINITY
/*  Thread_Mask_Size = CPU_ALLOC_SIZE(System.nOverallCores);
  Thread_masks = (cpu_set_t**) alloca(System.nOverallCores*sizeof(cpu_set_t*));
  register _Bool Error = false;
  for (size_t t=0; t<System.nOverallCores; ++t) {
    Thread_masks[t] = CPU_ALLOC(System.nOverallCores);
    Error |= Thread_masks[t] == NULL;
  }
  if (Error) {
    printf("Error in allocating thread affinity masks.\n");
    exit(1);
  }     
  for (int t=0; t<(int)System.nOverallCores; ++t) {
    CPU_ZERO_S(Thread_Mask_Size, Thread_masks[t]);
    CPU_SET_S(t, Thread_Mask_Size, Thread_masks[t]);
  }*/ 
#endif

  /* Check for minimum requirement */
  if (!(System.Extensions & MM_SSE2)) {
      fputs("pfsearch requires at least a CPU capable of SSE 2.\n", stderr);
      exit(1);
  }
  
  /* Allow fast SSE 4.1 extensions ? */
  if (System.Extensions & MM_SSE41) {
      xali1_ptr = xali1_sse41;
      xalit_ptr = xalit_sse41;
      xalip_ptr = xalip_sse41;
      //thread_heuristic_cutoff_ptr = thread_heuristic_cutoff_sse41;
      thread_heuristic_ptr = thread_heuristic_sse41;
# ifdef __NUMA__
      numa_node_ptr = numa_node_sse41;
# endif
      SSE41 = true;
  } else {
      xali1_ptr = xali1_sse2;
      xalit_ptr = xalit_sse2;
      xalip_ptr = xalip_sse2;
      thread_heuristic_ptr = thread_heuristic_sse2;
#ifdef __NUMA__
      numa_node_ptr = numa_node_sse2;
#endif
  }
  
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // OPTIONS
  ////////////////////////////////////////////////////////////////////////////////////////////////
  while (1) {
    /* getopt_long stores the option index here. */
    int option_index = 0;

    const int c = getopt_long (argc, argv, opt_to_test, long_options, &option_index);

    /* Detect the end of the options. */
    if (c == -1) break;
    switch (c) {
#ifdef __USE_AFFINITY__
      case 'f':
        noAffinity = true; 
        break;
      case 'k':
	noSharedCore = true;
	break;
      case 'b':
	if (SSE41) {
	  split = true;
	} else {
	  fputs("Split not possible without SSE 4.1\n", stderr);
	  exit(0);
	}
	break;
      case 'M':
	GivenAffinityFile = true;
	AffinityMaskFileName = optarg;
	break;
      case 'v':
	OutputVerbose = true;
	break;
#endif
      case 'o':
	{
	  const int method = atoi(optarg);
	  if (method <= 0) {
	    PrintFunction = &PrintDefault;
	  } else if (method == 1) {
	    PrintFunction = &PrintSimple;
	  } else if (method == 2) {
	    PrintFunction = &PrintInterpro;
	  } else if (method == 3) {
	    PrintFunction = &PrintPfscan;
	  } else if (method == 4) {
	    PrintFunction = &PrintTSV;
	  } else {
	    fputs("Unrecognized ouput method.\n", stderr);
	    exit(1);
	  }
	}
	break;
      case 'W':
	OutputPrintWidth = (unsigned int) atoi(optarg);
	break;
      case 'm':
	Mode = atoi(optarg);
	break;
      case 'l':
	levelGiven = true;
	Level = atoi(optarg);
	SearchLevel = Level;
	break;
      case 'a':
	DumpFilterOnly = true;
	break;
      case 'A':
	DumpHeuristicOnly = true;
	break;
      case 't':
	nCPUs = (size_t) atoi(optarg);
	break;
      case 'T':
	nCPUsHeuristic = (size_t) atoi(optarg);
	break;
      case 'D':
	DumpFilterSequences = true;
	break;
      case 'd':
	DumpAlignmentSequences = true;
	break;
      case 's':
	xali1_ptr = xali1_sse2;
	xalit_ptr = xalit_sse2;
	xalip_ptr = xalip_sse2;
	thread_heuristic_ptr = thread_heuristic_sse2;
# ifdef __NUMA__
	numa_node_ptr = numa_node_sse2;
# endif
	SSE41 = false;
	break;
      case 'c':
	ExportIndices = true;
	ExportFileName = optarg;
	break;
      case 'i':
	ImportIndices = true;
	ImportFileName = optarg;
	break;
      case 'C':
	HeuristicCutOff = atoi(optarg);
	break;
#ifdef __NUMA__
      case 'n':
	if (System.NumaAble) {
	  noNUMA = 1;
	  fputs("NUMA disactivated as requested.\n", stdout);
	}
	break;
      case 'N':
	if (System.NumaAble) {
	  nNodes = (size_t) atoi(optarg);
	  if (nNodes > System.nNodes) {
	    nNodes = System.nNodes;
	    if (OutputVerbose)
            fprintf(stderr, "System cannot have more than %u NUMA nodes, number lowered to system max\n",
		        System.nNodes);
	  }
	}
	break;
      case 'S':
	if (System.NumaAble) {
	  nThreadsPerNodes = (size_t) atoi(optarg);
	}
	break;
#endif
      case 'h':
      default:
	Usage(stdout);
    }
  }

  if (optind == argc) {
    fputs("Error in given options\n", stderr);
    Usage(stderr);
  } else {
    ProfileFile = argv[optind];
    DB = argv[optind+1];
  }

  if (OutputVerbose) {
   fputs(HEADER 
#ifdef __USE_MMAP__
        "| Using Linux kernel MMAP function.                                          |\n"
#endif
      ,stderr);
    printSystemInfo(&System);
    if (!SSE41 && (System.Extensions & MM_SSE41)) {
	fputs("Enforcing SSE 2...\n", stderr); 
    }
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // INPUT ANALYSIS
  ////////////////////////////////////////////////////////////////////////////////////////////////

  /* 
   * Read the profile and output some infos
   */
  gettimeofday(&_t0,0);
  res = ReadProfile(ProfileFile, &prf);
  gettimeofday(&_t1,0);
  {
    const double T = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
    if (OutputVerbose)
        fprintf(stderr, "Profile reading took %lf seconds.\n", T);
  }
  if (res != 0) {
    fputs("Error found.\n", stderr);
    return 1;
  }
  if (OutputVerbose)
    fprintf(stderr,"Profile %s has length %lu and alphabet size of %lu\nCutoff value is set to %i\n",
         ProfileFile, prf.Length, prf.Alphabet_Length, prf.CutOffData.Values[prf.Level].ICUT);

  if (OutputVerbose)
  {
    fputs("Alphabet Mapping\n",stderr);
    for (size_t i=0; i<ALPHABET_SIZE; ++i) {
        fprintf(stderr,"Map %c=%2u  ", (char) ((unsigned char) 'A' + (unsigned char) i), (unsigned int) prf.Alphabet_Mapping[i]);
        if ((i+1) % 8 == 0 ) fputs("\n",stderr);
    }
    fputs("\n\n",stderr);
  fprintf(stderr,"Disjoint set: %i to %i\n", prf.DisjointData.NDIP[0], prf.DisjointData.NDIP[1]);
  }

  /*
   * Treats the level of cutoff by seeking the index corresponding to the wanted level number,
   * then storing it into the profile Level member.
   * BE CAREFUL THIS IS THE INDEX AND NOT THE VALUE OF THE LEVEL !!!
   */
  const SCutOffItem * const restrict cutItems = prf.CutOffData.Values;
  if (levelGiven) {
    _Bool levelfound = false;
    for (size_t icut=0; icut<(size_t)prf.CutOffData.JCUT; ++icut) {
      if ( cutItems[icut].MCLE == Level ) {
	prf.Level = icut;
	if (levelfound) {
	  fprintf(stderr, "ERROR: profile contains several %i-level!\n", Level);
	  goto END;
	}
	levelfound = true;
      }
    }
    if (! levelfound ) { 
      fprintf(stderr,"Profile does not contain a level %i.\n", Level);
      goto END;
    }
  }
  /*
  * Treats the normalization method 
  */
  {
    const SCutOffItem * const restrict cutItem = &cutItems[prf.Level];
    if (OutputVerbose)
        fprintf(stderr, "Normalization count for level %i: %i\n", Level, cutItem->JCNM);
    if ( cutItem->JCNM > 0 ) {
      SNormalizationItem *NormItems = &(prf.NormalizationData.Values[0]);
      
      register size_t NormalizationMode = 0;
      int Priority =  -1;//NormItems[NormalizationMode].NNPR;
      int Mode     =  -1;//NormItems[NormalizationMode].MNOR;
      
      for (int iCutoffMode=0; iCutoffMode<cutItem->JCNM; ++iCutoffMode) {
	const int CutOffMode = cutItem->MCUT[iCutoffMode];
	for (int iNormalizationMode=0; iNormalizationMode<prf.NormalizationData.JNOR; ++iNormalizationMode) {
	  if (OutputVerbose)
	      fprintf(stderr, "Cutoff mode %i, normalization mode %i\n" , CutOffMode, NormItems[iNormalizationMode].NNOR); 
	  if ( CutOffMode == NormItems[iNormalizationMode].NNOR ) {
	    if (Mode < 0) {
	      Priority =  NormItems[NormalizationMode].NNPR;
	      Mode     =  NormItems[NormalizationMode].MNOR;
	      NormalizationMode = (size_t) iNormalizationMode;
	    } 
	    else if (NormItems[NormalizationMode].NNPR < Priority) {
	      Priority =  NormItems[NormalizationMode].NNPR;
	      Mode     =  NormItems[NormalizationMode].MNOR;
	      NormalizationMode = (size_t) iNormalizationMode;
	    }
	  }
	}
      }
      
      switch(Mode) {
	case 0:
	  NormalizedToRawFunction = &N2R_1;
	  RawToNormalizedFunction = &R2N_1;
	  if (OutputVerbose)
	    fprintf(stderr, "Normalization using %s function\n", prf.NormalizationData.CNOR[0]);
	  break;
	case 1:
	  NormalizedToRawFunction = &N2R_2;
	  RawToNormalizedFunction = &R2N_2;
	  if (OutputVerbose)
	    fprintf(stderr, "Normalization using %s function\n", prf.NormalizationData.CNOR[1]);
	  break;
	case 2:
	  NormalizedToRawFunction = &N2R_3;
	  RawToNormalizedFunction = &R2N_3;
	  if (OutputVerbose)
	    fprintf(stderr, "Normalization using %s function\n", prf.NormalizationData.CNOR[2]);
	  InitAverage(&(prf.Scores), prf.Length, prf.Alphabet_Length, Average);
	  if (Average == 0) {
	    fputs("Unable to allocate memory for normalization averaging.n", stderr);
	    goto END;
	  }
	  break;
	default:
	  fprintf(stderr, "No normalization data satisfies level %i modes.\n", Level);
  // 	NormalizedToRawFunction = 0;
  // 	RawToNormalizedFunction = 0;
	  goto END;
      }
      Normalization = &NormItems[NormalizationMode];
      if (OutputVerbose)
      {
        fprintf(stderr, "Normalization Mode %i with coeffs: ", Normalization->NNOR);
        for (int c=0; c<5; ++c) fprintf(stderr,"%f ", Normalization->RNOP[c]);
        fputs("\n", stderr);
      }
      prf.Mode = NormalizationMode;
    }
  }
  /* 
   * Read the FASTA file 
   */
  
  gettimeofday(&_t0,0);
  if (!ImportIndices) {
    res = AnalyzeFASTAStructure(DB, &FASTA);
  } else {
    FILE* inIndex = fopen(ImportFileName, "rb");
    if (inIndex != NULL) {
      res = ImportFASTAStructure(inIndex, &FASTA);
      fclose(inIndex);
    } else {
      if (OutputVerbose)
        fprintf(stderr,"Unable to open index file %s, will analyze database instead.\n",ImportFileName); 
      res = AnalyzeFASTAStructure(DB, &FASTA);
    }
  }
  gettimeofday(&_t1,0);
  {
    const double T = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
    if (OutputVerbose)
        fprintf(stderr, "Sequence file indexing took %lf seconds.\n", T);
  }
  if (res != 0) {
    fputs("Error found.\n", stderr);
    return 1;
  }
  
  if (OutputVerbose)
  {
    fprintf(stderr,
	  "FASTA file %s analyzed\n"
	  "\tFound %lu sequences within %lu bytes\n" 
	  "\tBiggest sequence entry is %lu bytes\n",
          DB, FASTA.SequenceCount, FASTA.FileSize, FASTA.MaxSequenceSize);
  }
  
  if (ExportIndices) {
    FILE *io = fopen(ExportFileName, "wb");
    if ( io != NULL ) {
      if (ExportFASTAStructure(io, &FASTA)>0) {
        if (OutputVerbose)
            fprintf(stderr, "Export of indices failed, check space for %s\n", ExportFileName);
      } else {
        if (OutputVerbose)
	        fprintf(stderr, "Export of indices to file %s\n", ExportFileName);
      }
      fclose(io);
    } else {
        if (OutputVerbose)
            fprintf(stderr, "Export of indices failed, check write permission for %s\n", ExportFileName);
    }
  }
  
#ifdef __NUMA__
  if (!(System.NumaAble && noNUMA == 0)) {
#endif
  ////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  //                                       NOT NUMA                                             //
  ////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  
  /* 
   * Retrieve number of cores 
   */
  nCPUs = (nCPUs == 0) ? (size_t) System.nOverallCores : nCPUs;
  
#ifdef __USE_AFFINITY__
  if (noAffinity) {
    // -----------------------------------------------------------------------------
    //                        ***  NO AFFINITY ***
    // -----------------------------------------------------------------------------
    if (OutputVerbose)
        fputs("Thread affinity disabled\n", stderr); 
    Thread_count[0] = System.nOverallCores;
    Thread_masks[0] = (cpu_set_t*) malloc(System.nOverallCores*sizeof(cpu_set_t));
    for (size_t thread=0; thread<System.nOverallCores; ++thread) {
      CPU_ZERO(&Thread_masks[0][thread]);
      for (int i=0; i<(int) System.nOverallCores; ++i) CPU_SET(i, &Thread_masks[0][thread]);
    }
  } else if (GivenAffinityFile) {
    // -----------------------------------------------------------------------------
    //                     ***  INPUT FILE HOLDING NASKS ***
    // -----------------------------------------------------------------------------
    if (OutputVerbose)
        fprintf(stderr,"Parsing file %s for affinity mask and number of threads\n", AffinityMaskFileName);
    FILE* in = fopen(AffinityMaskFileName, "r");
    if (in == NULL) {
	fprintf(stderr, "Cannot open thread affinity file %s.\n", optarg);
	exit(1);
    }
    size_t lines = 0;
    while (!feof(in)) {
	int num = fread(buffer, sizeof(char), 64, in);
	for (unsigned int i=0; i<num; i++)
	    if (buffer[i] == '\n') lines++;
    }
    rewind(in);
    if (lines != 0) {
	if (lines > System.nOverallCores) lines = System.nOverallCores;
	Thread_masks[0] = (cpu_set_t*) malloc(lines*sizeof(cpu_set_t));
	for (size_t i=0; i<lines; i++) {
	    fscanf(in, "%s\n", buffer);
	    const size_t tmp_size = strlen(buffer) - 1;
	    CPU_ZERO(&Thread_masks[0][i]);
	    for (int j=tmp_size; j>=0; j--) {
		if (buffer[j] != '0') CPU_SET(j, &Thread_masks[0][i]);
	    }
	}
	Thread_count[0] = lines;
    if (OutputVerbose)
	    fprintf(stderr,"Found %2lu threads affinity masks.",nCPUs);
    } else {
    if (OutputVerbose)
	    printf("Cannot understand cpu mask, keep on normally\n");
    }
    fclose(in);
  } else if ( split ) {
    // -----------------------------------------------------------------------------
    //                 ***  HALF SSE 2 HALF SSE 4.1 HYPERTHREADING***
    // -----------------------------------------------------------------------------
    Thread_count[0] = getMasks(&System, -1, -1, 1, &Thread_masks[0]);
    if (Thread_count[0] == 0) {
      fputs("No potential affinity mask found !!!\n", stderr);
      exit(0);
    }
    Thread_count[1] = getMasks(&System, -1, -1, 2, &Thread_masks[1]);
    if (Thread_count[1] == 0) {
      fputs("No potential affinity mask found with hyperthreading !!!\n", stderr);
      exit(0);
    }
    if (OutputVerbose)
        fprintf(stderr, "%u threads will use SSE 4.1 and %u SSE 2\n", Thread_count[0], Thread_count[1]);
  } else if (noSharedCore) {
    if (OutputVerbose)
        fputs("No sharing of core resources will be used: Intel Hyperthreading or AMD Compute Unit\n", stderr);
    Thread_count[0] = getMasks(&System, -1, -1, 1, &Thread_masks[0]);
    if (Thread_count[0] == 0) {
      fputs("No potential affinity mask found !!!\n", stderr);
      exit(0);
    }
  } else {
    // -----------------------------------------------------------------------------
    //                        *** OPERATING SYSTEM CHOICE ***
    // -----------------------------------------------------------------------------
    Thread_count[0] = getMasks(&System, -1, -1, -1, &Thread_masks[0]);
    if (Thread_count[0] == 0) {
      fputs("No potential affinity mask found !!!\n", stderr);
      exit(0);
    }
  }

  {
    register size_t total = (size_t) (Thread_count[0] + Thread_count[1]);
    if (nCPUs > total) nCPUs = total;
  }
      
  threads_attr = (pthread_attr_t*) alloca(nCPUs*sizeof(pthread_attr_t));
  {
    register const cpu_set_t * current = &Thread_masks[0][0]; 
    for (size_t i=0; i<nCPUs; ++i) {
      pthread_attr_init(&threads_attr[i]);
      if (i == (size_t) Thread_count[0]) current = &Thread_masks[1][0];
      pthread_attr_setaffinity_np(&threads_attr[i], sizeof(cpu_set_t), current);
      ++current;
    }
  } 
#endif
  if (OutputVerbose)
    fprintf(stderr, "Job dispatched over %lu cores.\n", nCPUs);
  
#ifdef __USE_MMAP__
  fd = open(DB, O_RDONLY );
  length = (size_t) FASTA.FileSize;
  SequenceFileMap = (char *) mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0);
  if (SequenceFileMap == NULL) {
    fputs("Unable to map sequence file to memory\n", stderr);
    exit(1);
  } 
#endif
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // HEURISTIC
  ////////////////////////////////////////////////////////////////////////////////////////////////

  /* Get Heuristic cutoff from command line */
  if (HeuristicCutOff >= 0) prf.CutOffData.Values[prf.Level].HCUT = (unsigned int) HeuristicCutOff;
  if (OutputVerbose)
    fprintf(stderr,"Heuristic cutoff set to %u\n", prf.CutOffData.Values[prf.Level].HCUT);

  /* Prepare structure common to filter and alignment */
  shares = alloca((nCPUs+1)*sizeof(size_t));
  
  /* Allocate memory for sequence YesNo or ID to be done */
  YesNoID = _mm_malloc( FASTA.SequenceCount*sizeof(__32bitData), 16);
  if (YesNoID == NULL) {
      fputs("Cannot allocate memory.\n", stderr);
      goto END;
  }
    
  /* Allocate stack memory for posix thread structures */
  threads_arg = alloca(nCPUs*sizeof(struct ThreadData));
  threads = (pthread_t*) alloca(nCPUs*sizeof(pthread_t));
  
  /* Dispatch to threads */
    for (size_t i=0; i<nCPUs; ++i) {
      threads_arg[i].prf                       = &prf;
      threads_arg[i].FASTA                     = &FASTA;
      threads_arg[i].Array                     = YesNoID;
#ifdef __USE_MMAP__
      threads_arg[i].SequenceFileMap           = SequenceFileMap;
# ifdef NUMA_DEBUG
      threads_arg[i].maplength                 = &length;
# endif
#else
      threads_arg[i].SequenceFileName          = DB;
#endif
      threads_arg[i].threadId                  = i;
    }
  
  DumpBoth = DumpHeuristicOnly && DumpFilterOnly;
  if (prf.CutOffData.Values[prf.Level].HCUT > 0 || DumpHeuristicOnly && !DumpFilterOnly) {
    /* Compute Match Score Matrix transpose */ 
    gettimeofday(&_t0,0);
    TransposeMatrix TIMatch;
#ifdef __USE_AFFINITY__
    TransposeMatrix TIMatch1;
    if (split) {
      TIMatch.i  = TransposeAndConvertMatchMatrix(&(prf.Scores), prf.Alphabet_Length, prf.Length);
      TIMatch1.f = TransposeAndConvertToFloatMatchMatrix(&(prf.Scores), prf.Alphabet_Length, prf.Length);
    } else 
#endif
    if (SSE41) {
      TIMatch.i = TransposeAndConvertMatchMatrix(&(prf.Scores), prf.Alphabet_Length, prf.Length);
    } else {
      TIMatch.f = TransposeAndConvertToFloatMatchMatrix(&(prf.Scores), prf.Alphabet_Length, prf.Length);
    }
    gettimeofday(&_t1,0);
    {
      const double t = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
      if (OutputVerbose)
        fprintf(stderr,"Transposing Match matrix took %lf seconds.\n", t);
    }
    
    /* Limit number of threads for heuristic */
    if ( nCPUsHeuristic == 0) nCPUsHeuristic = nCPUs;
    
    /* Share according to file size */
    {
      size_t FileShare = (size_t) FASTA.FileSize / nCPUsHeuristic;
      FileShare += ((size_t) FASTA.FileSize % nCPUsHeuristic) > (nCPUsHeuristic-1) ? 1 : 0;
      const s_Data * DataPtr = FASTA.DataPtr;
      register size_t counter = 0;
      shares[0] = 0;
      for (size_t i=1; i<nCPUsHeuristic; ++i) {
	register size_t tmp = i*FileShare;
	while ( (size_t) DataPtr->Offset < tmp) { ++DataPtr; ++counter; }
	shares[i] = counter;
      }
      shares[nCPUsHeuristic] = FASTA.SequenceCount;
    }
    
    gettimeofday(&_t0,0);
#ifdef __USE_AFFINITY__
    if (split) {
      for (size_t i=0; i<Thread_count[0]; ++i) {
	threads_arg[i].start                     = shares[i];
	threads_arg[i].stop                      = shares[i+1];
	threads_arg[i].TransposeMatch            = TIMatch;
	if (pthread_create (&threads[i],  &threads_attr[i], thread_heuristic_sse41,  (void*) &threads_arg[i]) != 0) 
	{
	  fputs("Fail to create thread.\n", stderr);
	  exit(0);
	}
      }
      for (size_t i=0; i<Thread_count[1]; ++i) {
	threads_arg[Thread_count[0]+i].start                     = shares[Thread_count[0]+i];
	threads_arg[Thread_count[0]+i].stop                      = shares[Thread_count[0]+i+1];
	threads_arg[Thread_count[0]+i].TransposeMatch            = TIMatch1;
	if (pthread_create (&threads[Thread_count[0]+i],  &threads_attr[Thread_count[0]+i], thread_heuristic_sse2,  (void*) &threads_arg[Thread_count[0]+i]) != 0) 
	{
	  fputs("Fail to create thread.\n", stderr);
	  exit(0);
	}
      }
    } else {
#endif
      for (size_t i=0; i<nCPUsHeuristic; ++i) {
	threads_arg[i].start                     = shares[i];
	threads_arg[i].stop                      = shares[i+1];
	threads_arg[i].TransposeMatch            = TIMatch;
#ifdef __USE_AFFINITY__
	if (pthread_create (&threads[i],  &threads_attr[i], thread_heuristic_ptr,  (void*) &threads_arg[i]) != 0) 
#else
	if (pthread_create (&threads[i],  NULL, thread_heuristic_ptr,  (void*) &threads_arg[i]) != 0) 
#endif
	{
	  fputs("Fail to create thread.\n", stderr);
	  exit(0);
	}
      }
#ifdef __USE_AFFINITY__ 
    }
#endif
    for (size_t i=0; i<nCPUsHeuristic; ++i) {
      pthread_join(threads[i], NULL);  
    }
    gettimeofday(&_t1,0);
    double t = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
    
    _mm_free(TIMatch.f);
#ifdef __USE_AFFINITY__
    if (split) _mm_free(TIMatch1.f);
#endif
    
    /* If only the heuristic scores are to be dumped, output them and quit */
    if (DumpHeuristicOnly && !DumpBoth) {
      /* Allocate memory to hold sequence */
      SeqData.Data.Memory = malloc(FASTA.MaxSequenceSize*sizeof(unsigned char));
      if (SeqData.Data.Memory == NULL) {
	fputs("Thread Cannot allocate memory for sequence.\n", stderr);
	_mm_free(YesNoID);
	goto END;
      }
      
      /* Open sequence file*/
#ifndef __USE_MMAP__
      FILE* inSequence = fopen(DB, "r");
#endif    

      for (size_t iseq=0; iseq<FASTA.SequenceCount; ++iseq) {
	/* Read sequence */
#  ifndef __USE_MMAP__
	PFSeq = ReadSequenceIndex(&SeqData, iseq, inSequence, FASTA.DataPtr);
#  else
	PFSeq = MMAPReadSequenceIndex(&SeqData, iseq, SequenceFileMap, FASTA.DataPtr, 0
#    ifdef MMAP_DEBUG
      , ((struct ThreadData*) _Data)->threadId, 0, *(((struct ThreadData*) _Data)->maplength)
#    endif
      );
#  endif
	/* Translate first sequence */
	PFSeq = TranslateSequenceToIndex(PFSeq, prf.Alphabet_Mapping);
	
	char * ptr = SeqData.Data.Header;
	while (*ptr != ' ' && *ptr != '\n') ptr++;
	if (*ptr == ' ') *ptr = '\0';
	
	/* Output results */
	fprintf(stdout, "%u\t%s\n", YesNoID[iseq].UnsignedScores, SeqData.Data.Header);
      }
      
#ifndef __USE_MMAP__
      fclose(inSequence);
#endif  
      _mm_free(YesNoID);
      free(SeqData.Data.Memory);
      goto END;
    }
    
    if (DumpBoth) {
       HeuristicCounter = FASTA.SequenceCount;
       HeuristicScores = _mm_malloc(FASTA.SequenceCount*sizeof(unsigned int), 16);
       if ( HeuristicScores == NULL) {
	 fputs("Unable to allocate memory for the heuristic scores\n",stderr);
	  exit(1);
       }
       memcpy(HeuristicScores, YesNoID, FASTA.SequenceCount*sizeof(unsigned int));
    }
 
    /* Gather the ones that passed the heuristic */
    HeuristicCounter = 0;
    register const unsigned int lHeuristicCutOff = prf.CutOffData.Values[prf.Level].HCUT;
    for (size_t iseq=0; iseq<FASTA.SequenceCount; ++iseq) {
      if (YesNoID[iseq].UnsignedScores >= lHeuristicCutOff) {
	YesNoID[HeuristicCounter].ToDoID = (unsigned int) iseq;
	++HeuristicCounter;
      }
    }

    if (OutputVerbose)
        fprintf(stderr,"Overall there are %lu/%lu sequences passing heuristic. These took %lf seconds to treat on %lu cores.\n",
	    HeuristicCounter, FASTA.SequenceCount, t, nCPUsHeuristic);
    
    /* Print out the sequences passing heuristic cutoff */
    if (DumpFilterSequences) {
      /* Allocate memory to hold sequence */
      SeqData.Data.Memory = malloc(FASTA.MaxSequenceSize*sizeof(unsigned char));
      if (SeqData.Data.Memory == NULL) {
	fputs("Program cannot allocate memory for sequence.\n", stderr);
	_mm_free(YesNoID);
	goto END;
      }
      
#ifndef __USE_MMAP__
      FILE* inSequence = fopen(DB, "r");
#endif
      
      for (size_t iseq=0; iseq<HeuristicCounter; ++iseq) {
	  /* Read sequence */
	  const size_t sequence_index = YesNoID[iseq].ToDoID;
#ifndef __USE_MMAP__
	  PFSeq = ReadSequenceIndex(&SeqData, sequence_index, inSequence, FASTA.DataPtr);
#else
	  PFSeq = MMAPReadSequenceIndex(&SeqData, sequence_index, SequenceFileMap, FASTA.DataPtr, 0
#  ifdef MMAP_DEBUG
      , ((struct ThreadData*) _Data)->threadId, 0, *(((struct ThreadData*) _Data)->maplength)
#  endif
      );
#endif
	  /* Output results */
	  fprintf(stdout, "%s\n%s\n", SeqData.Data.Header, SeqData.ProfileData.ProfileIndex);
	}
#ifndef __USE_MMAP__
      fclose(inSequence);
#endif
      free(SeqData.Data.Memory);
      goto END;
    }
    if (DumpBoth) {
      HeuristicCounter = FASTA.SequenceCount;
      for (size_t iseq=0; iseq<FASTA.SequenceCount; ++iseq) {
	 YesNoID[iseq].ToDoID  = (unsigned int) iseq;
      }
    }
  }  else {
    if (OutputVerbose)
        fputs("Bypassing heuristic computation...\n",stderr);
    HeuristicCounter = FASTA.SequenceCount;
    for (size_t iseq=0; iseq<FASTA.SequenceCount; ++iseq) YesNoID[iseq].ToDoID = (unsigned int) iseq;
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // FILTER
  ////////////////////////////////////////////////////////////////////////////////////////////////
  {
    if (OutputVerbose)
        fprintf(stderr,"Filter cutoff set to %i\n", prf.CutOffData.Values[prf.Level].ICUT);
    
    /* Allocate memory for the filter scores */
    FilterScores = _mm_malloc(HeuristicCounter*sizeof(int), 16);
    if (FilterScores == NULL) {
      fputs("Unable to allocate memory for the filter scores\n",stderr);
      exit(1);
    }
    
    /* Compute the new share for each thread */
    size_t SequenceShare = HeuristicCounter / nCPUs;
    SequenceShare += (HeuristicCounter % nCPUs) > (nCPUs-1) ? 1 : 0;
    shares[0] = 0;
    for (size_t i=1; i<nCPUs; ++i) shares[i] = i*SequenceShare;
    
    shares[nCPUs] = HeuristicCounter;
    
    /* Dispatch to threads */
    {
      const unsigned int realFilterScore = DumpFilterOnly ? 1 : 0;
      gettimeofday(&_t0,0);
      for (size_t i=0; i<nCPUs; ++i) {
	threads_arg[i].start        = shares[i];
	threads_arg[i].stop         = shares[i+1];
	threads_arg[i].FilterScores = FilterScores;
	threads_arg[i].counter      = realFilterScore;
	if (pthread_create (&threads[i],
#ifdef __USE_AFFINITY__	
			    &threads_attr[i],
#else
			    NULL,
#endif
			    thread_xali1,
			    (void*) &threads_arg[i]) != 0) 
	{
	  return 1;
	}
      }
    }

    for (size_t i=0; i<nCPUs; i++) {
      pthread_join(threads[i], NULL);  
    }

    gettimeofday(&_t1,0);

    if (DumpFilterOnly) {
      /* Allocate memory to hold sequence */
      SeqData.Data.Memory = malloc(FASTA.MaxSequenceSize*sizeof(unsigned char));
      if (SeqData.Data.Memory == NULL) {
	fputs("Pfsearch cannot allocate memory for sequence.\n", stderr);
	_mm_free(YesNoID);
	_mm_free(FilterScores);
	goto END;
      }
      
      /* Open sequence file*/
#ifndef __USE_MMAP__
      FILE* inSequence = fopen(DB, "r");
#endif    

      for (size_t iseq=0; iseq<HeuristicCounter; ++iseq) {
	/* Read sequence */
#  ifndef __USE_MMAP__
	PFSeq = ReadSequenceIndex(&SeqData,iseq, inSequence, FASTA.DataPtr);
#  else
	PFSeq = MMAPReadSequenceIndex(&SeqData, iseq, SequenceFileMap, FASTA.DataPtr, 0
#    ifdef MMAP_DEBUG
      , ((struct ThreadData*) _Data)->threadId, 0, *(((struct ThreadData*) _Data)->maplength)
#    endif
      );
#  endif
	/* Translate first sequence */
	PFSeq = TranslateSequenceToIndex(PFSeq, prf.Alphabet_Mapping);
	
	char * ptr = SeqData.Data.Header;
	while (*ptr != ' ' && *ptr != '\n') ptr++;
	if (*ptr == ' ') *ptr = '\0';
	
	/* Output results */
	if (DumpBoth) 
	  fprintf(stdout, "%u\t%i\t%s\n", HeuristicScores[iseq], FilterScores[iseq], SeqData.Data.Header);
	else 
	  fprintf(stdout, "%i\t%s\n", FilterScores[iseq], SeqData.Data.Header);
      }
      
#ifndef __USE_MMAP__
      fclose(inSequence);
#endif  
      _mm_free(YesNoID);
      free(SeqData.Data.Memory);
      _mm_free(FilterScores);
      if (DumpBoth) _mm_free(HeuristicScores);
      goto END;
    } 
    
    
    /* Gather the ones that passed xali1 */
    FilterCounter = 0;
    register const int lFilterCutoff = prf.CutOffData.Values[prf.Level].ICUT;
    for (size_t iseq=0; iseq<HeuristicCounter; ++iseq) {
	if ( FilterScores[iseq] >= lFilterCutoff ) {
	  YesNoID[FilterCounter].ToDoID = YesNoID[iseq].ToDoID;
	  ++FilterCounter;
	}
    }
    
    const double t = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
    if (OutputVerbose)
        fprintf(stderr,"Overall there are %lu/%lu sequences passing filter. These took %lf seconds to treat on %li cores.\n",
	    FilterCounter, HeuristicCounter, t, nCPUs);
    
    /* Print out the sequences passing heuristic and filter cutoff */
    if (DumpAlignmentSequences) {
      /* Allocate memory to hold sequence */
      SeqData.Data.Memory = malloc(FASTA.MaxSequenceSize*sizeof(unsigned char));
      if (SeqData.Data.Memory == NULL) {
	fputs("Program cannot allocate memory for sequence.\n", stderr);
	_mm_free(YesNoID);
	goto END;
      }
      
#ifndef __USE_MMAP__
      FILE* inSequence = fopen(DB, "r");
#endif
      
      for (size_t iseq=0; iseq<FilterCounter; ++iseq) {
	  /* Read sequence */
	  const size_t sequence_index = YesNoID[iseq].ToDoID;
#ifndef __USE_MMAP__
	  PFSeq = ReadSequenceIndex(&SeqData, sequence_index, inSequence, FASTA.DataPtr);
#else
	  PFSeq = MMAPReadSequenceIndex(&SeqData, sequence_index, SequenceFileMap, FASTA.DataPtr, 0
#  ifdef MMAP_DEBUG
      , ((struct ThreadData*) _Data)->threadId, 0, *(((struct ThreadData*) _Data)->maplength)
#  endif
      );
#endif
	  /* Output results */
	  fprintf(stdout, "%s\n%s\n", SeqData.Data.Header, SeqData.ProfileData.ProfileIndex);
	}
#ifndef __USE_MMAP__
      fclose(inSequence);
#endif
      free(SeqData.Data.Memory);
      _mm_free(FilterScores);
      goto END;
    }    
    _mm_free(FilterScores);
    
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // ALIGNMENT
  ////////////////////////////////////////////////////////////////////////////////////////////////
  if (FilterCounter > 0) {
    /* Initialize the print mutex */
    pthread_mutex_init(&PrintLock, NULL);
    
    /* Allocate memory for the alignment */
    const size_t AlignedSequencesStep = NALI*(prf.Length+1)*3;
    char * AlignedSequences = malloc(nCPUs*AlignedSequencesStep*sizeof(char));
    if (AlignedSequences == NULL) {
      fputs("Unable to allocate memory for resulting aligned sequences.\n", stderr);
      exit(1);
    }
    
    /* Compute the new share for each thread */
    {
      size_t SequenceShare = FilterCounter / nCPUs;
      SequenceShare += (FilterCounter % nCPUs) > (nCPUs-1) ? 1 : 0;
      shares[0] = 0;
      for (size_t i=1; i<nCPUs; ++i) {
	shares[i] = i*SequenceShare;
//  	fprintf(stderr,"share %lu starts at %lu and stops at %lu\n", i, shares[i-1], shares[i]);
      }
      shares[nCPUs] = FilterCounter;
//       fprintf(stderr,"share %lu starts at %lu and stops at %lu\n", nCPUs, shares[nCPUs-1], shares[nCPUs]);
    }
  
    /* Dispatch to threads */
    gettimeofday(&_t0,0); 
    for (size_t i=0; i<nCPUs; ++i) {
      threads_arg[i].Sequences   = &AlignedSequences[i*AlignedSequencesStep];
      threads_arg[i].start       = shares[i];
      threads_arg[i].stop        = shares[i+1];
#ifdef __USE_AFFINITY__
      if (pthread_create (&threads[i],  &threads_attr[i], thread_xaliPT,  (void*) &threads_arg[i]) != 0)
#else
      if (pthread_create (&threads[i],  NULL, thread_xaliPT,  (void*) &threads_arg[i]) != 0)
#endif
      {
	return 1;
      }
    }

    for (size_t i=0; i<nCPUs; i++) {
      pthread_join(threads[i], NULL);  
    }
    gettimeofday(&_t1,0);
   
    unsigned int AlignedSequencesCounter = threads_arg[0].counter;
    for (size_t i=1; i<nCPUs; i++) AlignedSequencesCounter += threads_arg[i].counter;
    
    {
      const double t = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
      if (OutputVerbose)
        fprintf(stderr,"Overall there are %u aligned sequences found. These took %lf seconds to align on %li cores.\n", AlignedSequencesCounter, t, nCPUs);
    }
    
    /* Free the print mutex */
    pthread_mutex_destroy(&PrintLock);
  }

  _mm_free(YesNoID);
#ifdef __USE_MMAP__
  munmap((void*)SequenceFileMap, length);
  close(fd);
#endif
#ifdef __USE_AFFINITY__
  if (Thread_masks[0]) free(Thread_masks[0]);
  if (Thread_masks[1]) free(Thread_masks[1]);
#endif
#ifdef __NUMA__
  } 
  else 
  {
  unsigned int * restrict SeqID = NULL;   
  ////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  //                                        NUMA                                                //
  ////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  
  size_t nHeuristicPassed, nFilterPassed;
  numa_set_strict(1);
  
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Initialization
  ////////////////////////////////////////////////////////////////////////////////////////////////
  pthread_mutex_init(&NodeMutex, NULL);
  pthread_cond_init(&NodeCond, NULL);
  pthread_mutex_init(&MasterMutex, NULL);
  pthread_cond_init(&MasterCond, NULL);
  
  nNodes = (nNodes == 0) ? (size_t) System.nNodes : nNodes;
  nThreadsPerNodes = (nThreadsPerNodes == 0) ? (size_t) System.nCpusPerNode : nThreadsPerNodes;
  
  if (nCPUs != 0) {
      if (OutputVerbose)
        fputs("In NUMA mode --nthreads is useless, try setting --nnodes instead\n", stderr);
  }
//   nCPUs = (nCPUs == 0) ? (size_t) sysconf(_SC_NPROCESSORS_CONF) : nCPUs;
  nCPUs = nNodes*nThreadsPerNodes;
  
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // Node memory allocation
  ////////////////////////////////////////////////////////////////////////////////////////////////
  
  /* Share according to file size */  
  size_t * Shares = alloca((nCPUs+1)*sizeof(size_t));
  {
    size_t FileShare = (size_t) FASTA.FileSize / nCPUs;
    FileShare += ((size_t) FASTA.FileSize % nCPUs) > (nCPUs-1) ? 1 : 0;
    const s_Data * DataPtr = FASTA.DataPtr;
    size_t counter = 0;
    Shares[0] = 0;
    for (size_t i=1; i<nCPUs; ++i) {
      register size_t tmp = i*FileShare;
      while ( (size_t) DataPtr->Offset < tmp) { ++DataPtr; ++counter; }
      Shares[i] = counter;
//       printf("NodeShares %i starts at %li and stops at %li\n", i, Shares[i-1], counter);
    }
    Shares[nCPUs] = FASTA.SequenceCount;
//     printf("NodeShares %i starts at %li and stops at %li\n", nCPUs, Shares[nCPUs-1], Shares[nCPUs]);
  }
  
  /* Allocation is based upon the maximum requested for all nodes */
  size_t MaxNodeShare = 0, NodeShare = 0; //, nNodes = 1;
  for (size_t i=1; i<=nCPUs; ++i) {
    NodeShare += Shares[i];
    if ( i % nThreadsPerNodes == 0 ) {
	MaxNodeShare = MaxNodeShare < NodeShare ? NodeShare : MaxNodeShare;
	NodeShare = 0;
// 	if ( nCPUs-i > 0 ) ++nNodes;
    } 
  }
  MaxNodeShare = MaxNodeShare < NodeShare ? NodeShare : MaxNodeShare;
     
  if (OutputVerbose)
    fprintf(stderr, "Job dispatched over %lu cores using %lu/%u nodes each having %lu/%u threads.\n",
	  nCPUs, nNodes, System.nNodes, nThreadsPerNodes, System.nCpusPerNode);
  
  __32bitData ** YesNoID = (__32bitData**) alloca(nNodes*sizeof(__32bitData*));
  register _Bool OK = true;
  for (size_t i=0; i<nNodes; ++i) {
      YesNoID[i] = numa_alloc_onnode(MaxNodeShare*sizeof(__32bitData), i);
      OK &= YesNoID[i] != NULL ? true : false;
  }
  if (!OK) {
      fputs("NUMA local allocation on nodes was not possible, not enough memory.\n", stderr);
      exit(1);
  }
  
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // HEURISTIC
  ////////////////////////////////////////////////////////////////////////////////////////////////

  /* Get Heuristic cutoff from command line */
  prf.HeuristicCutOff = HeuristicCutOff;
  printf("Heuristic cutoff set to %f\n", prf.HeuristicCutOff);
  
  /* Allocate stack memory for posix thread structures */
  struct numa_NodeData *threads_arg = alloca(nNodes*sizeof(struct numa_NodeData));
  pthread_t *threads = (pthread_t*) alloca(nNodes*sizeof(pthread_t));
  
  NodeCounter = nNodes;

  /* Dispatch to threads */
  gettimeofday(&_t0,0);
  for (size_t i=0; i<nNodes; ++i) {
    threads_arg[i].prf                       = &prf;
    threads_arg[i].FASTA                     = &FASTA;
    threads_arg[i].Array                     = YesNoID[i];
    threads_arg[i].SequenceFileName          = DB;
    threads_arg[i].shares                    = Shares;
    threads_arg[i].NodeId                    = i;
    threads_arg[i].shareStart                = i*nThreadsPerNodes;
    if ((i+1)*(size_t)nThreadsPerNodes < nCPUs) {
      threads_arg[i].nthreads                = nThreadsPerNodes;
    } else {
      threads_arg[i].nthreads                = (nCPUs - i*nThreadsPerNodes);
    }
    if (pthread_create (&threads[i],  NULL, numa_node_ptr,  (void*) &threads_arg[i]) != 0) {
      fputs("Fail to create NUMA master node thread.\n", stderr);
      exit(1);
    }
  }
  
  /* Wait for master nodes */
  pthread_mutex_lock(&NodeMutex);
  if (NodeCounter > 0) pthread_cond_wait(&NodeCond, &NodeMutex);
  pthread_mutex_unlock(&NodeMutex);

  gettimeofday(&_t1,0);
  const double t = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;

  nHeuristicPassed = Shares[0];
  for (size_t i=1; i<nNodes; ++i) nHeuristicPassed += Shares[i];
  
  if (OutputVerbose)
    fprintf(stderr, "Overall %lu sequences passed heuristic test in %lf [s].\n", nHeuristicPassed, t);
  if (nHeuristicPassed == 0) goto CLEAN;
  
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // FILTER
  ////////////////////////////////////////////////////////////////////////////////////////////////
  /* Allocate master memory to hold sequences ID */
  SeqID = _mm_malloc(nHeuristicPassed*sizeof(unsigned int), 16);
  if (SeqID == NULL) {
      fputs("Master unable to allocate memory\n", stderr);
      goto CLEAN;
  }
  {
    unsigned int * restrict SeqIDptr = SeqID;
    for (size_t i=0; i<nNodes; ++i) {
	memcpy(SeqIDptr, YesNoID[i], Shares[i]*sizeof(unsigned int));
	SeqIDptr += Shares[i];
    }
    
    /* Share according to overall number of cores */  
    size_t CoreShare = nHeuristicPassed / nCPUs;
    CoreShare += (nHeuristicPassed % nCPUs) > (nCPUs-1) ? 1 : 0;
    Shares[0] = 0;
    for (size_t i=1; i<nCPUs; ++i) {
      Shares[i] = i*CoreShare;
      if (OutputVerbose)
        fprintf(stderr,"NodeShares %lu starts at %lu and stops at %lu\n", i, Shares[i-1], Shares[i]);
    }
    Shares[nCPUs] = nHeuristicPassed;
    if (OutputVerbose)
        fprintf(stderr,"NodeShares %lu starts at %lu and stops at %lu\n", nCPUs, Shares[nCPUs-1], Shares[nCPUs]);
    
    SeqIDptr = SeqID;
    const size_t datasize = nThreadsPerNodes*CoreShare;
    for (size_t i=0; i<nNodes-1; ++i) {
	memcpy(YesNoID[i], SeqIDptr, datasize*sizeof(unsigned int));
	SeqIDptr += datasize;
    if (OutputVerbose)
	    fprintf(stderr,"Node %lu has %lu elements\n", i+1, datasize);
    }
    memcpy(YesNoID[nNodes-1], SeqIDptr, (nHeuristicPassed - (nNodes-1)*datasize)*sizeof(unsigned int));
    if (OutputVerbose)
        fprintf(stderr,"Node %lu has %lu elements\n", nNodes, (nHeuristicPassed - (nNodes-1)*datasize));
  }
  
#ifdef NUMA_DEBUG
  if (OutputVerbose)
    fputs("Master triggering filter phase\n", stderr);
#endif
  gettimeofday(&_t0,0);
  
  pthread_mutex_lock(&MasterMutex);
  NodeCounter = nNodes;
  MasterDone = false;
  pthread_cond_broadcast(&MasterCond);
  pthread_mutex_unlock(&MasterMutex);
  
  /* Wait for master nodes */
  pthread_mutex_lock(&NodeMutex);
  if (NodeCounter > 0) pthread_cond_wait(&NodeCond, &NodeMutex);
  pthread_mutex_unlock(&NodeMutex);

  gettimeofday(&_t1,0);
  
  nFilterPassed = Shares[0];
  {
    for (size_t i=1; i<nNodes; ++i) nFilterPassed += Shares[i];
    const double t = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
    if (OutputVerbose)
        fprintf(stderr, "Overall %lu sequences passed filter test in %lf [s].\n",nFilterPassed, t);
  }
  ////////////////////////////////////////////////////////////////////////////////////////////////
  // ALIGNMENT
  ////////////////////////////////////////////////////////////////////////////////////////////////
  {
    unsigned int * restrict SeqIDptr = SeqID;
    for (size_t i=0; i<nNodes; ++i) {
	memcpy(SeqIDptr, YesNoID[i], Shares[i]*sizeof(unsigned int));
	SeqIDptr += Shares[i];
    }
    
    /* Share according to overall number of cores */  
    size_t CoreShare = nFilterPassed / nCPUs;
    CoreShare += (nFilterPassed % nCPUs) > (nCPUs-1) ? 1 : 0;
    Shares[0] = 0;
    for (size_t i=1; i<nCPUs; ++i) {
      Shares[i] = i*CoreShare;
      if (OutputVerbose)
        fprintf(stderr,"NodeShares %lu starts at %lu and stops at %lu\n", i, Shares[i-1], Shares[i]);
    }
    Shares[nCPUs] = nFilterPassed;
    if (OutputVerbose)
        fprintf(stderr,"NodeShares %lu starts at %lu and stops at %lu\n", nCPUs, Shares[nCPUs-1], Shares[nCPUs]);
    
    SeqIDptr = SeqID;
    const size_t datasize = nThreadsPerNodes*CoreShare;
    for (size_t i=0; i<nNodes-1; ++i) {
	memcpy(YesNoID[i], SeqIDptr, datasize*sizeof(unsigned int));
	SeqIDptr += datasize;
    if (OutputVerbose)
	    fprintf(stderr,"Node %lu has %lu elements\n", i+1, datasize);
    }
    memcpy(YesNoID[nNodes-1], SeqIDptr, (nFilterPassed - (nNodes-1)*datasize)*sizeof(unsigned int));
    if (OutputVerbose)
        fprintf(stderr,"Node %lu has %lu elements\n", nNodes, (nFilterPassed - (nNodes-1)*datasize));
  }
  
#ifdef NUMA_DEBUG
  if (OutputVerbose)
    fputs("Master triggering alignment phase\n", stderr);
#endif
  gettimeofday(&_t0,0);
  
  pthread_mutex_lock(&MasterMutex);
  NodeCounter = nNodes;
  MasterDone = true;
  pthread_cond_broadcast(&MasterCond);
  pthread_mutex_unlock(&MasterMutex);
  
  /* Wait for master nodes */
  pthread_mutex_lock(&NodeMutex);
  if (NodeCounter > 0) pthread_cond_wait(&NodeCond, &NodeMutex);
  pthread_mutex_unlock(&NodeMutex);

  gettimeofday(&_t1,0);
  {
    size_t nAlignPassed = Shares[0];
    for (size_t i=1; i<nNodes; ++i) nFilterPassed += Shares[i];
    const double t = (double) (_t1.tv_sec - _t0.tv_sec) + (double) (_t1.tv_usec - _t0.tv_usec) * 0.000001;
    if (OutputVerbose)
        fprintf(stderr, "Overall %lu sequences passed alignment in %lf [s].\n",nFilterPassed, t);
  }
  
CLEAN:
  /* Destroy mutexes and condition variables */
  pthread_mutex_destroy(&NodeMutex);
  pthread_cond_destroy(&NodeCond);
  pthread_mutex_destroy(&MasterMutex);
  pthread_cond_destroy(&MasterCond);
  
  for (size_t i=0; i<nNodes; ++i) {
    numa_free(YesNoID[i], MaxNodeShare*sizeof(unsigned int));
  }
  
  } 
#endif
  /* Free Memory */
END:
  FreeProfile(&prf);
  FreeFASTAStructure(&FASTA);
  freeSystemInfo(&System);

  exit(0);
}
