/*
 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
 *
 * This file is part of Bowtie 2.
 *
 * Bowtie 2 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Bowtie 2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef EBWT_H_
#define EBWT_H_

#include <stdint.h>
#include <string.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <memory>
#include <fcntl.h>
#include <math.h>
#include <errno.h>
#include <stdexcept>
#include <sys/stat.h>
#ifdef BOWTIE_MM
#include <sys/mman.h>
#include <sys/shm.h>
#endif
#include "shmem.h"
#include "alphabet.h"
#include "assert_helpers.h"
#include "bitpack.h"
#include "blockwise_sa.h"
#include "endian_swap.h"
#include "word_io.h"
#include "random_source.h"
#include "ref_read.h"
#include "threading.h"
#include "str_util.h"
#include "mm.h"
#include "timer.h"
#include "reference.h"
#include "search_globals.h"
#include "ds.h"
#include "random_source.h"
#include "mem_ids.h"
#include "btypes.h"
#include "threadpool.h"

#ifdef POPCNT_CAPABILITY
#include "processor_support.h"
#endif

#if __cplusplus <= 199711L
#define unique_ptr auto_ptr
#endif

using namespace std;

// From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
extern uint8_t cCntLUT_4[4][4][256];

static const uint64_t c_table[4] = {
	0xffffffffffffffff,
	0xaaaaaaaaaaaaaaaa,
	0x5555555555555555,
	0x0000000000000000
};

#ifndef VMSG_NL
#define VMSG_NL(...)				\
	if(this->verbose()) {			\
		stringstream tmp;		\
		tmp << __VA_ARGS__ << endl;	\
		this->verbose(tmp.str());	\
	}
#endif

#ifndef VMSG
#define VMSG(...)				\
	if(this->verbose()) {			\
		stringstream tmp;		\
		tmp << __VA_ARGS__;		\
		this->verbose(tmp.str());	\
	}
#endif

/**
 * Flags describing type of Ebwt.
 */
enum EBWT_FLAGS {
	EBWT_COLOR = 2,     // true -> Ebwt is colorspace
	EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
	                    // concatenated string reversed, rather than
	// each stretch reversed
};

/**
 * Extended Burrows-Wheeler transform header.  This together with the
 * actual data arrays and other text-specific parameters defined in
 * class Ebwt constitute the entire Ebwt.
 */
class EbwtParams {

public:
	EbwtParams() { }

	EbwtParams(
		TIndexOffU len,
		int32_t lineRate,
		int32_t offRate,
		int32_t ftabChars,
		bool color,
		bool entireReverse)
		{
			init(len, lineRate, offRate, ftabChars, color, entireReverse);
		}

	EbwtParams(const EbwtParams& eh) {
		init(eh._len, eh._lineRate, eh._offRate,
		     eh._ftabChars, eh._color, eh._entireReverse);
	}

	void init(
		TIndexOffU len,
		int32_t lineRate,
		int32_t offRate,
		int32_t ftabChars,
		bool color,
		bool entireReverse)
		{
			_color = color;
			_entireReverse = entireReverse;
			_len = len;
			_bwtLen = _len + 1;
			_sz = (len+3)/4;
			_bwtSz = (len/4 + 1);
			_lineRate = lineRate;
			_origOffRate = offRate;
			_offRate = offRate;
			_offMask = OFF_MASK << _offRate;
			_ftabChars = ftabChars;
			_eftabLen = _ftabChars*2;
			_eftabSz = _eftabLen*OFF_SIZE;
			_ftabLen = (1 << (_ftabChars*2))+1;
			_ftabSz = _ftabLen*OFF_SIZE;
			_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
			_offsSz = (uint64_t)_offsLen*OFF_SIZE;
			_lineSz = 1 << _lineRate;
			_sideSz = _lineSz * 1 /* lines per side */;
			_sideBwtSz = _sideSz - OFF_SIZE*4;
			_sideBwtLen = _sideBwtSz*4;
			_numSides = (_bwtSz+(_sideBwtSz)-1)/(_sideBwtSz);
			_numLines = _numSides * 1 /* lines per side */;
			_ebwtTotLen = _numSides * _sideSz;
			_ebwtTotSz = _ebwtTotLen;
			assert(repOk());
		}

	TIndexOffU len() const           { return _len; }
	TIndexOffU lenNucs() const       { return _len + (_color ? 1 : 0); }
	TIndexOffU bwtLen() const        { return _bwtLen; }
	TIndexOffU sz() const            { return _sz; }
	TIndexOffU bwtSz() const         { return _bwtSz; }
	int32_t   lineRate() const      { return _lineRate; }
	int32_t   origOffRate() const   { return _origOffRate; }
	int32_t   offRate() const       { return _offRate; }
	TIndexOffU offMask() const       { return _offMask; }
	int32_t   ftabChars() const     { return _ftabChars; }
	int32_t eftabLen() const      { return _eftabLen; }
	int32_t eftabSz() const       { return _eftabSz; }
	TIndexOffU ftabLen() const       { return _ftabLen; }
	TIndexOffU ftabSz() const        { return _ftabSz; }
	TIndexOffU offsLen() const       { return _offsLen; }
	uint64_t offsSz() const        { return _offsSz; }
	int32_t lineSz() const        { return _lineSz; }
	int32_t sideSz() const        { return _sideSz; }
	int32_t sideBwtSz() const     { return _sideBwtSz; }
	int32_t sideBwtLen() const    { return _sideBwtLen; }
	TIndexOffU numSides() const      { return _numSides; }
	TIndexOffU numLines() const      { return _numLines; }
	TIndexOffU ebwtTotLen() const    { return _ebwtTotLen; }
	TIndexOffU ebwtTotSz() const     { return _ebwtTotSz; }
	bool color() const             { return _color; }
	bool entireReverse() const     { return _entireReverse; }

	/**
	 * Set a new suffix-array sampling rate, which involves updating
	 * rate, mask, sample length, and sample size.
	 */
	void setOffRate(int __offRate) {
		_offRate = __offRate;
		_offMask = OFF_MASK << _offRate;
		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
		_offsSz = (uint64_t)_offsLen * OFF_SIZE;
	}

#ifndef NDEBUG
	/// Check that this EbwtParams is internally consistent
	bool repOk() const {
		assert_gt(_len, 0);
		assert_gt(_lineRate, 3);
		assert_geq(_offRate, 0);
		assert_leq(_ftabChars, 16);
		assert_geq(_ftabChars, 1);
		assert_lt(_lineRate, 32);
		assert_lt(_ftabChars, 32);
		assert_eq(0, _ebwtTotSz % _lineSz);
		return true;
	}
#endif

	/**
	 * Pretty-print the header contents to the given output stream.
	 */
	void print(ostream& out) const {
		out << "Headers:" << endl
		    << "    len: "          << _len << endl
		    << "    bwtLen: "       << _bwtLen << endl
		    << "    sz: "           << _sz << endl
		    << "    bwtSz: "        << _bwtSz << endl
		    << "    lineRate: "     << _lineRate << endl
		    << "    offRate: "      << _offRate << endl
		    << "    offMask: 0x"    << hex << _offMask << dec << endl
		    << "    ftabChars: "    << _ftabChars << endl
		    << "    eftabLen: "     << _eftabLen << endl
		    << "    eftabSz: "      << _eftabSz << endl
		    << "    ftabLen: "      << _ftabLen << endl
		    << "    ftabSz: "       << _ftabSz << endl
		    << "    offsLen: "      << _offsLen << endl
		    << "    offsSz: "       << _offsSz << endl
		    << "    lineSz: "       << _lineSz << endl
		    << "    sideSz: "       << _sideSz << endl
		    << "    sideBwtSz: "    << _sideBwtSz << endl
		    << "    sideBwtLen: "   << _sideBwtLen << endl
		    << "    numSides: "     << _numSides << endl
		    << "    numLines: "     << _numLines << endl
		    << "    ebwtTotLen: "   << _ebwtTotLen << endl
		    << "    ebwtTotSz: "    << _ebwtTotSz << endl
		    << "    color: "        << _color << endl
		    << "    reverse: "      << _entireReverse << endl;
	}

	TIndexOffU _len;
	TIndexOffU _bwtLen;
	TIndexOffU _sz;
	TIndexOffU _bwtSz;
	int32_t  _lineRate;
	int32_t  _origOffRate;
	int32_t  _offRate;
	TIndexOffU _offMask;
	int32_t  _ftabChars;
	uint32_t _eftabLen;
	uint32_t _eftabSz;
	TIndexOffU _ftabLen;
	TIndexOffU _ftabSz;
	TIndexOffU _offsLen;
	uint64_t _offsSz;
	uint32_t _lineSz;
	uint32_t _sideSz;
	uint32_t _sideBwtSz;
	uint32_t _sideBwtLen;
	TIndexOffU _numSides;
	TIndexOffU _numLines;
	TIndexOffU _ebwtTotLen;
	TIndexOffU _ebwtTotSz;
	bool     _color;
	bool     _entireReverse;
};

/**
 * Exception to throw when a file-realted error occurs.
 */
class EbwtFileOpenException : public std::runtime_error {
public:
	EbwtFileOpenException(const std::string& msg = "") :
		std::runtime_error(msg) { }
};

/**
 * Calculate size of file with given name.
 */
static inline int64_t fileSize(const char* name) {
	std::ifstream f;
	f.open(name, std::ios_base::binary | std::ios_base::in);
	if (!f.good() || f.eof() || !f.is_open()) { return 0; }
	f.seekg(0, std::ios_base::beg);
	std::ifstream::pos_type begin_pos = f.tellg();
	f.seekg(0, std::ios_base::end);
	return static_cast<int64_t>(f.tellg() - begin_pos);
}

/**
 * Encapsulates a location in the bwt text in terms of the side it
 * occurs in and its offset within the side.
 */
struct SideLocus {
	SideLocus() :
		_sideByteOff(0),
		_sideNum(0),
		_charOff(0),
		_by(-1),
		_bp(-1) { }

	/**
	 * Construct from row and other relevant information about the Ebwt.
	 */
	SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
		initFromRow(row, ep, ebwt);
	}

	/**
	 * Init two SideLocus objects from a top/bot pair, using the result
	 * from one call to initFromRow to possibly avoid a second call.
	 */
	static void initFromTopBot(
		TIndexOffU top,
		TIndexOffU bot,
		const EbwtParams& ep,
		const uint8_t* ebwt,
		SideLocus& ltop,
		SideLocus& lbot,
		bool prefetch=true)
		{
			const TIndexOffU sideBwtLen = ep._sideBwtLen;
			assert_gt(bot, top);
			ltop.initFromRow(top, ep, ebwt, prefetch);
			TIndexOffU spread = bot - top;
			// Many cache misses on the following lines
			const TIndexOffU charOffSum = ltop._charOff + spread;
			if(charOffSum < sideBwtLen) {
				const uint32_t bcharOff = (uint32_t) charOffSum;
				lbot._charOff = bcharOff;
				lbot._sideNum = ltop._sideNum;
				lbot._sideByteOff = ltop._sideByteOff;
				lbot._by = bcharOff >> 2;
				assert_lt(lbot._by, (int)ep._sideBwtSz);
				lbot._bp = bcharOff & 3;
				// no need for any prefetch, same _sideByteOff
			} else {
				lbot.initFromRow(bot, ep, ebwt, prefetch);
			}
		}

	static void prefetchFromTopBot(
		TIndexOffU top,
		TIndexOffU bot,
		const EbwtParams& ep,
		const uint8_t* ebwt)
		{
			prefetchFromRow(top, ep, ebwt);
			// not trying to be smart... prefetches are cheap
			prefetchFromRow(bot, ep, ebwt);
		}

	/**
	 * Calculate SideLocus based on a row and other relevant
	 * information about the shape of the Ebwt.
	 */
	void initFromRow(TIndexOffU row,
			const EbwtParams& ep,
			const uint8_t* ebwt,
			bool prefetch=true) {
		const int32_t sideSz     = ep._sideSz;
		// Side length is hard-coded for now; this allows the compiler
		// to do clever things to accelerate / and %.
		const TIndexOffU sideNum  = row / (48*OFF_SIZE);
		assert_lt(sideNum, ep._numSides);
		const int32_t charOff     = row % (48*OFF_SIZE);
		_sideNum                  = sideNum;
		_charOff                  = charOff;
		const TIndexOffU sByteOff = sideNum * sideSz;
		if (prefetch) {
			__builtin_prefetch(ebwt + sByteOff);
			__builtin_prefetch(ebwt + sByteOff + 64); //64 byte cache lines
#if (OFF_SIZE>4)
			__builtin_prefetch(ebwt + sByteOff + 2*64);
#endif
                }

		_sideByteOff              = sByteOff;
		assert_leq(row, ep._len);
		assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
		// Tons of cache misses on the next line
		_by = charOff >> 2; // byte within side
		assert_lt(_by, (int)ep._sideBwtSz);
		_bp = charOff & 3;  // bit-pair within byte
	}

	/**
 	 * Prefetch cache lines used by side(row). 
 	 */ 
	static void prefetchFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
		const int32_t sideSz         = ep._sideSz;
		// Side length is hard-coded for now; this allows the compiler
		// to do clever things to accelerate / and %.
		const TIndexOffU sideNum     = row / (48*OFF_SIZE);
		const TIndexOffU sideByteOff = sideNum * sideSz;
		__builtin_prefetch(ebwt + sideByteOff);
		__builtin_prefetch(ebwt + sideByteOff + 64); //64 byte cache lines
#if (OFF_SIZE>4)
		__builtin_prefetch(ebwt + sideByteOff + 2*64);
#endif
	}

	void prefetch(const uint8_t* ebwt) const {
                __builtin_prefetch(ebwt + _sideByteOff);
                __builtin_prefetch(ebwt + _sideByteOff + 64); //64 byte cache lines
#if (OFF_SIZE>4)
                __builtin_prefetch(ebwt + _sideByteOff + 2*64);
#endif
	}

	/**
	 * Transform this SideLocus to refer to the next side (i.e. the one
	 * corresponding to the next side downstream).  Set all cursors to
	 * point to the beginning of the side.
	 */
	void nextSide(const EbwtParams& ep) {
		assert(valid());
		_sideByteOff += ep.sideSz();
		_sideNum++;
		_by = _bp = _charOff = 0;
		assert(valid());
	}

	/**
	 * Return true iff this is an initialized SideLocus
	 */
	bool valid() const {
		if(_bp != -1) {
			return true;
		}
		return false;
	}

	/**
	 * Convert locus to BW row it corresponds to.
	 */
	TIndexOffU toBWRow() const {
		return _sideNum * 48*OFF_SIZE + _charOff;
	}

#ifndef NDEBUG
	/**
	 * Check that SideLocus is internally consistent and consistent
	 * with the (provided) EbwtParams.
	 */
	bool repOk(const EbwtParams& ep) const {
		ASSERT_ONLY(TIndexOffU row = _sideNum * 48*OFF_SIZE + _charOff);
		assert_leq(row, ep._len);
		assert_range(-1, 3, _bp);
		assert_range(0, (int)ep._sideBwtSz, _by);
		return true;
	}
#endif

	/// Make this look like an invalid SideLocus
	void invalidate() {
		_bp = -1;
	}

	/**
	 * Return a read-only pointer to the beginning of the top side.
	 */
	const uint8_t *side(const uint8_t* ebwt) const {
		return ebwt + _sideByteOff;
	}

	TIndexOffU _sideByteOff; // offset of top side within ebwt[]
	TIndexOffU _sideNum;     // index of side
	uint32_t _charOff;      // character offset within side
	int32_t _by;            // byte within side (not adjusted for bw sides)
	int32_t _bp;            // bitpair within byte (not adjusted for bw sides)
};
#ifdef POPCNT_CAPABILITY   // wrapping of "struct"
struct USE_POPCNT_GENERIC {
#endif
// Use this standard bit-bashing population count
	inline static int pop64(uint64_t x) {
		// Lots of cache misses on following lines (>10K)
		x = x - ((x >> 1) & 0x5555555555555555llu);
		x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
		x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
		x = x + (x >> 8);
		x = x + (x >> 16);
		x = x + (x >> 32);
		return (int)(x & 0x3Fllu);
	}
#ifdef POPCNT_CAPABILITY  // wrapping a "struct"
};
#endif

#ifdef POPCNT_CAPABILITY
struct USE_POPCNT_INSTRUCTION {
	inline static int pop64(uint64_t x) {
		return __builtin_popcountll(x);
	}
};
#endif

/**
 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
 * within a 64-bit argument.
 */
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static int countInU64(int c, uint64_t dw) {
	uint64_t c0 = c_table[c];
	uint64_t x0 = dw ^ c0;
	uint64_t x1 = (x0 >> 1);
	uint64_t x2 = x1 & (0x5555555555555555);
	uint64_t x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
	uint64_t tmp = Operation().pop64(x3);
#else
	uint64_t tmp = pop64(x3);
#endif
	return (int) tmp;
}

// Forward declarations for Ebwt class
class EbwtSearchParams;

/**
 * Extended Burrows-Wheeler transform data.
 *
 * An Ebwt may be transferred to and from RAM with calls to
 * evictFromMemory() and loadIntoMemory().  By default, a newly-created
 * Ebwt is not loaded into memory; if the user would like to use a
 * newly-created Ebwt to answer queries, they must first call
 * loadIntoMemory().
 */
class Ebwt {
public:
#define Ebwt_INITS					\
	_switchEndian(false),				\
		_overrideOffRate(overrideOffRate),	\
		_verbose(verbose),			\
		_passMemExc(passMemExc),		\
		_sanity(sanityCheck),			\
		fw_(fw),				\
		_in1(NULL),				\
		_in2(NULL),				\
		_zOff(OFF_MASK),			\
		_zEbwtByteOff(OFF_MASK),		\
		_zEbwtBpOff(-1),			\
		_nPat(0),				\
		_nFrag(0),				\
		_plen(EBWT_CAT),			\
		_rstarts(EBWT_CAT),			\
		_fchr(EBWT_CAT),			\
		_ftab(EBWT_CAT),			\
		_eftab(EBWT_CAT),			\
		_offs(EBWT_CAT),			\
		_ebwt(EBWT_CAT),			\
		_useMm(false),				\
		useShmem_(false),			\
		_refnames(EBWT_CAT),			\
		mmFile1_(NULL),				\
		mmFile2_(NULL)

	/// Construct an Ebwt from the given input file
	Ebwt(const string& in,
	     int color,
	     int needEntireReverse,
	     bool fw,
	     int32_t overrideOffRate, // = -1,
	     int32_t offRatePlus, // = -1,
	     bool useMm, // = false,
	     bool useShmem, // = false,
	     bool mmSweep, // = false,
	     bool loadNames, // = false,
	     bool loadSASamp, // = true,
	     bool loadFtab, // = true,
	     bool loadRstarts, // = true,
	     bool verbose, // = false,
	     bool startVerbose, // = false,
	     bool passMemExc, // = false,
	     bool sanityCheck) : // = false) :
		Ebwt_INITS
	{
		assert(!useMm || !useShmem);
#ifdef POPCNT_CAPABILITY
		ProcessorSupport ps;
		_usePOPCNTinstruction = ps.POPCNTenabled();
#endif
		packed_ = false;
		_useMm = useMm;
		useShmem_ = useShmem;
		_in1Str = in + ".1." + gEbwt_ext;
		_in2Str = in + ".2." + gEbwt_ext;
		readIntoMemory(
			color,       // expect index to be colorspace?
			fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
			loadSASamp,  // load the SA sample portion?
			loadFtab,    // load the ftab & eftab?
			loadRstarts, // load the rstarts array?
			true,        // stop after loading the header portion?
			&_eh,        // params
			mmSweep,     // mmSweep
			loadNames,   // loadNames
			startVerbose); // startVerbose
		// If the offRate has been overridden, reflect that in the
		// _eh._offRate field
		if(offRatePlus > 0 && _overrideOffRate == -1) {
			_overrideOffRate = _eh._offRate + offRatePlus;
		}
		if(_overrideOffRate > _eh._offRate) {
			_eh.setOffRate(_overrideOffRate);
			assert_eq(_overrideOffRate, _eh._offRate);
		}
		assert(repOk());
	}

	/// Construct an Ebwt from the given header parameters and string
	/// vector, optionally using a blockwise suffix sorter with the
	/// given 'bmax' and 'dcv' parameters.  The string vector is
	/// ultimately joined and the joined string is passed to buildToDisk().
	template<typename TStr>
	Ebwt(
		TStr exampleStr,
		bool packed,
		int color,
		int needEntireReverse,
		int32_t lineRate,
		int32_t offRate,
		int32_t ftabChars,
		int nthreads,
		const string& file,   // base filename for EBWT files
		bool fw,
		bool useBlockwise,
		TIndexOffU bmax,
		TIndexOffU bmaxSqrtMult,
		TIndexOffU bmaxDivN,
		int dcv,
		EList<FileBuf*>& is,
		EList<RefRecord>& szs,
		TIndexOffU sztot,
		const RefReadInParams& refparams,
		uint32_t seed,
		int32_t overrideOffRate = -1,
		bool doSaFile = false,
		bool doBwtFile = false,
		bool verbose = false,
		bool passMemExc = false,
		bool sanityCheck = false) :
		Ebwt_INITS,
	_eh(
		joinedLen(szs),
		lineRate,
		offRate,
		ftabChars,
		color,
		refparams.reverse == REF_READ_REVERSE)
	{
#ifdef POPCNT_CAPABILITY
		ProcessorSupport ps;
		_usePOPCNTinstruction = ps.POPCNTenabled();
#endif
		_in1Str = file + ".1." + gEbwt_ext + ".tmp";
		_in2Str = file + ".2." + gEbwt_ext + ".tmp";
		packed_ = packed;
		// Open output files
		ofstream fout1(_in1Str.c_str(), ios::binary);
		if(!fout1.good()) {
			cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl
			     << "Please make sure the directory exists and that permissions allow writing by" << endl
			     << "Bowtie." << endl;
			throw 1;
		}
		ofstream fout2(_in2Str.c_str(), ios::binary);
		if(!fout2.good()) {
			cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl
			     << "Please make sure the directory exists and that permissions allow writing by" << endl
			     << "Bowtie." << endl;
			throw 1;
		}
		_inSaStr = file + ".sa";
		_inBwtStr = file + ".bwt";
		ofstream *saOut = NULL, *bwtOut = NULL;
		if(doSaFile) {
			saOut = new ofstream(_inSaStr.c_str(), ios::binary);
			if(!saOut->good()) {
				cerr << "Could not open suffix-array file for writing: \"" << _inSaStr.c_str() << "\"" << endl
				     << "Please make sure the directory exists and that permissions allow writing by" << endl
				     << "Bowtie." << endl;
				throw 1;
			}
		}
		if(doBwtFile) {
			bwtOut = new ofstream(_inBwtStr.c_str(), ios::binary);
			if(!bwtOut->good()) {
				cerr << "Could not open suffix-array file for writing: \"" << _inBwtStr.c_str() << "\"" << endl
				     << "Please make sure the directory exists and that permissions allow writing by" << endl
				     << "Bowtie." << endl;
				throw 1;
			}
		}
		// Build SA(T) and BWT(T) block by block
		initFromVector<TStr>(
			is,
			szs,
			sztot,
			refparams,
			fout1,
			fout2,
			file,
			saOut,
			bwtOut,
			nthreads,
			useBlockwise,
			bmax,
			bmaxSqrtMult,
			bmaxDivN,
			dcv,
			seed,
			verbose);
		// Close output files
		fout1.flush();

		int64_t tellpSz1 = (int64_t)fout1.tellp();
		VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str.c_str());
		fout1.close();
		bool err = false;
		if(tellpSz1 > fileSize(_in1Str.c_str())) {
			err = true;
			cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1
			     << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
		}
		fout2.flush();

		int64_t tellpSz2 = (int64_t)fout2.tellp();
		VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str.c_str());
		fout2.close();
		if(tellpSz2 > fileSize(_in2Str.c_str())) {
			err = true;
			cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
			     << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
		}

		if(saOut != NULL) {
			// Check on suffix array output file size
			int64_t tellpSzSa = (int64_t)saOut->tellp();
			VMSG_NL("Wrote " << tellpSzSa << " bytes to suffix-array file: " << _inSaStr.c_str());
			saOut->close();
			delete saOut;
			if(tellpSzSa > fileSize(_inSaStr.c_str())) {
				err = true;
				cerr << "Index is corrupt: File size for " << _inSaStr.c_str() << " should have been " << tellpSzSa
				     << " but is actually " << fileSize(_inSaStr.c_str()) << "." << endl;
			}
		}

		if(bwtOut != NULL) {
			// Check on suffix array output file size
			int64_t tellpSzBwt = (int64_t)bwtOut->tellp();
			VMSG_NL("Wrote " << tellpSzBwt << " bytes to BWT file: " << _inBwtStr.c_str());
			bwtOut->close();
			delete bwtOut;
			if(tellpSzBwt > fileSize(_inBwtStr.c_str())) {
				err = true;
				cerr << "Index is corrupt: File size for " << _inBwtStr.c_str() << " should have been " << tellpSzBwt
				     << " but is actually " << fileSize(_inBwtStr.c_str()) << "." << endl;
			}
		}

		if(err) {
			cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
			throw 1;
		}

		// Reopen as input streams
		VMSG_NL("Re-opening _in1 and _in2 as input streams");
		if(_sanity) {
			VMSG_NL("Sanity-checking Bt2");
			assert(!isInMemory());
			readIntoMemory(
				color,                       // colorspace?
				fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
				true,                        // load SA sample (_offs[])?
				true,                        // load ftab (_ftab[] & _eftab[])?
				true,                        // load r-starts (_rstarts[])?
				false,                       // just load header?
				NULL,                        // Params object to fill
				false,                       // mm sweep?
				true,                        // load names?
				false);                      // verbose startup?
			sanityCheckAll(refparams.reverse);
			evictFromMemory();
			assert(!isInMemory());
		}
		VMSG_NL("Returning from Ebwt constructor");
	}

	/**
	 * Static constructor for a pair of forward/reverse indexes for the
	 * given reference string.
	 */
	template<typename TStr>
	static pair<Ebwt*, Ebwt*>
	fromString(
		const char* str,
		bool packed,
		int color,
		int reverse,
		bool bigEndian,
		int32_t lineRate,
		int32_t offRate,
		int32_t ftabChars,
		const string& file,
		bool useBlockwise,
		TIndexOffU bmax,
		TIndexOffU bmaxSqrtMult,
		TIndexOffU bmaxDivN,
		int dcv,
		uint32_t seed,
		bool verbose,
		bool autoMem,
		bool sanity)
		{
			EList<std::string> strs(EBWT_CAT);
			strs.push_back(std::string(str));
			return fromStrings<TStr>(
				strs,
				packed,
				color,
				reverse,
				bigEndian,
				lineRate,
				offRate,
				ftabChars,
				file,
				useBlockwise,
				bmax,
				bmaxSqrtMult,
				bmaxDivN,
				dcv,
				seed,
				verbose,
				autoMem,
				sanity);
		}

	/**
	 * Static constructor for a pair of forward/reverse indexes for the
	 * given list of reference strings.
	 */
	template<typename TStr>
	static pair<Ebwt*, Ebwt*>
	fromStrings(
		const EList<std::string>& strs,
		bool packed,
		int color,
		int reverse,
		bool bigEndian,
		int32_t lineRate,
		int32_t offRate,
		int32_t ftabChars,
		const string& file,
		bool useBlockwise,
		TIndexOffU bmax,
		TIndexOffU bmaxSqrtMult,
		TIndexOffU bmaxDivN,
		int dcv,
		uint32_t seed,
		bool verbose,
		bool autoMem,
		bool sanity)
		{
			assert(!strs.empty());
			EList<FileBuf*> is(EBWT_CAT);
			RefReadInParams refparams(color, REF_READ_FORWARD, false, false);
			// Adapt sequence strings to stringstreams open for input
			unique_ptr<stringstream> ss(new stringstream());
			for(TIndexOffU i = 0; i < strs.size(); i++) {
				(*ss) << ">" << i << endl << strs[i] << endl;
			}
			unique_ptr<FileBuf> fb(new FileBuf(ss.get()));
			assert(!fb->eof());
			assert(fb->get() == '>');
			ASSERT_ONLY(fb->reset());
			assert(!fb->eof());
			is.push_back(fb.get());
			// Vector for the ordered list of "records" comprising the input
			// sequences.  A record represents a stretch of unambiguous
			// characters in one of the input sequences.
			EList<RefRecord> szs(EBWT_CAT);
			std::pair<TIndexOffU, TIndexOffU> sztot;
			sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
			// Construct Ebwt from input strings and parameters
			Ebwt *ebwtFw = new Ebwt(
				TStr(),
				packed,
				refparams.color ? 1 : 0,
				-1,           // fw
				lineRate,
				offRate,      // suffix-array sampling rate
				ftabChars,    // number of chars in initial arrow-pair calc
				file,         // basename for .?.ebwt files
				true,         // fw?
				useBlockwise, // useBlockwise
				bmax,         // block size for blockwise SA builder
				bmaxSqrtMult, // block size as multiplier of sqrt(len)
				bmaxDivN,     // block size as divisor of len
				dcv,          // difference-cover period
				is,           // list of input streams
				szs,          // list of reference sizes
				sztot.first,  // total size of all unambiguous ref chars
				refparams,    // reference read-in parameters
				seed,         // pseudo-random number generator seed
				-1,           // override offRate
				verbose,      // be talkative
				autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
				sanity);      // verify results and internal consistency
			refparams.reverse = reverse;
			szs.clear();
			sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
			// Construct Ebwt from input strings and parameters
			Ebwt *ebwtBw = new Ebwt(
				TStr(),
				packed,
				refparams.color ? 1 : 0,
				reverse == REF_READ_REVERSE,
				lineRate,
				offRate,      // suffix-array sampling rate
				ftabChars,    // number of chars in initial arrow-pair calc
				file + ".rev",// basename for .?.ebwt files
				false,        // fw?
				useBlockwise, // useBlockwise
				bmax,         // block size for blockwise SA builder
				bmaxSqrtMult, // block size as multiplier of sqrt(len)
				bmaxDivN,     // block size as divisor of len
				dcv,          // difference-cover period
				is,           // list of input streams
				szs,          // list of reference sizes
				sztot.first,  // total size of all unambiguous ref chars
				refparams,    // reference read-in parameters
				seed,         // pseudo-random number generator seed
				-1,           // override offRate
				verbose,      // be talkative
				autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
				sanity);      // verify results and internal consistency
			return make_pair(ebwtFw, ebwtBw);
		}

	/// Return true iff the Ebwt is packed
	bool isPacked() { return packed_; }

	/**
	 * Write the rstarts array given the szs array for the reference.
	 */
	void szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse);

	/**
	 * Helper for the constructors above.  Takes a vector of text
	 * strings and joins them into a single string with a call to
	 * joinToDisk, which does a join (with padding) and writes some of
	 * the resulting data directly to disk rather than keep it in
	 * memory.  It then constructs a suffix-array producer (what kind
	 * depends on 'useBlockwise') for the resulting sequence.  The
	 * suffix-array producer can then be used to obtain chunks of the
	 * joined string's suffix array.
	 */
	template <typename TStr>
	void initFromVector(EList<FileBuf*>& is,
	                    EList<RefRecord>& szs,
	                    TIndexOffU sztot,
	                    const RefReadInParams& refparams,
	                    ofstream& out1,
			    ofstream& out2,
                            const string& outfile,
	                    ofstream* saOut,
	                    ofstream* bwtOut,
                            int nthreads,
	                    bool useBlockwise,
	                    TIndexOffU bmax,
	                    TIndexOffU bmaxSqrtMult,
	                    TIndexOffU bmaxDivN,
	                    int dcv,
	                    uint32_t seed,
	                    bool verbose)
		{
			// Compose text strings into single string
			VMSG_NL("Calculating joined length");
			TStr s; // holds the entire joined reference after call to joinToDisk
			TIndexOffU jlen;
			jlen = joinedLen(szs);
			assert_geq(jlen, sztot);
			VMSG_NL("Writing header");
			writeFromMemory(true, out1, out2);
			try {
				VMSG_NL("Reserving space for joined string");
				s.resize(jlen);
				VMSG_NL("Joining reference sequences");
				if(refparams.reverse == REF_READ_REVERSE) {
					{
						Timer timer(cout, "  Time to join reference sequences: ", _verbose);
						joinToDisk(is, szs, sztot, refparams, s, out1, out2);
					}
					{
						Timer timer(cout, "  Time to reverse reference sequence: ", _verbose);
						EList<RefRecord> tmp(EBWT_CAT);
						s.reverse();
						reverseRefRecords(szs, tmp, false, verbose);
						szsToDisk(tmp, out1, refparams.reverse);
					}
				} else {
					Timer timer(cout, "  Time to join reference sequences: ", _verbose);
					joinToDisk(is, szs, sztot, refparams, s, out1, out2);
					szsToDisk(szs, out1, refparams.reverse);
				}
				// Joined reference sequence now in 's'
			} catch(bad_alloc& e) {
				// If we throw an allocation exception in the try block,
				// that means that the joined version of the reference
				// string itself is too larger to fit in memory.  The only
				// alternatives are to tell the user to give us more memory
				// or to try again with a packed representation of the
				// reference (if we haven't tried that already).
				cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
				if(!isPacked() && _passMemExc) {
					// Pass the exception up so that we can retry using a
					// packed string representation
					throw e;
				}
				// There's no point passing this exception on.  The fact
				// that we couldn't allocate the joined string means that
				// --bmax is irrelevant - the user should re-run with
				// ebwt-build-packed
				if(isPacked()) {
					cerr << "Please try running bowtie-build on a computer with more memory." << endl;
				} else {
					cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
					     << "mode (-a/--auto), or try again on a computer with more memory." << endl;
				}
				if(sizeof(void*) == 4) {
					cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
					     << "this executable is 32-bit." << endl;
				}
				throw 1;
			}
			// Succesfully obtained joined reference string
			assert_geq(s.length(), jlen);
			if(bmax != OFF_MASK) {
				VMSG_NL("bmax according to bmax setting: " << bmax);
			}
			else if(bmaxSqrtMult != OFF_MASK) {
				bmax *= bmaxSqrtMult;
				VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
			}
			else if(bmaxDivN != OFF_MASK) {
				bmax = max<TIndexOffU>(jlen / bmaxDivN, 1);
				VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
			}
			else {
				bmax = (TIndexOffU)sqrt(s.length());
				VMSG_NL("bmax defaulted to: " << bmax);
			}
#ifndef USE_SAIS
			int iter = 0;
			bool first = true;
			streampos out1pos = out1.tellp();
			streampos out2pos = out2.tellp();
#endif

			// Look for bmax/dcv parameters that work.
			thread_pool pool(nthreads - 1);
			while(true) {
#ifndef USE_SAIS
				if(!first && bmax < 40 && _passMemExc) {
					cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
					if(!isPacked()) {
						// Throw an exception exception so that we can
						// retry using a packed string representation
						throw bad_alloc();
					} else {
						cerr << "Already tried a packed string representation." << endl;
					}
					cerr << "Please try indexing this reference on a computer with more memory." << endl;
					if(sizeof(void*) == 4) {
						cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
						     << "this executable is 32-bit." << endl;
					}
					throw 1;
				}
				if(!first) {
					out1.seekp(out1pos);
					out2.seekp(out2pos);
				}
				if(dcv > 4096) dcv = 4096;
				if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
					dcv <<= 1; // double difference-cover period
				} else {
					bmax -= (bmax >> 2); // reduce by 25%
				}
				VMSG("Using parameters --bmax " << bmax);
				if(dcv == 0) {
					VMSG_NL(" and *no difference cover*");
				} else {
					VMSG_NL(" --dcv " << dcv);
				}
				iter++;
#endif
				try {
#ifndef USE_SAIS
					{
						VMSG_NL("  Doing ahead-of-time memory usage test");
						// Make a quick-and-dirty attempt to force a bad_alloc iff
						// we would have thrown one eventually as part of
						// constructing the DifferenceCoverSample
						dcv <<= 1;
						TIndexOffU sz = (TIndexOffU)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
						if(nthreads > 1) sz *= (nthreads + 1);
						AutoArray<uint8_t> tmp(sz, EBWT_CAT);
						dcv >>= 1;
						// Likewise with the KarkkainenBlockwiseSA
						sz = (TIndexOffU)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
						AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
						// Now throw in the 'ftab' and 'isaSample' structures
						// that we'll eventually allocate in buildToDisk
						AutoArray<TIndexOffU> ftab(_eh._ftabLen * 2, EBWT_CAT);
						AutoArray<uint8_t> side(_eh._sideSz, EBWT_CAT);
						// Grab another 20 MB out of caution
						AutoArray<uint32_t> extra(20*1024*1024, EBWT_CAT);
						// If we made it here without throwing bad_alloc, then we
						// passed the memory-usage stress test
						VMSG("  Passed!  Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
						if(isPacked()) {
							VMSG(" --packed");
						}
						VMSG_NL("");
					}
#endif
#ifdef USE_SAIS
					VMSG_NL("Using SAIS algorithm for constructing suffix array");
					SAISBlockwiseSA<TStr> bsa(s, bmax, nthreads);
#else
					VMSG_NL("Constructing suffix-array element generator");
					KarkkainenBlockwiseSA<TStr> bsa(s, bmax, nthreads, pool, dcv, seed, _sanity, _passMemExc, _verbose, outfile);
#endif
					assert(bsa.suffixItrIsReset());
					assert_eq(bsa.size(), s.length()+1);
					VMSG_NL("Converting suffix-array elements to index image");
					buildToDisk(bsa, s, out1, out2, saOut, bwtOut);
					out1.flush(); out2.flush();
					bool failed = out1.fail() || out2.fail();
					if(saOut != NULL) {
						saOut->flush();
						failed = failed || saOut->fail();
					}
					if(bwtOut != NULL) {
						bwtOut->flush();
						failed = failed || bwtOut->fail();
					}
					if(failed) {
						cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
						throw 1;
					}
					break;
				} catch(bad_alloc& e) {
#ifdef USE_SAIS
					cerr << "Out of memory while constructing suffix array.  Please try using a smaller" << endl;
					throw 1;
#else
					if(_passMemExc) {
						VMSG_NL("  Ran out of memory; automatically trying more memory-economical parameters.");
					} else {
						cerr << "Out of memory while constructing suffix array.  Please try using a smaller" << endl
						     << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
						throw 1;
					}
#endif
				}
#ifndef USE_SAIS
				first = false;
#endif

			}
			assert(repOk());
			// Now write reference sequence names on the end
			assert_eq(this->_refnames.size(), this->_nPat);
			for(TIndexOffU i = 0; i < this->_refnames.size(); i++) {
				out1 << this->_refnames[i].c_str() << endl;
			}
			out1 << '\0';
			out1.flush(); out2.flush();
			if(out1.fail() || out2.fail()) {
				cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
				throw 1;
			}
			VMSG_NL("Returning from initFromVector");
		}

	/**
	 * Return the length that the joined string of the given string
	 * list will have.  Note that this is indifferent to how the text
	 * fragments correspond to input sequences - it just cares about
	 * the lengths of the fragments.
	 */
	TIndexOffU joinedLen(EList<RefRecord>& szs) {
		TIndexOffU ret = 0;
		for(unsigned int i = 0; i < szs.size(); i++) {
			ret += szs[i].len;
		}
		return ret;
	}

	/// Destruct an Ebwt
	~Ebwt() {
		_fchr.reset();
		_ftab.reset();
		_eftab.reset();
		_plen.reset();
		_rstarts.reset();
		_offs.reset();
		_ebwt.reset();
		if(offs() != NULL && useShmem_) {
			FREE_SHARED(offs());
		}
		if(ebwt() != NULL && useShmem_) {
			FREE_SHARED(ebwt());
		}
		if (_in1 != NULL) fclose(_in1);
		if (_in2 != NULL) fclose(_in2);
	}

	/// Accessors
	inline const EbwtParams& eh() const     { return _eh; }
	TIndexOffU    zOff() const         { return _zOff; }
	TIndexOffU    zEbwtByteOff() const { return _zEbwtByteOff; }
	TIndexOff         zEbwtBpOff() const   { return _zEbwtBpOff; }
	TIndexOffU    nPat() const         { return _nPat; }
	TIndexOffU    nFrag() const        { return _nFrag; }
	inline TIndexOffU*   fchr()              { return _fchr.get(); }
	inline TIndexOffU*   ftab()              { return _ftab.get(); }
	inline TIndexOffU*   eftab()             { return _eftab.get(); }
	inline TIndexOffU*   offs()              { return _offs.get(); }
	inline TIndexOffU*   plen()              { return _plen.get(); }
	inline TIndexOffU*   rstarts()           { return _rstarts.get(); }
	inline uint8_t*    ebwt()              { return _ebwt.get(); }
	inline const TIndexOffU* fchr() const    { return _fchr.get(); }
	inline const TIndexOffU* ftab() const    { return _ftab.get(); }
	inline const TIndexOffU* eftab() const   { return _eftab.get(); }
	inline const TIndexOffU* offs() const    { return _offs.get(); }
	inline const TIndexOffU* plen() const    { return _plen.get(); }
	inline const TIndexOffU* rstarts() const { return _rstarts.get(); }
	inline const uint8_t*  ebwt() const    { return _ebwt.get(); }
	bool        verbose() const      { return _verbose; }
	bool        sanityCheck() const  { return _sanity; }
	EList<string>& refnames()        { return _refnames; }
	bool        fw() const           { return fw_; }
#ifdef POPCNT_CAPABILITY
	bool _usePOPCNTinstruction;
#endif

	/**
	 * Returns true iff the index contains the given string (exactly).  The
	 * given string must contain only unambiguous characters.  TODO:
	 * support skipping of ambiguous characters.
	 */
	bool contains(
		const BTDnaString& str,
		TIndexOffU *top = NULL,
		TIndexOffU *bot = NULL) const;

	/**
	 * Returns true iff the index contains the given string (exactly).  The
	 * given string must contain only unambiguous characters.  TODO:
	 * support skipping of ambiguous characters.
	 */
	bool contains(
		const char *str,
		TIndexOffU *top = NULL,
		TIndexOffU *bot = NULL) const
		{
			return contains(BTDnaString(str, true), top, bot);
		}

	/// Return true iff the Ebwt is currently in memory
	bool isInMemory() const {
		if(ebwt() != NULL) {
			// Note: We might have skipped loading _offs, _ftab,
			// _eftab, and _rstarts depending on whether this is the
			// reverse index and what algorithm is being used.
			assert(_eh.repOk());
			//assert(_ftab != NULL);
			//assert(_eftab != NULL);
			assert(fchr() != NULL);
			//assert(_offs != NULL);
			//assert(_rstarts != NULL);
			assert_neq(_zEbwtByteOff, OFF_MASK);
			assert_neq(_zEbwtBpOff, -1);
			return true;
		} else {
			assert(ftab() == NULL);
			assert(eftab() == NULL);
			assert(fchr() == NULL);
			assert(offs() == NULL);
			assert(rstarts() == NULL);
			assert_eq(_zEbwtByteOff, OFF_MASK);
			assert_eq(_zEbwtBpOff, -1);
			return false;
		}
	}

	/// Return true iff the Ebwt is currently stored on disk
	bool isEvicted() const {
		return !isInMemory();
	}

	/**
	 * Load this Ebwt into memory by reading it in from the _in1 and
	 * _in2 streams.
	 */
	void loadIntoMemory(
		int color,
		int needEntireReverse,
		bool loadSASamp,
		bool loadFtab,
		bool loadRstarts,
		bool loadNames,
		bool verbose)
		{
			readIntoMemory(
				color,       // expect index to be colorspace?
				needEntireReverse, // require reverse index to be concatenated reference reversed
				loadSASamp,  // load the SA sample portion?
				loadFtab,    // load the ftab (_ftab[] and _eftab[])?
				loadRstarts, // load the r-starts (_rstarts[])?
				false,       // stop after loading the header portion?
				NULL,        // params
				false,       // mmSweep
				loadNames,   // loadNames
				verbose);    // startVerbose
		}

	/**
	 * Frees memory associated with the Ebwt.
	 */
	void evictFromMemory() {
		assert(isInMemory());
		_fchr.free();
		_ftab.free();
		_eftab.free();
		_rstarts.free();
		_offs.free(); // might not be under control of APtrWrap
		_ebwt.free(); // might not be under control of APtrWrap
		// Keep plen; it's small and the client may want to seq it
		// even when the others are evicted.
		//_plen  = NULL;
		_zEbwtByteOff = OFF_MASK;
		_zEbwtBpOff = -1;
	}

	/**
	 * Turn a substring of 'seq' starting at offset 'off' and having
	 * length equal to the index's 'ftabChars' into an int that can be
	 * used to index into the ftab array.
	 */
	TIndexOffU ftabSeqToInt(
		const BTDnaString& seq,
		size_t off,
		bool rev) const
		{
			int fc = _eh._ftabChars;
			size_t lo = off, hi = lo + fc;
			assert_leq(hi, seq.length());
			TIndexOffU ftabOff = 0;
			for(int i = 0; i < fc; i++) {
				bool fwex = fw();
				if(rev) fwex = !fwex;
				// We add characters to the ftabOff in the order they would
				// have been consumed in a normal search.  For BWT, this
				// means right-to-left order; for BWT' it's left-to-right.
				int c = (fwex ? seq[lo + i] : seq[hi - i - 1]);
				if(c > 3) {
					return std::numeric_limits<TIndexOffU>::max();
				}
				assert_range(0, 3, c);
				ftabOff <<= 2;
				ftabOff |= c;
			}
			return ftabOff;
		}

	/**
	 * Non-static facade for static function ftabHi.
	 */
	TIndexOffU ftabHi(TIndexOffU i) const {
		return Ebwt::ftabHi(
			ftab(),
			eftab(),
			_eh._len,
			_eh._ftabLen,
			_eh._eftabLen,
			i);
	}

	void ftabHiPrefetch(TIndexOffU i) const {
		Ebwt::ftabHiPrefetch(
			ftab(),
			i);
	}

	/**
	 * Get "high interpretation" of ftab entry at index i.  The high
	 * interpretation of a regular ftab entry is just the entry
	 * itself.  The high interpretation of an extended entry is the
	 * second correpsonding ui32 in the eftab.
	 *
	 * It's a static member because it's convenient to ask this
	 * question before the Ebwt is fully initialized.
	 */
	static TIndexOffU ftabHi(
		const TIndexOffU *ftab,
		const TIndexOffU *eftab,
		TIndexOffU len,
		TIndexOffU ftabLen,
		TIndexOffU eftabLen,
		TIndexOffU i)
		{
			assert_lt(i, ftabLen);
			if(ftab[i] <= len) {
				return ftab[i];
			} else {
				TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
				assert_lt(efIdx*2+1, eftabLen);
				return eftab[efIdx*2+1];
			}
		}

	static void ftabHiPrefetch(
		const TIndexOffU *ftab,
		TIndexOffU i)
		{
			__builtin_prefetch(&(ftab[i]));
		}

	/**
	 * Non-static facade for static function ftabLo.
	 */
	TIndexOffU ftabLo(TIndexOffU i) const {
		return Ebwt::ftabLo(
			ftab(),
			eftab(),
			_eh._len,
			_eh._ftabLen,
			_eh._eftabLen,
			i);
	}

	void ftabLoPrefetch(TIndexOffU i) const {
		Ebwt::ftabLoPrefetch(
			ftab(),
			i);
	}

	/**
	 * Get low and high bound of ftab range.
	 */
	void
	ftabLoHi(
		TIndexOffU i,
		TIndexOffU& top,
		TIndexOffU& bot) const
		{
			top = ftabHi(i);
			bot = ftabLo(i+1);
			assert_geq(bot, top);
		}

	void ftabLoHiPrefetch(TIndexOffU i) const
		{
			ftabHiPrefetch(i);
			ftabLoPrefetch(i+1);
		}

	/**
	 * Get low bound of ftab range.
	 */
	TIndexOffU ftabLo(const BTDnaString& seq, size_t off) const {
		return ftabLo(ftabSeqToInt(seq, off, false));
	}

	/**
	 * Get high bound of ftab range.
	 */
	TIndexOffU ftabHi(const BTDnaString& seq, size_t off) const {
		return ftabHi(ftabSeqToInt(seq, off, false));
	}

	/**
	 * Extract characters from seq starting at offset 'off' and going either
	 * forward or backward, depending on 'rev'.  Order matters when compiling
	 * the integer that gets looked up in the ftab.  Each successive character
	 * is ORed into the least significant bit-pair, and characters are
	 * integrated in the direction of the search.
	 */
	bool
	ftabLoHi(
		const BTDnaString& seq, // sequence to extract from
		size_t off,             // offset into seq to begin extracting
		bool rev,               // reverse while extracting
		TIndexOffU& top,
		TIndexOffU& bot) const
		{
			TIndexOffU fi = ftabSeqToInt(seq, off, rev);
			if(fi == std::numeric_limits<TIndexOffU>::max()) {
				return false;
			}
			ftabLoHi(fi, top, bot);
			return true;
		}

	/**
	 * Get "low interpretation" of ftab entry at index i.  The low
	 * interpretation of a regular ftab entry is just the entry
	 * itself.  The low interpretation of an extended entry is the
	 * first correpsonding ui32 in the eftab.
	 *
	 * It's a static member because it's convenient to ask this
	 * question before the Ebwt is fully initialized.
	 */
	static TIndexOffU ftabLo(
		const TIndexOffU *ftab,
		const TIndexOffU *eftab,
		TIndexOffU len,
		TIndexOffU ftabLen,
		TIndexOffU eftabLen,
		TIndexOffU i)
		{
			assert_lt(i, ftabLen);
			if(ftab[i] <= len) {
				return ftab[i];
			} else {
				TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
				assert_lt(efIdx*2+1, eftabLen);
				return eftab[efIdx*2];
			}
		}

	static void ftabLoPrefetch(
		const TIndexOffU *ftab,
		TIndexOffU i)
		{
			__builtin_prefetch(&(ftab[i]));
		}

	/**
	 * Try to resolve the reference offset of the BW element 'elt'.  If
	 * it can be resolved immediately, return the reference offset.  If
	 * it cannot be resolved immediately, return max value.
	 */
	TIndexOffU tryOffset(TIndexOffU elt) const {
		assert(offs() != NULL);
		if(elt == _zOff) return 0;
		if((elt & _eh._offMask) == elt) {
			TIndexOffU eltOff = elt >> _eh._offRate;
			assert_lt(eltOff, _eh._offsLen);
			TIndexOffU off = offs()[eltOff];
			assert_neq(OFF_MASK, off);
			return off;
		} else {
			// Try looking at zoff
			return OFF_MASK;
		}
	}

	/**
	 * Try to resolve the reference offset of the BW element 'elt' such
	 * that the offset returned is at the right-hand side of the
	 * forward reference substring involved in the hit.
	 */
	TIndexOffU tryOffset(
		TIndexOffU elt,
		bool fw,
		TIndexOffU hitlen) const
		{
			TIndexOffU off = tryOffset(elt);
			if(off != OFF_MASK && !fw) {
				assert_lt(off, _eh._len);
				off = _eh._len - off - 1;
				assert_geq(off, hitlen-1);
				off -= (hitlen-1);
				assert_lt(off, _eh._len);
			}
			return off;
		}

	/**
	 * Walk 'steps' steps to the left and return the row arrived at.
	 */
	TIndexOffU walkLeft(TIndexOffU row, TIndexOffU steps) const;

	/**
	 * Resolve the reference offset of the BW element 'elt'.
	 */
	TIndexOffU getOffset(TIndexOffU row) const;

	/**
	 * Resolve the reference offset of the BW element 'elt' such that
	 * the offset returned is at the right-hand side of the forward
	 * reference substring involved in the hit.
	 */
	TIndexOffU getOffset(
		TIndexOffU elt,
		bool fw,
		TIndexOffU hitlen) const;

	/**
	 * When using read() to create an Ebwt, we have to set a couple of
	 * additional fields in the Ebwt object that aren't part of the
	 * parameter list and are not stored explicitly in the file.  Right
	 * now, this just involves initializing _zEbwtByteOff and
	 * _zEbwtBpOff from _zOff.
	 */
	void postReadInit(EbwtParams& eh) {
		TIndexOffU sideNum     = _zOff / eh._sideBwtLen;
		TIndexOffU sideCharOff = _zOff % eh._sideBwtLen;
		TIndexOffU sideByteOff = sideNum * eh._sideSz;
		_zEbwtByteOff = sideCharOff >> 2;
		assert_lt(_zEbwtByteOff, eh._sideBwtSz);
		_zEbwtBpOff = sideCharOff & 3;
		assert_lt(_zEbwtBpOff, 4);
		_zEbwtByteOff += sideByteOff;
		assert(repOk(eh)); // Ebwt should be fully initialized now
	}

	/**
	 * Given basename of an Ebwt index, read and return its flag.
	 */
	static int32_t readFlags(const string& instr);

	/**
	 * Pretty-print the Ebwt to the given output stream.
	 */
	void print(ostream& out) const {
		print(out, _eh);
	}

	/**
	 * Pretty-print the Ebwt and given EbwtParams to the given output
	 * stream.
	 */
	void print(ostream& out, const EbwtParams& eh) const {
		eh.print(out); // print params
		out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl
		    << "    zOff: "         << _zOff << endl
		    << "    zEbwtByteOff: " << _zEbwtByteOff << endl
		    << "    zEbwtBpOff: "   << _zEbwtBpOff << endl
		    << "    nPat: "  << _nPat << endl
		    << "    plen: ";
		if(plen() == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << plen()[0] << endl;
		}
		out << "    rstarts: ";
		if(rstarts() == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << rstarts()[0] << endl;
		}
		out << "    ebwt: ";
		if(ebwt() == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << ebwt()[0] << endl;
		}
		out << "    fchr: ";
		if(fchr() == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << fchr()[0] << endl;
		}
		out << "    ftab: ";
		if(ftab() == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << ftab()[0] << endl;
		}
		out << "    eftab: ";
		if(eftab() == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << eftab()[0] << endl;
		}
		out << "    offs: ";
		if(offs() == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << offs()[0] << endl;
		}
	}

	// Building
	template <typename TStr> static TStr join(EList<TStr>& l, uint32_t seed);
	template <typename TStr> static TStr join(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed);
	template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
	template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2, ostream* saOut, ostream* bwtOut);

	// I/O
	void readIntoMemory(int color, int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
	void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
	void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;

	// Sanity checking
	void sanityCheckUpToSide(TIndexOff upToSide) const;
	void sanityCheckAll(int reverse) const;
	void restore(SString<char>& s) const;
	void checkOrigs(const EList<SString<char> >& os, bool color, bool mirror) const;

	// Searching and reporting
	void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen, bool rejectStraddle, bool& straddled) const;

#define WITHIN_BWT_LEN(x)				\
	assert_leq(x[0], this->_eh._sideBwtLen);	\
	assert_leq(x[1], this->_eh._sideBwtLen);	\
	assert_leq(x[2], this->_eh._sideBwtLen);	\
	assert_leq(x[3], this->_eh._sideBwtLen)

#define WITHIN_FCHR(x)				\
	assert_leq(x[0], this->fchr()[1]);	\
	assert_leq(x[1], this->fchr()[2]);	\
	assert_leq(x[2], this->fchr()[3]);	\
	assert_leq(x[3], this->fchr()[4])

#define WITHIN_FCHR_DOLLARA(x)			\
	assert_leq(x[0], this->fchr()[1]+1);	\
	assert_leq(x[1], this->fchr()[2]);	\
	assert_leq(x[2], this->fchr()[3]);	\
	assert_leq(x[3], this->fchr()[4])

	/**
	 * Count all occurrences of character c from the beginning of the
	 * forward side to <by,bp> and add in the occ[] count up to the side
	 * break just prior to the side.
	 *
	 * A Bowtie 2 side is shaped like:
	 *
	 * XXXXXXXXXXXXXXXX [A] [C] [G] [T]
	 * --------48------ -4- -4- -4- -4-  (numbers in bytes)
	 */
	inline TIndexOffU countBt2Side(const SideLocus& l, int c) const {
		assert_range(0, 3, c);
		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
		assert_range(0, 3, (int)l._bp);
		const uint8_t *side = l.side(this->ebwt());
		TIndexOffU cCnt = countUpTo(l, c);
		assert_leq(cCnt, l.toBWRow());
		assert_leq(cCnt, this->_eh._sideBwtLen);
		if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
			// Adjust for the fact that we represented $ with an 'A', but
			// shouldn't count it as an 'A' here
			if((l._sideByteOff + l._by > _zEbwtByteOff) ||
			   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
			{
				cCnt--; // Adjust for '$' looking like an 'A'
			}
		}
		TIndexOffU ret;
		// Now factor in the occ[] count at the side break
		const uint8_t *acgt8 = side + _eh._sideBwtSz;
		const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt8);
		assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
		assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
		assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
		assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
		ret = endianizeU<TIndexOffU>(acgt[c], _switchEndian) + cCnt + this->fchr()[c];
#ifndef NDEBUG
		assert_leq(ret, this->fchr()[c+1]); // can't have jumpded into next char's section
		if(c == 0) {
			assert_leq(cCnt, this->_eh._sideBwtLen);
		} else {
			assert_leq(ret, this->_eh._bwtLen);
		}
#endif
		return ret;
	}

	/**
	 * Count all occurrences of all four nucleotides up to the starting
	 * point (which must be in a forward side) given by 'l' storing the
	 * result in 'cntsUpto', then count nucleotide occurrences within the
	 * range of length 'num' storing the result in 'cntsIn'.  Also, keep
	 * track of the characters occurring within the range by setting
	 * 'masks' accordingly (masks[1][10] == true -> 11th character is a
	 * 'C', and masks[0][10] == masks[2][10] == masks[3][10] == false.
	 */
	inline void countBt2SideRange(
		SideLocus& l,        // top locus
		TIndexOffU num,        // number of elts in range to tall // @double-check
		TIndexOffU* cntsUpto, // A/C/G/T counts up to top
		TIndexOffU* cntsIn,   // A/C/G/T counts within range
		EList<bool> *masks) const // masks indicating which range elts = A/C/G/T
		{
			assert_gt(num, 0);
			assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
			assert_range(0, 3, (int)l._bp);
			countUpToEx(l, cntsUpto);
			WITHIN_FCHR_DOLLARA(cntsUpto);
			WITHIN_BWT_LEN(cntsUpto);
			const uint8_t *side = l.side(this->ebwt());
			if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
				// Adjust for the fact that we represented $ with an 'A', but
				// shouldn't count it as an 'A' here
				if((l._sideByteOff + l._by > _zEbwtByteOff) ||
				   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
				{
					cntsUpto[0]--; // Adjust for '$' looking like an 'A'
				}
			}
			// Now factor in the occ[] count at the side break
			const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(side + _eh._sideBwtSz);
			assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->fchr()[1] + this->_eh.sideBwtLen());
			assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->fchr()[2]-this->fchr()[1]);
			assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->fchr()[3]-this->fchr()[2]);
			assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->fchr()[4]-this->fchr()[3]);
			assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._len + this->_eh.sideBwtLen());
			assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
			assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
			assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
			cntsUpto[0] += (endianizeU<TIndexOffU>(acgt[0], _switchEndian) + this->fchr()[0]);
			cntsUpto[1] += (endianizeU<TIndexOffU>(acgt[1], _switchEndian) + this->fchr()[1]);
			cntsUpto[2] += (endianizeU<TIndexOffU>(acgt[2], _switchEndian) + this->fchr()[2]);
			cntsUpto[3] += (endianizeU<TIndexOffU>(acgt[3], _switchEndian) + this->fchr()[3]);
			masks[0].resize(num);
			masks[1].resize(num);
			masks[2].resize(num);
			masks[3].resize(num);
			WITHIN_FCHR_DOLLARA(cntsUpto);
			WITHIN_FCHR_DOLLARA(cntsIn);
			// 'cntsUpto' is complete now.
			// Walk forward until we've tallied the entire 'In' range
			TIndexOffU nm = 0;
			// Rest of this side
			nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm);
			assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
			assert_leq(nm, num);
			SideLocus lcopy = l;
			while(nm < num) {
				// Subsequent sides, if necessary
				lcopy.nextSide(this->_eh);
				nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm);
				WITHIN_FCHR_DOLLARA(cntsIn);
				assert_leq(nm, num);
				assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
			}
			assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
			WITHIN_FCHR_DOLLARA(cntsIn);
		}

	/**
	 * Count all occurrences of character c from the beginning of the
	 * forward side to <by,bp> and add in the occ[] count up to the side
	 * break just prior to the side.
	 *
	 * A forward side is shaped like:
	 *
	 * [A] [C] XXXXXXXXXXXXXXXX
	 * -4- -4- --------56------ (numbers in bytes)
	 *         ^
	 *         Side ptr (result from SideLocus.side())
	 *
	 * And following it is a reverse side shaped like:
	 *
	 * [G] [T] XXXXXXXXXXXXXXXX
	 * -4- -4- --------56------ (numbers in bytes)
	 *         ^
	 *         Side ptr (result from SideLocus.side())
	 *
	 */
	inline void countBt2SideEx(const SideLocus& l, TIndexOffU* arrs) const {
		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
		assert_range(0, 3, (int)l._bp);
		countUpToEx(l, arrs);
		if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
			// Adjust for the fact that we represented $ with an 'A', but
			// shouldn't count it as an 'A' here
			if((l._sideByteOff + l._by > _zEbwtByteOff) ||
			   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
			{
				arrs[0]--; // Adjust for '$' looking like an 'A'
			}
		}
		WITHIN_FCHR(arrs);
		WITHIN_BWT_LEN(arrs);
		// Now factor in the occ[] count at the side break
		const uint8_t *side = l.side(this->ebwt());
		const uint8_t *acgt16 = side + this->_eh._sideSz - OFF_SIZE*4;
		const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt16);
		assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->fchr()[1] + this->_eh.sideBwtLen());
		assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->fchr()[2]-this->fchr()[1]);
		assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->fchr()[3]-this->fchr()[2]);
		assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->fchr()[4]-this->fchr()[3]);
		assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._len + this->_eh.sideBwtLen());
		assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
		assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
		assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
		arrs[0] += (endianizeU<TIndexOffU>(acgt[0], _switchEndian) + this->fchr()[0]);
		arrs[1] += (endianizeU<TIndexOffU>(acgt[1], _switchEndian) + this->fchr()[1]);
		arrs[2] += (endianizeU<TIndexOffU>(acgt[2], _switchEndian) + this->fchr()[2]);
		arrs[3] += (endianizeU<TIndexOffU>(acgt[3], _switchEndian) + this->fchr()[3]);
		WITHIN_FCHR(arrs);
	}

	/**
	 * Counts the number of occurrences of character 'c' in the given Ebwt
	 * side up to (but not including) the given byte/bitpair (by/bp).
	 *
	 * This is a performance-critical function.  This used to be the top search-
	 * related hit in the time profile.
	 * The bottleneck was due to cache misses due to random memory access pattern.
	 *
	 * The use of prefetch instructions in initFromRow, when applied enough in advance,
	 * mostly eliminate the cache misses. 
	 *
	 */
	inline TIndexOffU countUpTo(const SideLocus& l, int c) const { // @double-check
		// Count occurrences of c in each 64-bit (using bit trickery);
		// Someday countInU64() and pop() functions should be
		// vectorized/SSE-ized in case that helps.
		TIndexOffU cCnt = 0;
		const uint8_t *side = l.side(this->ebwt());
		int i = 0;
#ifdef POPCNT_CAPABILITY
		if ( _usePOPCNTinstruction) {
			for(; i + 7 < l._by; i += 8) {
				cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
			}
		}
		else {
			for(; i + 7 < l._by; i += 8) {
				cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
			}
		}
#else
		for(; i + 7 < l._by; i += 8) {
			cCnt += countInU64(c, *(uint64_t*)&side[i]);
		}
#endif
		// Count occurences of c in the rest of the side (using LUT)
		for(; i < l._by; i++) {
			cCnt += cCntLUT_4[0][c][side[i]];
		}
		// Count occurences of c in the rest of the byte
		if(l._bp > 0) {
			cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
		}
		return cCnt;
	}

	/**
	 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
	 * within a 64-bit argument.
	 *
	 * Function gets 2.32% in profile
	 */
#ifdef POPCNT_CAPABILITY
	template<typename Operation>
#endif
	inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) {
		uint64_t c0 = c_table[0];
		uint64_t x0 = dw ^ c0;
		uint64_t x1 = (x0 >> 1);
		uint64_t x2 = x1 & (0x5555555555555555llu);
		uint64_t x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
		uint64_t tmp = Operation().pop64(x3);
#else
		uint64_t tmp = pop64(x3);
#endif
		arrs[0] += (uint32_t) tmp;

		c0 = c_table[1];
		x0 = dw ^ c0;
		x1 = (x0 >> 1);
		x2 = x1 & (0x5555555555555555llu);
		x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
		tmp = Operation().pop64(x3);
#else
		tmp = pop64(x3);
#endif
		arrs[1] += (uint32_t) tmp;

		c0 = c_table[2];
		x0 = dw ^ c0;
		x1 = (x0 >> 1);
		x2 = x1 & (0x5555555555555555llu);
		x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
		tmp = Operation().pop64(x3);
#else
		tmp = pop64(x3);
#endif
		arrs[2] += (uint32_t) tmp;

		c0 = c_table[3];
		x0 = dw ^ c0;
		x1 = (x0 >> 1);
		x2 = x1 & (0x5555555555555555llu);
		x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
		tmp = Operation().pop64(x3);
#else
		tmp = pop64(x3);
#endif
		arrs[3] += (uint32_t) tmp;
	}

	/**
	 * Counts the number of occurrences of all four nucleotides in the
	 * given side up to (but not including) the given byte/bitpair (by/bp).
	 * Count for 'a' goes in arrs[0], 'c' in arrs[1], etc.
	 *
	 * This is a performance-critical function.  This used to be the top search-
	 * related hit in the time profile.
	 * The bottleneck was due to cache misses due to random memory access pattern.
	 *
	 * The use of prefetch instructions in initFromRow, when applied enough in advance,
	 * mostly eliminate the cache misses. 
	 */
	inline void countUpToEx(const SideLocus& l, TIndexOffU* arrs) const {
		int i = 0;
		// Count occurrences of each nucleotide in each 64-bit word using
		// bit trickery; note: this seems does not seem to lend a
		// significant boost to performance in practice.  If you comment
		// out this whole loop (which won't affect correctness - it will
		// just cause the following loop to take up the slack) then runtime
		// does not change noticeably. Someday the countInU64() and pop()
		// functions should be vectorized/SSE-ized in case that helps.
		const uint8_t *side = l.side(this->ebwt());

#ifdef POPCNT_CAPABILITY
		if (_usePOPCNTinstruction) {
			for(; i+7 < l._by; i += 8) {
				countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
			}
		}
		else {
			for(; i+7 < l._by; i += 8) {
				countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
			}
		}
#else
		for(; i+7 < l._by; i += 8) {
			countInU64Ex(*(uint64_t*)&side[i], arrs);
		}
#endif
		// Count occurences of nucleotides in the rest of the side (using LUT)
		// Many cache misses on following lines (~20K)
		for(; i < l._by; i++) {
			arrs[0] += cCntLUT_4[0][0][side[i]];
			arrs[1] += cCntLUT_4[0][1][side[i]];
			arrs[2] += cCntLUT_4[0][2][side[i]];
			arrs[3] += cCntLUT_4[0][3][side[i]];
		}
		// Count occurences of c in the rest of the byte
		if(l._bp > 0) {
			arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
			arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
			arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
			arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
		}
	}

#ifndef NDEBUG
	/**
	 * Given top and bot loci, calculate counts of all four DNA chars up to
	 * those loci.  Used for more advanced backtracking-search.
	 */
	inline void mapLFEx(
		const SideLocus& l,
		TIndexOffU *arrs
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			assert_eq(0, arrs[0]);
			assert_eq(0, arrs[1]);
			assert_eq(0, arrs[2]);
			assert_eq(0, arrs[3]);
			countBt2SideEx(l, arrs);
			if(_sanity && !overrideSanity) {
				// Make sure results match up with individual calls to mapLF;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				assert_eq(mapLF(l, 0, true), arrs[0]);
				assert_eq(mapLF(l, 1, true), arrs[1]);
				assert_eq(mapLF(l, 2, true), arrs[2]);
				assert_eq(mapLF(l, 3, true), arrs[3]);
			}
		}
#endif

	/**
	 * Given top and bot rows, calculate counts of all four DNA chars up to
	 * those loci.
	 */
	inline void mapLFEx(
		TIndexOffU top,
		TIndexOffU bot,
		TIndexOffU *tops,
		TIndexOffU *bots
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			SideLocus ltop, lbot;
			SideLocus::initFromTopBot(top, bot, _eh, ebwt(), ltop, lbot);
			mapLFEx(ltop, lbot, tops, bots ASSERT_ONLY(, overrideSanity));
		}

	/**
	 * Given top and bot loci, calculate counts of all four DNA chars up to
	 * those loci.  Used for more advanced backtracking-search.
	 */
	inline void mapLFEx(
		const SideLocus& ltop,
		const SideLocus& lbot,
		TIndexOffU *tops,
		TIndexOffU *bots
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			assert(ltop.repOk(this->eh()));
			assert(lbot.repOk(this->eh()));
			assert_eq(0, tops[0]); assert_eq(0, bots[0]);
			assert_eq(0, tops[1]); assert_eq(0, bots[1]);
			assert_eq(0, tops[2]); assert_eq(0, bots[2]);
			assert_eq(0, tops[3]); assert_eq(0, bots[3]);
			// hopefully ltop and lbot were already prefetched
			// if not, we have time to prefetch lbot while procssing ltop
			lbot.prefetch(this->ebwt());
			countBt2SideEx(ltop, tops);
			countBt2SideEx(lbot, bots);
#ifndef NDEBUG
			if(_sanity && !overrideSanity) {
				// Make sure results match up with individual calls to mapLF;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				assert_eq(mapLF(ltop, 0, true), tops[0]);
				assert_eq(mapLF(ltop, 1, true), tops[1]);
				assert_eq(mapLF(ltop, 2, true), tops[2]);
				assert_eq(mapLF(ltop, 3, true), tops[3]);
				assert_eq(mapLF(lbot, 0, true), bots[0]);
				assert_eq(mapLF(lbot, 1, true), bots[1]);
				assert_eq(mapLF(lbot, 2, true), bots[2]);
				assert_eq(mapLF(lbot, 3, true), bots[3]);
			}
#endif
		}

	/**
	 * Counts the number of occurrences of all four nucleotides in the
	 * given side from the given byte/bitpair (l->_by/l->_bp) (or the
	 * beginning of the side if l == 0).  Count for 'a' goes in arrs[0],
	 * 'c' in arrs[1], etc.
	 *
	 * Note: must account for $.
	 *
	 * Must fill in masks
	 */
	inline TIndexOffU countBt2SideRange2( // @double-check
		const SideLocus& l,
		bool startAtLocus,
		TIndexOffU num,
		TIndexOffU* arrs,
		EList<bool> *masks,
		TIndexOffU maskOff) const
		{
			assert(!masks[0].empty());
			assert_eq(masks[0].size(), masks[1].size());
			assert_eq(masks[0].size(), masks[2].size());
			assert_eq(masks[0].size(), masks[3].size());
			ASSERT_ONLY(TIndexOffU myarrs[4] = {0, 0, 0, 0});
			TIndexOffU nm = 0; // number of nucleotides tallied so far
			int iby = 0;      // initial byte offset
			int ibp = 0;      // initial base-pair offset
			if(startAtLocus) {
				iby = l._by;
				ibp = l._bp;
			} else {
				// Start at beginning
			}
			int by = iby, bp = ibp;
			assert_lt(bp, 4);
			assert_lt(by, (int)this->_eh._sideBwtSz);
			const uint8_t *side = l.side(this->ebwt());
			while(nm < num) {
				int c = (side[by] >> (bp * 2)) & 3;
				assert_lt(maskOff + nm, masks[c].size());
				masks[0][maskOff + nm] = masks[1][maskOff + nm] =
					masks[2][maskOff + nm] = masks[3][maskOff + nm] = false;
				assert_range(0, 3, c);
				// Note: we tally $ just like an A
				arrs[c]++; // tally it
				ASSERT_ONLY(myarrs[c]++);
				masks[c][maskOff + nm] = true; // not dead
				nm++;
				if(++bp == 4) {
					bp = 0;
					by++;
					assert_leq(by, (int)this->_eh._sideBwtSz);
					if(by == (int)this->_eh._sideBwtSz) {
						// Fell off the end of the side
						break;
					}
				}
			}
			WITHIN_FCHR_DOLLARA(arrs);
#ifndef NDEBUG
			if(_sanity) {
				// Make sure results match up with a call to mapLFEx.
				TIndexOffU tops[4] = {0, 0, 0, 0};
				TIndexOffU bots[4] = {0, 0, 0, 0};
				TIndexOffU top = l.toBWRow();
				TIndexOffU bot = top + nm;
				mapLFEx(top, bot, tops, bots, false);
				assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
				assert_eq(myarrs[1], bots[1] - tops[1]);
				assert_eq(myarrs[2], bots[2] - tops[2]);
				assert_eq(myarrs[3], bots[3] - tops[3]);
			}
#endif
			return nm;
		}

	/**
	 * Return the final character in row i (i.e. the i'th character in the
	 * BWT transform).  Note that the 'L' in the name of the function
	 * stands for 'last', as in the literature.
	 */
	inline int rowL(const SideLocus& l) const {
		// Extract and return appropriate bit-pair
		return unpack_2b_from_8b(l.side(this->ebwt())[l._by], l._bp);
	}

	/**
	 * Return the final character in row i (i.e. the i'th character in the
	 * BWT transform).  Note that the 'L' in the name of the function
	 * stands for 'last', as in the literature.
	 */
	inline int rowL(TIndexOffU i) const {
		// Extract and return appropriate bit-pair
		SideLocus l;
		l.initFromRow(i, _eh, ebwt());
		return rowL(l);
	}

	/**
	 * Given top and bot loci, calculate counts of all four DNA chars up to
	 * those loci.  Used for more advanced backtracking-search.
	 */
	inline void mapLFRange(
		SideLocus& ltop,
		SideLocus& lbot,
		TIndexOffU num,        // Number of elts
		TIndexOffU* cntsUpto,  // A/C/G/T counts up to top
		TIndexOffU* cntsIn,    // A/C/G/T counts within range
		EList<bool> *masks
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			assert(ltop.repOk(this->eh()));
			assert(lbot.repOk(this->eh()));
			assert_eq(num, lbot.toBWRow() - ltop.toBWRow());
			assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]);
			assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]);
			assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]);
			assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]);
			countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks);
			assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
#ifndef NDEBUG
			if(_sanity && !overrideSanity) {
				// Make sure results match up with individual calls to mapLF;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				TIndexOffU tops[4] = {0, 0, 0, 0};
				TIndexOffU bots[4] = {0, 0, 0, 0};
				assert(ltop.repOk(this->eh()));
				assert(lbot.repOk(this->eh()));
				mapLFEx(ltop, lbot, tops, bots, false);
				for(int i = 0; i < 4; i++) {
					assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]);
					if(i == 0) {
						assert(cntsIn[i] == bots[i]-tops[i] ||
						       cntsIn[i] == bots[i]-tops[i]+1);
					} else {
						assert_eq(cntsIn[i], bots[i]-tops[i]);
					}
				}
			}
#endif
		}

	/**
	 * Given row i, return the row that the LF mapping maps i to.
	 */
	inline TIndexOffU mapLF(
		const SideLocus& l
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			ASSERT_ONLY(TIndexOffU srcrow = l.toBWRow());
			TIndexOffU ret;
			assert(l.side(this->ebwt()) != NULL);
			int c = rowL(l);
			assert_lt(c, 4);
			assert_geq(c, 0);
			ret = countBt2Side(l, c);
			assert_lt(ret, this->_eh._bwtLen);
			assert_neq(srcrow, ret);
#ifndef NDEBUG
			if(_sanity && !overrideSanity) {
				// Make sure results match up with results from mapLFEx;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				TIndexOffU arrs[] = { 0, 0, 0, 0 };
				mapLFEx(l, arrs, true);
				assert_eq(arrs[c], ret);
			}
#endif
			return ret;
		}

	/**
	 * Given row i and character c, return the row that the LF mapping maps
	 * i to on character c.
	 */
	inline TIndexOffU mapLF(
		const SideLocus& l, int c
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			TIndexOffU ret;
			assert_lt(c, 4);
			assert_geq(c, 0);
			ret = countBt2Side(l, c);
			assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
			if(_sanity && !overrideSanity) {
				// Make sure results match up with results from mapLFEx;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				TIndexOffU arrs[] = { 0, 0, 0, 0 };
				mapLFEx(l, arrs, true);
				assert_eq(arrs[c], ret);
			}
#endif
			return ret;
		}

	/**
	 * Given top and bot loci, calculate counts of all four DNA chars up to
	 * those loci.  Also, update a set of tops and bots for the reverse
	 * index/direction using the idea from the bi-directional BWT paper.
	 */
	inline void mapBiLFEx(
		const SideLocus& ltop,
		const SideLocus& lbot,
		TIndexOffU *tops,
		TIndexOffU *bots,
		TIndexOffU *topsP, // topsP[0] = top
		TIndexOffU *botsP
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
#ifndef NDEBUG
			for(int i = 0; i < 4; i++) {
				assert_eq(0, tops[0]);  assert_eq(0, bots[0]);
			}
#endif
			countBt2SideEx(ltop, tops);
			countBt2SideEx(lbot, bots);
#ifndef NDEBUG
			if(_sanity && !overrideSanity) {
				// Make sure results match up with individual calls to mapLF;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				assert_eq(mapLF(ltop, 0, true), tops[0]);
				assert_eq(mapLF(ltop, 1, true), tops[1]);
				assert_eq(mapLF(ltop, 2, true), tops[2]);
				assert_eq(mapLF(ltop, 3, true), tops[3]);
				assert_eq(mapLF(lbot, 0, true), bots[0]);
				assert_eq(mapLF(lbot, 1, true), bots[1]);
				assert_eq(mapLF(lbot, 2, true), bots[2]);
				assert_eq(mapLF(lbot, 3, true), bots[3]);
			}
#endif
			// bots[0..3] - tops[0..3] = # of ways to extend the suffix with an
			// A, C, G, T
			botsP[0] = topsP[0] + (bots[0] - tops[0]);
			topsP[1] = botsP[0];
			botsP[1] = topsP[1] + (bots[1] - tops[1]);
			topsP[2] = botsP[1];
			botsP[2] = topsP[2] + (bots[2] - tops[2]);
			topsP[3] = botsP[2];
			botsP[3] = topsP[3] + (bots[3] - tops[3]);
		}

	/**
	 * Given row and its locus information, proceed on the given character
	 * and return the next row, or all-fs if we can't proceed on that
	 * character.  Returns max value if this row ends in $.
	 */
	inline TIndexOffU mapLF1(
		TIndexOffU row,       // starting row
		const SideLocus& l, // locus for starting row
		int c               // character to proceed on
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			if(rowL(l) != c || row == _zOff) return OFF_MASK;
			TIndexOffU ret;
			assert_lt(c, 4);
			assert_geq(c, 0);
			ret = countBt2Side(l, c);
			assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
			if(_sanity && !overrideSanity) {
				// Make sure results match up with results from mapLFEx;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				TIndexOffU arrs[] = { 0, 0, 0, 0 };
				mapLFEx(l, arrs, true);
				assert_eq(arrs[c], ret);
			}
#endif
			return ret;
		}


	/**
	 * Given row and its locus information, set the row to LF(row) and
	 * return the character that was in the final column.
	 */
	inline int mapLF1(
		TIndexOffU& row,      // starting row
		const SideLocus& l  // locus for starting row
		ASSERT_ONLY(, bool overrideSanity = false)
		) const
		{
			if(row == _zOff) return -1;
			int c = rowL(l);
			assert_range(0, 3, c);
			row = countBt2Side(l, c);
			assert_lt(row, this->_eh._bwtLen);
#ifndef NDEBUG
			if(_sanity && !overrideSanity) {
				// Make sure results match up with results from mapLFEx;
				// be sure to override sanity-checking in the callee, or we'll
				// have infinite recursion
				TIndexOffU arrs[] = { 0, 0, 0, 0 };
				mapLFEx(l, arrs, true);
				assert_eq(arrs[c], row);
			}
#endif
			return c;
		}

#ifndef NDEBUG
	/// Check that in-memory Ebwt is internally consistent with respect
	/// to given EbwtParams; assert if not
	bool inMemoryRepOk(const EbwtParams& eh) const {
		assert_geq(_zEbwtBpOff, 0);
		assert_lt(_zEbwtBpOff, 4);
		assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
		assert_lt(_zOff, eh._bwtLen);
		// assert_geq(_nFrag, _nPat);
		return true;
	}

	/// Check that in-memory Ebwt is internally consistent; assert if
	/// not
	bool inMemoryRepOk() const {
		return repOk(_eh);
	}

	/// Check that Ebwt is internally consistent with respect to given
	/// EbwtParams; assert if not
	bool repOk(const EbwtParams& eh) const {
		assert(_eh.repOk());
		if(isInMemory()) {
			return inMemoryRepOk(eh);
		}
		return true;
	}

	/// Check that Ebwt is internally consistent; assert if not
	bool repOk() const {
		return repOk(_eh);
	}
#endif

	bool       _switchEndian;
	int32_t    _overrideOffRate;
	bool       _verbose;
	bool       _passMemExc;
	bool       _sanity;
	bool       fw_;     // true iff this is a forward index
	FILE       *_in1;    // input fd for primary index file
	FILE       *_in2;    // input fd for secondary index file
	string     _in1Str; // filename for primary index file
	string     _in2Str; // filename for secondary index file
	string     _inSaStr;  // filename for suffix-array file
	string     _inBwtStr; // filename for BWT file
	TIndexOffU  _zOff;
	TIndexOffU  _zEbwtByteOff;
	TIndexOff   _zEbwtBpOff;
	TIndexOffU  _nPat;  /// number of reference texts
	TIndexOffU  _nFrag; /// number of fragments
	APtrWrap<TIndexOffU> _plen;
	APtrWrap<TIndexOffU> _rstarts; // starting offset of fragments / text indexes
	// _fchr, _ftab and _eftab are expected to be relatively small
	// (usually < 1MB, perhaps a few MB if _fchr is particularly large
	// - like, say, 11).  For this reason, we don't bother with writing
	// them to disk through separate output streams; we
	APtrWrap<TIndexOffU> _fchr;
	APtrWrap<TIndexOffU> _ftab;
	APtrWrap<TIndexOffU> _eftab; // "extended" entries for _ftab
	// _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
	// offset every 16 rows), the total size of _offs is the same as
	// the total size of the input sequence
	APtrWrap<TIndexOffU> _offs;
	// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
	// is at least as large as the input sequence.
	APtrWrap<uint8_t> _ebwt;
	bool       _useMm;        /// use memory-mapped files to hold the index
	bool       useShmem_;     /// use shared memory to hold large parts of the index
	EList<string> _refnames; /// names of the reference sequences
	char *mmFile1_;
	char *mmFile2_;
	EbwtParams _eh;
	bool packed_;

	static const TIndexOffU default_bmax = OFF_MASK;
	static const TIndexOffU default_bmaxMultSqrt = OFF_MASK;
	static const TIndexOffU default_bmaxDivN = 4;
	static const int      default_dcv = 1024;
	static const bool     default_noDc = false;
	static const bool     default_useBlockwise = true;
	static const uint32_t default_seed = 0;
#ifdef BOWTIE_64BIT_INDEX
	static const int      default_lineRate = 7;
#else
	static const int      default_lineRate = 6;
#endif
	static const int      default_offRate = 5;
	static const int      default_offRatePlus = 0;
	static const int      default_ftabChars = 10;
	static const bool     default_bigEndian = false;

private:

	ostream& log() const {
		return cout; // TODO: turn this into a parameter
	}

	/// Print a verbose message and flush (flushing is helpful for
	/// debugging)
	void verbose(const string& s) const {
		if(this->verbose()) {
			this->log() << s.c_str();
			this->log().flush();
		}
	}
};

/**
 * Read reference names from an input stream 'in' for an Ebwt primary
 * file and store them in 'refnames'.
 */
void readEbwtRefnames(FILE* fin, EList<string>& refnames);

/**
 * Read reference names from the index with basename 'in' and store
 * them in 'refnames'.
 */
void readEbwtRefnames(const string& instr, EList<string>& refnames);

/**
 * Read just enough of the Ebwt's header to determine whether it's
 * colorspace.
 */
bool readEbwtColor(const string& instr);

/**
 * Read just enough of the Ebwt's header to determine whether it's
 * entirely reversed.
 */
bool readEntireReverse(const string& instr);

///////////////////////////////////////////////////////////////////////
//
// Functions for building Ebwts
//
///////////////////////////////////////////////////////////////////////

/**
 * Join several text strings together in a way that's compatible with
 * the text-chunking scheme dictated by chunkRate parameter.
 *
 * The non-static member Ebwt::join additionally builds auxilliary
 * arrays that maintain a mapping between chunks in the joined string
 * and the original text strings.
 */
template<typename TStr>
TStr Ebwt::join(EList<TStr>& l, uint32_t seed) {
	RandomSource rand; // reproducible given same seed
	rand.init(seed);
	TStr ret;
	TIndexOffU guessLen = 0;
	for(TIndexOffU i = 0; i < l.size(); i++) {
		guessLen += length(l[i]);
	}
	ret.resize(guessLen);
	TIndexOffU off = 0;
	for(size_t i = 0; i < l.size(); i++) {
		TStr& s = l[i];
		assert_gt(s.length(), 0);
		for(size_t j = 0; j < s.size(); j++) {
			ret.set(s[j], off++);
		}
	}
	return ret;
}

/**
 * Join several text strings together in a way that's compatible with
 * the text-chunking scheme dictated by chunkRate parameter.
 *
 * The non-static member Ebwt::join additionally builds auxilliary
 * arrays that maintain a mapping between chunks in the joined string
 * and the original text strings.
 */
template<typename TStr>
TStr Ebwt::join(EList<FileBuf*>& l,
                EList<RefRecord>& szs,
                TIndexOffU sztot,
                const RefReadInParams& refparams,
                uint32_t seed)
{
	RandomSource rand; // reproducible given same seed
	rand.init(seed);
	RefReadInParams rpcp = refparams;
	TStr ret;
	TIndexOffU guessLen = sztot;
	ret.resize(guessLen);
	ASSERT_ONLY(TIndexOffU szsi = 0);
	TIndexOffU dstoff = 0;
	for(TIndexOffU i = 0; i < l.size(); i++) {
		// For each sequence we can pull out of istream l[i]...
		assert(!l[i]->eof());
		bool first = true;
		while(!l[i]->eof()) {
			RefRecord rec = fastaRefReadAppend(*l[i], first, ret, dstoff, rpcp);
			first = false;
			TIndexOffU bases = rec.len;
			assert_eq(rec.off, szs[szsi].off);
			assert_eq(rec.len, szs[szsi].len);
			assert_eq(rec.first, szs[szsi].first);
			ASSERT_ONLY(szsi++);
			if(bases == 0) continue;
		}
	}
	return ret;
}

/**
 * Join several text strings together according to the text-chunking
 * scheme specified in the EbwtParams.  Ebwt fields calculated in this
 * function are written directly to disk.
 *
 * It is assumed, but not required, that the header values have already
 * been written to 'out1' before this function is called.
 *
 * The static member Ebwt::join just returns a joined version of a
 * list of strings without building any of the auxilliary arrays.
 */
template<typename TStr>
void Ebwt::joinToDisk(
	EList<FileBuf*>& l,
	EList<RefRecord>& szs,
	TIndexOffU sztot,
	const RefReadInParams& refparams,
	TStr& ret,
	ostream& out1,
	ostream& out2)
{
	RefReadInParams rpcp = refparams;
	assert_gt(szs.size(), 0);
	assert_gt(l.size(), 0);
	assert_gt(sztot, 0);
	// Not every fragment represents a distinct sequence - many
	// fragments may correspond to a single sequence.  Count the
	// number of sequences here by counting the number of "first"
	// fragments.
	this->_nPat = 0;
	this->_nFrag = 0;
	for(TIndexOffU i = 0; i < szs.size(); i++) {
		if(szs[i].len > 0) this->_nFrag++;
		if(szs[i].first) this->_nPat++;
	}
	assert_gt(this->_nPat, 0);
	// assert_geq(this->_nFrag, this->_nPat);
	_rstarts.reset();
	writeU<TIndexOffU>(out1, this->_nPat, _switchEndian);
	// Allocate plen[]
	try {
		this->_plen.init(new TIndexOffU[this->_nPat], this->_nPat);
	} catch(bad_alloc& e) {
		cerr << "Out of memory allocating plen[] in Ebwt::join)"
		     << " at " << __FILE__ << ":" << __LINE__ << endl;
		throw e;
	}
	// For each pattern, set plen
	TIndexOff npat = -1;
	for(TIndexOffU i = 0; i < szs.size(); i++) {
		if(szs[i].first) {
			if(npat >= 0) {
				writeU<TIndexOffU>(out1, this->plen()[npat], _switchEndian);
			}
			this->plen()[++npat] = (szs[i].len + szs[i].off);
		} else {
			this->plen()[npat] += (szs[i].len + szs[i].off);
		}
	}
	assert_eq((TIndexOffU)npat, this->_nPat-1);
	writeU<TIndexOffU>(out1, this->plen()[npat], _switchEndian);
	// Write the number of fragments
	writeU<TIndexOffU>(out1, this->_nFrag, _switchEndian);
	TIndexOffU seqsRead = 0;
	ASSERT_ONLY(TIndexOffU szsi = 0);
	ASSERT_ONLY(TIndexOffU entsWritten = 0);
	TIndexOffU dstoff = 0;
	// For each filebuf
	for(unsigned int i = 0; i < l.size(); i++) {
		assert(!l[i]->eof());
		bool first = true;
		// For each *fragment* (not necessary an entire sequence) we
		// can pull out of istream l[i]...
		while(!l[i]->eof()) {
			string name;
			// Push a new name onto our vector
			_refnames.push_back("");
			RefRecord rec = fastaRefReadAppend(
				*l[i], first, ret, dstoff, rpcp, &_refnames.back());
			first = false;
			TIndexOffU bases = rec.len;
			if(rec.first) {
				if(_refnames.back().length() == 0) {
					// If name was empty, replace with an index
					ostringstream stm;
					stm << seqsRead;
					_refnames.back() = stm.str();
				}
			} else {
				// This record didn't actually start a new sequence so
				// no need to add a name
				// assert_eq(0, _refnames.back().length());
				_refnames.pop_back();
			}
			assert_lt(szsi, szs.size());
			assert_eq(rec.off, szs[szsi].off);
			assert_eq(rec.len, szs[szsi].len);
			assert_eq(rec.first, szs[szsi].first);
			assert(rec.first || rec.off > 0);
			ASSERT_ONLY(szsi++);

			// Increment seqsRead if this is the first fragment
			if(rec.first) seqsRead++;
			if(bases == 0) continue;
			assert_leq(bases, this->plen()[seqsRead-1]);
			// Adjust rpcps
			//uint32_t seq = seqsRead-1;
			ASSERT_ONLY(entsWritten++);
		}
		assert_gt(szsi, 0);
		l[i]->reset();
		assert(!l[i]->eof());
#ifndef NDEBUG
		int c = l[i]->get();
		assert_eq('>', c);
		assert(!l[i]->eof());
		l[i]->reset();
		assert(!l[i]->eof());
#endif
	}
	assert_eq(entsWritten, this->_nFrag);
}

/**
 * Build an Ebwt from a string 's' and its suffix array 'sa' (which
 * might actually be a suffix array *builder* that builds blocks of the
 * array on demand).  The bulk of the Ebwt, i.e. the ebwt and offs
 * arrays, is written directly to disk.  This is by design: keeping
 * those arrays in memory needlessly increases the footprint of the
 * building process.  Instead, we prefer to build the Ebwt directly
 * "to disk" and then read it back into memory later as necessary.
 *
 * It is assumed that the header values and join-related values (nPat,
 * plen) have already been written to 'out1' before this function
 * is called.  When this function is finished, it will have
 * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
 * file and offs to the secondary file.
 *
 * Assume DNA/RNA/any alphabet with 4 or fewer elements.
 * Assume occ array entries are 32 bits each.
 *
 * @param sa            the suffix array to convert to a Ebwt
 * @param s             the original string
 * @param out
 */
template<typename TStr>
void Ebwt::buildToDisk(
	InorderBlockwiseSA<TStr>& sa,
	const TStr& s,
	ostream& out1,
	ostream& out2,
	ostream* saOut,
	ostream* bwtOut)
{
	const EbwtParams& eh = this->_eh;

	assert(eh.repOk());
	assert_eq(s.length()+1, sa.size());
	assert_eq(s.length(), eh._len);
	assert_gt(eh._lineRate, 3);
	assert(sa.suffixItrIsReset());

	TIndexOffU len = eh._len;
	TIndexOffU ftabLen = eh._ftabLen;
	TIndexOffU sideSz = eh._sideSz;
	TIndexOffU ebwtTotSz = eh._ebwtTotSz;
	TIndexOffU fchr[] = {0, 0, 0, 0, 0};
	EList<TIndexOffU> ftab(EBWT_CAT);
	TIndexOffU zOff = OFF_MASK;

	// Save # of occurrences of each character as we walk along the bwt
	TIndexOffU occ[4] = {0, 0, 0, 0};
	TIndexOffU occSave[4] = {0, 0, 0, 0};

	// Record rows that should "absorb" adjacent rows in the ftab.
	// The absorbed rows represent suffixes shorter than the ftabChars
	// cutoff.
	uint8_t absorbCnt = 0;
	EList<uint8_t> absorbFtab(EBWT_CAT);
	try {
		VMSG_NL("Allocating ftab, absorbFtab");
		ftab.resize(ftabLen);
		ftab.fillZero();
		absorbFtab.resize(ftabLen);
		absorbFtab.fillZero();
	} catch(bad_alloc &e) {
		cerr << "Out of memory allocating ftab[] or absorbFtab[] "
		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
		     << __LINE__ << endl;
		throw e;
	}

	// Allocate the side buffer; holds a single side as its being
	// constructed and then written to disk.  Reused across all sides.
#ifdef SIXTY4_FORMAT
	EList<uint64_t> ebwtSide(EBWT_CAT);
#else
	EList<uint8_t> ebwtSide(EBWT_CAT);
#endif
	try {
#ifdef SIXTY4_FORMAT
		ebwtSide.resize(sideSz >> 3);
#else
		ebwtSide.resize(sideSz);
#endif
	} catch(bad_alloc &e) {
		cerr << "Out of memory allocating ebwtSide[] in "
		     << "Ebwt::buildToDisk() at " << __FILE__ << ":"
		     << __LINE__ << endl;
		throw e;
	}

	// Points to the base offset within ebwt for the side currently
	// being written
	TIndexOffU side = 0;

	// Whether we're assembling a forward or a reverse bucket
	bool fw;
	TIndexOff sideCur = 0;
	fw = true;

	// Have we skipped the '$' in the last column yet?
	ASSERT_ONLY(bool dollarSkipped = false);

	TIndexOffU si = 0;   // string offset (chars)
	ASSERT_ONLY(TIndexOffU lastSufInt = 0);
	ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
	                               // array (as opposed to the padding at the
	                               // end)
	// Iterate over packed bwt bytes
	VMSG_NL("Entering Ebwt loop");
	ASSERT_ONLY(TIndexOffU beforeEbwtOff = (TIndexOffU)out1.tellp()); // @double-check - pos_type, std::streampos

	// First integer in the suffix-array output file is the length of the
	// array, including $
	if(saOut != NULL) {
		// Write length word
		writeU<TIndexOffU>(*saOut, len+1, _switchEndian);
	}

	// First integer in the BWT output file is the length of BWT(T), including $
	if(bwtOut != NULL) {
		// Write length word
		writeU<TIndexOffU>(*bwtOut, len+1, _switchEndian);
	}
	while(side < ebwtTotSz) {
		// Sanity-check our cursor into the side buffer
		assert_geq(sideCur, 0);
		assert_lt(sideCur, (int)eh._sideBwtSz);
		assert_eq(0, side % sideSz); // 'side' must be on side boundary
		ebwtSide[sideCur] = 0; // clear
		assert_lt(side + sideCur, ebwtTotSz);
		// Iterate over bit-pairs in the si'th character of the BWT
#ifdef SIXTY4_FORMAT
		for(int bpi = 0; bpi < 32; bpi++, si++)
#else
			for(int bpi = 0; bpi < 4; bpi++, si++)
#endif
			{
				int bwtChar;
				bool count = true;
				if(si <= len) {
					// Still in the SA; extract the bwtChar
					TIndexOffU saElt = sa.nextSuffix();
					// Write it to the optional suffix-array output file
					if(saOut != NULL) {
						writeU<TIndexOffU>(*saOut, saElt, _switchEndian);
					}
					// TODO: what exactly to write to the BWT output file?  How to
					// represent $?  How to pack nucleotides into bytes/words?

					// (that might have triggered sa to calc next suf block)
					if(saElt == 0) {
						// Don't add the '$' in the last column to the BWT
						// transform; we can't encode a $ (only A C T or G)
						// and counting it as, say, an A, will mess up the
						// LR mapping
						bwtChar = 0; count = false;
						ASSERT_ONLY(dollarSkipped = true);
						zOff = si; // remember the SA row that
						// corresponds to the 0th suffix
					} else {
						bwtChar = (int)(s[saElt-1]);
						assert_lt(bwtChar, 4);
						// Update the fchr
						fchr[bwtChar]++;
					}
					// Update ftab
					if((len-saElt) >= (TIndexOffU)eh._ftabChars) {
						// Turn the first ftabChars characters of the
						// suffix into an integer index into ftab.  The
						// leftmost (lowest index) character of the suffix
						// goes in the most significant bit pair if the
						// integer.
						TIndexOffU sufInt = 0;
						for(int i = 0; i < eh._ftabChars; i++) {
							sufInt <<= 2;
							assert_lt((TIndexOffU)i, len-saElt);
							sufInt |= (unsigned char)(s[saElt+i]);
						}
						// Assert that this prefix-of-suffix is greater
						// than or equal to the last one (true b/c the
						// suffix array is sorted)
#ifndef NDEBUG
						if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
						lastSufInt = sufInt;
#endif
						// Update ftab
						assert_lt(sufInt+1, ftabLen);
						ftab[sufInt+1]++;
						if(absorbCnt > 0) {
							// Absorb all short suffixes since the last
							// transition into this transition
							absorbFtab[sufInt] = absorbCnt;
							absorbCnt = 0;
						}
					} else {
						// Otherwise if suffix is fewer than ftabChars
						// characters long, then add it to the 'absorbCnt';
						// it will be absorbed into the next transition
						assert_lt(absorbCnt, 255);
						absorbCnt++;
					}
					// Suffix array offset boundary? - update offset array
					if((si & eh._offMask) == si) {
						assert_lt((si >> eh._offRate), eh._offsLen);
						// Write offsets directly to the secondary output
						// stream, thereby avoiding keeping them in memory
						writeU<TIndexOffU>(out2, saElt, _switchEndian);
					}
				} else {
					// Strayed off the end of the SA, now we're just
					// padding out a bucket
#ifndef NDEBUG
					if(inSA) {
						// Assert that we wrote all the characters in the
						// string before now
						assert_eq(si, len+1);
						inSA = false;
					}
#endif
					// 'A' used for padding; important that padding be
					// counted in the occ[] array
					bwtChar = 0;
				}
				if(count) occ[bwtChar]++;
				// Append BWT char to bwt section of current side
				if(fw) {
					// Forward bucket: fill from least to most
#ifdef SIXTY4_FORMAT
					ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
					if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
#else
					pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi);
					assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar);
#endif
				} else {
					// Backward bucket: fill from most to least
#ifdef SIXTY4_FORMAT
					ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
					if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
#else
					pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi);
					assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
#endif
				}
			} // end loop over bit-pairs
		assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
#ifdef SIXTY4_FORMAT
		assert_eq(0, si & 31);
#else
		assert_eq(0, si & 3);
#endif

		sideCur++;
		if(sideCur == (int)eh._sideBwtSz) {
			sideCur = 0;
			TIndexOffU *cpptr = reinterpret_cast<TIndexOffU*>(ebwtSide.ptr());
			// Write 'A', 'C', 'G' and 'T' tallies
			side += sideSz;
			assert_leq(side, eh._ebwtTotSz);
#ifdef BOWTIE_64BIT_INDEX
			cpptr[(sideSz >> 3)-4] = endianizeU<TIndexOffU>(occSave[0], _switchEndian);
			cpptr[(sideSz >> 3)-3] = endianizeU<TIndexOffU>(occSave[1], _switchEndian);
			cpptr[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occSave[2], _switchEndian);
			cpptr[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occSave[3], _switchEndian);
#else
			cpptr[(sideSz >> 2)-4] = endianizeU<TIndexOffU>(occSave[0], _switchEndian);
			cpptr[(sideSz >> 2)-3] = endianizeU<TIndexOffU>(occSave[1], _switchEndian);
			cpptr[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occSave[2], _switchEndian);
			cpptr[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occSave[3], _switchEndian);
#endif
			occSave[0] = occ[0];
			occSave[1] = occ[1];
			occSave[2] = occ[2];
			occSave[3] = occ[3];
			// Write backward side to primary file
			out1.write((const char *)ebwtSide.ptr(), sideSz);
		}
	}
	VMSG_NL("Exited Ebwt loop");
	assert_neq(zOff, OFF_MASK);
	if(absorbCnt > 0) {
		// Absorb any trailing, as-yet-unabsorbed short suffixes into
		// the last element of ftab
		absorbFtab[ftabLen-1] = absorbCnt;
	}
	// Assert that our loop counter got incremented right to the end
	assert_eq(side, eh._ebwtTotSz);
	// Assert that we wrote the expected amount to out1
	assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); // @double-check - pos_type
	// assert that the last thing we did was write a forward bucket

	//
	// Write zOff to primary stream
	//
	writeU<TIndexOffU>(out1, zOff, _switchEndian);

	//
	// Finish building fchr
	//
	// Exclusive prefix sum on fchr
	for(int i = 1; i < 4; i++) {
		fchr[i] += fchr[i-1];
	}
	assert_eq(fchr[3], len);
	// Shift everybody up by one
	for(int i = 4; i >= 1; i--) {
		fchr[i] = fchr[i-1];
	}
	fchr[0] = 0;
	if(_verbose) {
		for(int i = 0; i < 5; i++)
			cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
	}
	// Write fchr to primary file
	for(int i = 0; i < 5; i++) {
		writeU<TIndexOffU>(out1, fchr[i], _switchEndian);
	}

	//
	// Finish building ftab and build eftab
	//
	// Prefix sum on ftable
	TIndexOffU eftabLen = 0;
	assert_eq(0, absorbFtab[0]);
	for(TIndexOffU i = 1; i < ftabLen; i++) {
		if(absorbFtab[i] > 0) eftabLen += 2;
	}
	assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2);
	eftabLen = eh._ftabChars*2;
	EList<TIndexOffU> eftab(EBWT_CAT);
	try {
		eftab.resize(eftabLen);
		eftab.fillZero();
	} catch(bad_alloc &e) {
		cerr << "Out of memory allocating eftab[] "
		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
		     << __LINE__ << endl;
		throw e;
	}
	TIndexOffU eftabCur = 0;
	for(TIndexOffU i = 1; i < ftabLen; i++) {
		TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
		if(absorbFtab[i] > 0) {
			// Skip a number of short pattern indicated by absorbFtab[i]
			TIndexOffU hi = lo + absorbFtab[i];
			assert_lt(eftabCur*2+1, eftabLen);
			eftab[eftabCur*2] = lo;
			eftab[eftabCur*2+1] = hi;
			ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab
			assert_eq(lo, Ebwt::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
			assert_eq(hi, Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
		} else {
			ftab[i] = lo;
		}
	}
	assert_eq(Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
	// Write ftab to primary file
	for(TIndexOffU i = 0; i < ftabLen; i++) {
		writeU<TIndexOffU>(out1, ftab[i], _switchEndian);
	}
	// Write eftab to primary file
	for(TIndexOffU i = 0; i < eftabLen; i++) {
		writeU<TIndexOffU>(out1, eftab[i], _switchEndian);
	}

	// Note: if you'd like to sanity-check the Ebwt, you'll have to
	// read it back into memory first!
	assert(!isInMemory());
	VMSG_NL("Exiting Ebwt::buildToDisk()");
}

/**
 * Try to find the Bowtie index specified by the user.  First try the
 * exact path given by the user.  Then try the user-provided string
 * appended onto the path of the "indexes" subdirectory below this
 * executable, then try the provided string appended onto
 * "$BOWTIE2_INDEXES/".
 */
string adjustEbwtBase(const string& cmdline,
		      const string& ebwtFileBase,
		      bool verbose);


extern string gLastIOErrMsg;

/* Checks whether a call to read() failed or not. */
inline bool is_read_err(int fdesc, ssize_t ret, size_t count){
	if (ret < 0) {
		std::stringstream sstm;
		sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
		gLastIOErrMsg = sstm.str();
		return true;
	}
	return false;
}

/* Checks whether a call to fread() failed or not. */
inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){
	if (ferror(file_hd)) {
		gLastIOErrMsg = "Error Reading File!";
		return true;
	}
	return false;
}


#endif /*EBWT_H_*/
