/************************************************************************
 * CASSI, version 1.01
 * Copyright 2012,
 * Richard Howey
 * Institute of Genetic Medicine, Newcastle University
 *
 * richard.howey@ncl.ac.uk
 * http://www.staff.ncl.ac.uk/richard.howey/
 *
 * This file is part of CASSI, the SNP interaction analysis program.
 *
 * CASSI is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CASSI is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CASSI.  If not, see <http://www.gnu.org/licenses/>.
 ************************************************************************/


/*! \file Data.cpp
    \brief This file contains the source for manipulating SNP data.
    
*/

#include <string>
#include <map>
#include <set>
#include <list>
#include <iostream>
#include <fstream>
#include <sstream>
#include <math.h>

using namespace std; // initiates the "std" or "standard" namespace

#include "main.h"
#include "Data.h"
 


//! Adds one observation to the joint genotype table of two SNPs.
//!
//! \param count1 genotype code of the first SNP for a subject.
//! \param count2 genotype code of the second SNP for the same subject.
//!
//! Only the codes 0, 1 and 2 select a cell of the table. The observation is
//! ignored if either code is anything else: 3 is the code used in this file
//! for a missing genotype, and larger values are rejected as well so that an
//! unexpected code can never index outside the table.
void JointGenotypeCounts::addCount(const unsigned int & count1, const unsigned int & count2)
{
	//number of genotype classes per SNP
	//NOTE(review): assumes counts is declared as a 3 x 3 table in Data.h -- confirm
	const unsigned int noGenotypeClasses = 3;

	if(count1 >= noGenotypeClasses || count2 >= noGenotypeClasses) return;

	counts[count1][count2]++;
	++total;
}

//! Fills in the adjusted (floating point) joint genotype counts from the raw counts.
//!
//! \param addHalf if true 0.5 is added to every cell regardless; if false 0.5
//! is only added when at least one of the nine cells is zero.
//!
//! halfAdded records whether the 0.5 was added. Afterwards, if the (0,0) cell
//! divided by total is below 0.01, the (0,0) cell is raised and all nine cells
//! are multiplied by a common factor.
void JointGenotypeCounts::setAdjCounts(const bool & addHalf)
{
	//look for an empty cell anywhere in the table
	bool emptyCellFound = false;
	for(unsigned int row = 0; row < 3 && !emptyCellFound; ++row)
	{
		for(unsigned int col = 0; col < 3 && !emptyCellFound; ++col)
		{
			if(counts[row][col] == 0) emptyCellFound = true;
		}
	}

	const bool useHalf = addHalf || emptyCellFound;
	halfAdded = useHalf;

	//amount added to every cell
	const double offset = useHalf ? 0.5 : 0.0;

	adjCount00 = offset + static_cast<double>(counts[0][0]);
	adjCount01 = offset + static_cast<double>(counts[0][1]);
	adjCount02 = offset + static_cast<double>(counts[0][2]);
	adjCount10 = offset + static_cast<double>(counts[1][0]);
	adjCount11 = offset + static_cast<double>(counts[1][1]);
	adjCount12 = offset + static_cast<double>(counts[1][2]);
	adjCount20 = offset + static_cast<double>(counts[2][0]);
	adjCount21 = offset + static_cast<double>(counts[2][1]);
	adjCount22 = offset + static_cast<double>(counts[2][2]);

	const double totalCount = static_cast<double>(total);

	//nothing more to do unless the baseline (0,0) frequency is less than 0.01
	if(!(adjCount00/totalCount < 0.01)) return;

	//sum of the raised baseline cell and the other eight cells,
	//accumulated in a fixed order so the result does not depend on rounding order
	double adjustedSum = 0.01*totalCount;
	adjustedSum += adjCount01;
	adjustedSum += adjCount02;
	adjustedSum += adjCount10;
	adjustedSum += adjCount11;
	adjustedSum += adjCount12;
	adjustedSum += adjCount20;
	adjustedSum += adjCount21;
	adjustedSum += adjCount22;

	const double factor = totalCount/adjustedSum;

	adjCount00 = 0.01*factor*totalCount;
	adjCount01 *= factor;
	adjCount02 *= factor;
	adjCount10 *= factor;
	adjCount11 *= factor;
	adjCount12 *= factor;
	adjCount20 *= factor;
	adjCount21 *= factor;
	adjCount22 *= factor;
}

//! Clears every cell of the joint genotype table and the running total.
void JointGenotypeCounts::resetCounts()
{
	for(unsigned int genotype1 = 0; genotype1 < 3; ++genotype1)
	{
		for(unsigned int genotype2 = 0; genotype2 < 3; ++genotype2)
		{
			counts[genotype1][genotype2] = 0;
		}
	}

	total = 0;
}


//! Sets up SNP data from the .bim file.
//!
//! \param filename name of the data file; its last four characters are replaced
//! with ".bim" to give the map file name. If it is empty the object is marked
//! as using one file only (oneFile) and nothing is read.
//! \param secondFile if true the SNPs are stored in basePairs2, otherwise in basePairs1.
//!
//! Every stored SNP is keyed by a number counting up from 1 in file order and
//! holds the pair (base pair position, SNP identifier). A summary of the number
//! of SNPs is written with out(). If the map file cannot be opened, or a line
//! cannot be parsed, an error is reported and the program exits.
void DescriptionOfSNPs::setUpSNPDesciptionData(string & filename, const bool & secondFile)
{
	if(filename.empty())
	{
		oneFile = true;
		return;
	}

	//try and find the binary map file, .bim: swap the 4 character extension for ".bim"
	//(a name shorter than 4 characters simply gets ".bim" appended)
	const string::size_type extensionLength = 4;
	string mapFilename = filename;
	if(mapFilename.length() >= extensionLength) mapFilename.erase(mapFilename.length() - extensionLength);
	mapFilename += ".bim";

	ifstream readMapFile;
	readMapFile.open(mapFilename.c_str());
	if(!readMapFile.is_open())
	{
		outErr("Cannot read map file: "); outErr(mapFilename); outErr("!\n");
		exit(0);
	}

	string chromosome, snpIdentifier, geneticDistance;
	string alleleName1, alleleName2;
	string prevSnpIdentifier = "";
	unsigned int basePairPosition = 0;
	unsigned int snpID = 1;

	//loop thro' the SNPs, one six-field record at a time
	//the extraction itself is the loop test (rather than eof()) so that a record is only used when
	//all of its fields were read, and a malformed line ends the loop instead of repeating forever
	while(readMapFile >> chromosome >> snpIdentifier >> geneticDistance >> basePairPosition >> alleleName1 >> alleleName2)
	{
		//a SNP identifier repeated on consecutive lines is only recorded once
		if(snpIdentifier != prevSnpIdentifier)
		{
			if(!secondFile) basePairs1[snpID] = make_pair(basePairPosition, snpIdentifier);
			else basePairs2[snpID] = make_pair(basePairPosition, snpIdentifier);
			snpID++;
		}

		prevSnpIdentifier = snpIdentifier;
	}

	//stopping anywhere other than the end of the file means a field could not be parsed
	if(!readMapFile.eof())
	{
		outErr("Problem reading map file: "); outErr(mapFilename); outErr("!\n");
		readMapFile.close();
		exit(0);
	}

	readMapFile.close();

	out("Data Summary Statistics:\n");
	if(secondFile) {out("Number of SNPs in "); out(filename); out(": "); out(basePairs2.size()); out("\n");}
	else if(basePairs2.size() == 0) {out("Number of SNPs: "); out(basePairs1.size()); out("\n");}
	else {out("Number of SNPs in "); out(filename); out(": "); out(basePairs1.size()); out("\n");}
}

//! Returns the SNP name (rs number) for a given SNP.
//!
//! \param snpNo number of the SNP to look up.
//! \param winNo window number; window 1 (or any window when only one file is
//! in use) is looked up in basePairs1, otherwise basePairs2 is used.
//! \return the stored SNP identifier, or "??" if the SNP number is not found.
string DescriptionOfSNPs::getSNPName(unsigned int & snpNo, const unsigned int & winNo) const
{
	map<unsigned int, pair<unsigned int, string> >::const_iterator snpEntry;

	//each lookup is checked against the end of the map that was actually searched
	if(oneFile || winNo == 1)
	{
		snpEntry = basePairs1.find(snpNo);
		if(snpEntry != basePairs1.end()) return snpEntry->second.second;
	}
	else
	{
		snpEntry = basePairs2.find(snpNo);
		if(snpEntry != basePairs2.end()) return snpEntry->second.second;
	}

	return "??";
}

void SNPWindow::reopenBinaryFile()
{
	readSNPData.close();
	readSNPData.open(filename.c_str(), ios::binary);
	
	//get past the three special bytes at before the SNP data
	char buffer[3];
	readSNPData.read(buffer, 3);
};

//! Opens the binary pedigree file (.bed) for the first time and checks its three byte header.
//!
//! The program exits with an error message if the file cannot be opened, if
//! the header cannot be read or its first two bytes are not the magic numbers
//! 108 and 27, or if the third byte is 0 (file not in SNP-major mode).
//! On success the stream is left positioned just after the header.
void SNPWindow::openBinaryFileFirst()
{
	//try and find the binary pedigree file, .bed
	readSNPData.open(filename.c_str(), ios::binary);

	if(!readSNPData.is_open())
	{
		outErr("Cannot read binary pedigree file: "); outErr(filename); outErr("!\n");
		exit(0);
	}

	//zero-filled so that no uninitialised byte is ever inspected if the file is shorter than the header
	char header[3] = {0, 0, 0};
	readSNPData.read(header, 3);
	const bool headerMissing = readSNPData.fail();

	//check the plink magic numbers for the file type
	const unsigned int magicNumber1 = header[0];
	const unsigned int magicNumber2 = header[1];

	if(headerMissing || magicNumber1 != 108 || magicNumber2 != 27)
	{
		outErr("Detected an old version .bed file!\n");
		outErr("Please use PLINK to update the .bed file.\n");

		readSNPData.close();
		exit(0);
	}

	//3rd byte gives the layout of the genotype data; 0 is rejected as not being SNP-major
	const unsigned int mode = header[2];
	if(mode == 0)
	{
		outErr("The binary pedigree file must be in SNP-major mode!\n");
		outErr("Please use PLINK to update the .bed file.\n");

		readSNPData.close();
		exit(0);
	}
}

//! Decodes the next genotype from the binary SNP data.
//!
//! \return the number of minor alleles (0, 1 or 2), or 3 to denote a missing genotype.
//!
//! A genotype takes two bits, read from the least significant end of the
//! current byte. A new byte is fetched from the file when bitCount is 9;
//! hitting the end of the file at that point is reported and the program exits.
unsigned int SNPWindow::getNextNoOfMinorAlleles()
{
	//fetch the next byte of data when one is due
	if(bitCount == 9)
	{
		readSNPData.read(buffer, 1);
		if(readSNPData.eof())
		{
			outErr("Error: reached end of binary SNP file!\n");
			exit(0);
		}

		aBit = buffer[0];
		bitCount = 1;
	}

	//take the two lowest bits, shifting each one out once it has been read
	const int firstBit = aBit & one;
	aBit = aBit >> 1;
	const int secondBit = aBit & one;
	aBit = aBit >> 1;

	bitCount += 2;

	//bit pair (first, second): (1,1) -> 0, (0,1) -> 1, (0,0) -> 2, anything else is missing
	if(firstBit == 1 && secondBit == 1) return 0;
	if(firstBit == 0 && secondBit == 1) return 1;
	if(firstBit == 0 && secondBit == 0) return 2;

	return 3; //denotes missing genotype
}


//! Creates a SNP window whose SNP data is read from the binary file.
//!
//! \param ts total number of subjects; the SNP data object is given one entry per subject.
//! \param ssn passed on to the SNPWindow constructor.
//! \param fname passed on to the SNPWindow constructor.
//! \param readFirstSNP if true the file is advanced to the first SNP of the
//! window and that SNP's genotypes are read in; if false every entry is set to
//! 0 as a placeholder to be overwritten later.
SNPWindowReadFromFile::SNPWindowReadFromFile(const unsigned int & ts, unsigned int & ssn, string & fname, const bool & readFirstSNP) : SNPWindow(ts, ssn, fname)
{
	//the one SNP data object that is refilled as the window moves
	snp = new SNPData();

	openBinaryFileFirst();

	//only position the file when the first SNP is actually wanted now
	if(readFirstSNP) advanceToFirstWindow();

	//one entry per subject: either the real genotype or a zero placeholder
	for(unsigned int subjectsLeft = ts; subjectsLeft > 0; --subjectsLeft)
	{
		if(readFirstSNP) snp->noMinorAllelesAllSubjects.push_back(getNextNoOfMinorAlleles());
		else snp->noMinorAllelesAllSubjects.push_back(0);
	}
}

//! Creates a SNP window that keeps the data of all of its SNPs in memory.
//!
//! \param ts total number of subjects; one genotype is read per subject for each SNP.
//! \param ssn number of the first SNP to store (also passed on to the SNPWindow constructor).
//! \param fname passed on to the SNPWindow constructor.
//! \param endSNPNo number of the last SNP to store.
//!
//! After construction currentSNP refers to the first stored SNP.
SNPWindowStoreAllData::SNPWindowStoreAllData(const unsigned int & ts, unsigned int & ssn, string & fname, unsigned int & endSNPNo) : SNPWindow(ts, ssn, fname)
{
	openBinaryFileFirst();

	//skip over the SNPs that come before the window
	advanceToFirstWindow();

	//read and store every SNP from the first to the last one requested
	for(unsigned int snpNo = ssn; snpNo <= endSNPNo; ++snpNo)
	{
		startNewByte();

		SNPData * const storedSNP = new SNPData();
		for(unsigned int subjectsLeft = ts; subjectsLeft > 0; --subjectsLeft)
		{
			const unsigned int noMinorAlleles = getNextNoOfMinorAlleles();
			storedSNP->noMinorAllelesAllSubjects.push_back(noMinorAlleles);
		}

		allSNPData.push_back(storedSNP);
	}

	currentSNP = allSNPData.begin();
}
	

//! Sets the SNP to the start of the window
void SNPWindowReadFromFile::startWindowAtStart()
{
	reopenBinaryFile();

	advanceToFirstWindow();

	moveToNextSNP();//read in the data for the first SNP in the window
};

//! Overwrites the SNP data object with the genotypes of the next SNP in the file.
//!
//! One genotype is read for each entry already held in the SNP data object.
void SNPWindowReadFromFile::moveToNextSNP()
{
	//ensure a new byte is read
	startNewByte();

	list<unsigned int>::iterator subject = snp->noMinorAllelesAllSubjects.begin();
	while(subject != snp->noMinorAllelesAllSubjects.end())
	{
		*subject = getNextNoOfMinorAlleles();
		++subject;
	}
}

//! Skips over the data of one SNP in the binary file.
//!
//! The bytes are read one at a time and thrown away.
void SNPWindow::advanceSNPData()
{
	//4 subject genotypes are stored per byte and a new byte is started after each SNP,
	//so a SNP takes totalNoSubjects/4 bytes, plus one more if there is a remainder
	unsigned int bytesPerSNP = static_cast<unsigned int>(totalNoSubjects/4);
	if(totalNoSubjects%4 != 0) bytesPerSNP += 1;

	char discardedByte[1];
	for(unsigned int bytesLeft = bytesPerSNP; bytesLeft > 0; --bytesLeft)
	{
		readSNPData.read(discardedByte, 1);
	}
}

//! Moves through the binary file to the chosen first SNP of the window.
//!
//! SNPs are skipped one at a time until the SNP count equals startSNPNo, or
//! until the end of the file has been reached.
void SNPWindow::advanceToFirstWindow()
{
	unsigned int snpCount = 1;

	while(snpCount != startSNPNo)
	{
		//pass over the data of a SNP that comes before the window
		advanceSNPData();
		++snpCount;

		if(readSNPData.eof()) break;
	}
}

