package org.baderlab.pdzsvmstruct.data.manager;


import org.baderlab.pdzsvmstruct.utils.Constants;
import org.baderlab.pdzsvmstruct.predictor.pwm.PWM;

import java.util.*;

import org.baderlab.brain.ProteinProfile;
import org.biojava.bio.seq.db.HashSequenceDB;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.Sequence;
import org.baderlab.pdzsvmstruct.utils.PDZSVMUtils;
import weka.core.Utils;

/**
 * Copyright (c) 2011 University of Toronto
 * Code written by: Shirley Hui
 * Authors: Shirley Hui, Gary Bader
 *
 * This file is part of PDZSVMStruct.
 *
 * PDZSVM is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PDZSVM is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  The software and
 * documentation provided hereunder is on an "as is" basis, and the
 * University of Toronto has no obligations to provide maintenance,
 * support, updates, enhancements or modifications.  In no event shall
 * the University of Toronto be liable to any party for direct, indirect,
 * special, incidental or consequential damages, including lost profits,
 * arising out of the use of this software and its documentation, even if
 * the University of Toronto has been advised of the possibility of such
 * damage. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with PDZSVMStruct.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Manager for different sets of sequences, including sets of sequences from
 * proteomes and experiments.
 *
 * An instance is backed either by a sequence database read from a file
 * (organism proteome or experimental peptide set) or by a pool of sequence
 * strings collected from a list of protein profiles. The pool of sequence
 * strings is built lazily from the database by {@link #getSequencePool(int)}
 * and then cached for the lifetime of the instance.
 */
public class SequencePoolManager {

    private static final String HUMLIB_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/HumLib/HumLib6223.txt";

    private static final String MOUSE_GENOME_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Proteomes/Mus_musculus.NCBIM37.59.pep.all.fa";
    private static final String HUMAN_GENOME_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Proteomes/Homo_sapiens.GRCh37.56.pep.all.fa";
    //private static String HUMAN_GENOME_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Proteomes/UniprotHumanProteomeJune142011.fa";
    private static final String WORM_GENOME_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Proteomes/Caenorhabditis_elegans.WS200.56.pep.all.fa";
    private static final String FLY_GENOME_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Proteomes/Drosophila_melanogaster.BDGP5.13.56.pep.all.fa";
    private static final String HUMAN_RED_GENOME_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Proteomes/ReducedHumanProteome3.fa";
    private static final String MOUSE_RED_GENOME_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Proteomes/ReducedMouseProteome3.fa";

    private static final String STIFFLER_PEPTIDES_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Mouse/PDZ/Fasta/StifflerPeptides.fa";
    private static final String SIDHU_HUMAN_PEPTIDES_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Human/PDZ/Fasta/SidhuPeptides-Human.fa";
    private static final String SIDHU_WORM_PEPTIDES_FILE = DataFileManager.DATA_ROOT_DIR + "/Data/Worm/PDZ/Fasta/SidhuPeptides-Worm.fa";
    // NOTE(review): unlike every other data file this is an absolute, user-specific
    // path that does not go through DataFileManager.DATA_ROOT_DIR - confirm the
    // intended location before relying on the TAP_PSD95 method on another machine.
    private static final String TAP_PSD95_MOUSE_PEPTIDES_FILE = "/Users/shirleyhui/Data/TAP-PSD95/TAP-PSD95-Seq.fa";

    /** Tolerance used when testing two identity fractions for equality. */
    private static final double IDENTITY_EPSILON = 1e-9;

    /** Sequence database read from file; stays empty if nothing could be loaded. */
    private HashSequenceDB genomeSequenceDB = new HashSequenceDB();
    /** Value handed to PDZSVMUtils.readAlignmentToDB when a file is read. */
    private final int numPos = Constants.NUM_RES_POS;
    /** Cached pool of sequence strings (elements are String). */
    private final List sequencePoolList = new ArrayList();

    public static final int NONE = 0;
    public static final int ASC = 1;
    public static final int DESC = 2;

    /**
     * Creates a manager backed by the proteome file of the given organism.
     *
     * @param organism one of the organism keys defined in Constants
     */
    public SequencePoolManager(String organism)
    {
        makeGenomeDB(organism);
    }

    /**
     * Creates a manager backed by the peptide file of an experiment.
     *
     * @param organism one of the organism keys defined in Constants
     * @param method   one of the experimental method keys defined in Constants
     */
    public SequencePoolManager(String organism, String method)
    {
        makeExperimentDB(organism, method);
    }

    /**
     * Creates a manager whose sequence pool holds the distinct sequences of
     * the given profiles.
     *
     * @param profileList list of ProteinProfile objects; may be empty
     */
    public SequencePoolManager(List profileList)
    {
        makeSequencePoolFromProfiles(profileList);
    }

    /**
     * @return the sequence database read from file (same object as
     *         {@link #getGenomeSequenceDB()})
     */
    public HashSequenceDB getSequenceDB()
    {
        return genomeSequenceDB;
    }

    /**
     * Scores every sequence currently in the cached pool and returns the
     * lowest score.
     *
     * @param pwm the matrix used to score each sequence
     * @return the smallest score, or Double.MAX_VALUE when the pool is empty
     */
    public double getMinScore(PWM pwm)
    {
        double minScore = Double.MAX_VALUE;
        for (int i = 0; i < sequencePoolList.size(); i++)
        {
            double score = pwm.score((String) sequencePoolList.get(i));
            if (score < minScore)
            {
                minScore = score;
            }
        }
        return minScore;
    }

    /**
     * Returns a new list holding the cached pool ordered by PWM score. The
     * cached pool itself is not reordered.
     *
     * @param sort {@link #ASC} for lowest score first; any other value
     *             (including {@link #NONE} and {@link #DESC}) gives highest
     *             score first
     * @param pwm  the matrix used to score each sequence
     * @return a new list of the pooled sequence strings in score order
     */
    public List sortSequencePool(int sort, PWM pwm)
    {
        int poolSize = sequencePoolList.size();
        double[] pwmScores = new double[poolSize];
        for (int i = 0; i < poolSize; i++)
        {
            pwmScores[i] = pwm.score((String) sequencePoolList.get(i));
        }

        // Indices of the pool ordered from lowest to highest score.
        int[] lowToHigh = Utils.sort(pwmScores);
        int last = lowToHigh.length - 1;

        List sortedSequenceList = new ArrayList(poolSize);
        for (int rank = 0; rank < lowToHigh.length; rank++)
        {
            // Walk the ascending index array backwards for descending order.
            int poolIndex = (sort == ASC) ? lowToHigh[rank] : lowToHigh[last - rank];
            sortedSequenceList.add(sequencePoolList.get(poolIndex));
        }
        return sortedSequenceList;
    }

    /**
     * Adds the distinct sequence strings of every profile to the pool,
     * preserving first-seen order.
     *
     * @param profileList list of ProteinProfile objects; null or empty leaves
     *                    the pool unchanged
     */
    private void makeSequencePoolFromProfiles(List profileList)
    {
        if (profileList == null || profileList.isEmpty())
        {
            System.out.println("\tNo profiles given; sequence pool is empty");
            return;
        }
        ProteinProfile firstProfile = (ProteinProfile) profileList.get(0);
        String method = PDZSVMUtils.methodLongToShortForm(firstProfile.getExperimentalMethod());
        String organism = PDZSVMUtils.organismLongToShortForm(firstProfile.getOrganism());
        System.out.println("\tGetting sequences from profile: " + method + "\t" + organism);

        // Hash-based membership test instead of a linear List.contains per sequence.
        Set seen = new HashSet(sequencePoolList);
        for (int i = 0; i < profileList.size(); i++)
        {
            ProteinProfile profile = (ProteinProfile) profileList.get(i);
            Collection seqCollection = profile.getSequenceMap();
            for (Iterator it = seqCollection.iterator(); it.hasNext();)
            {
                String seqString = ((Sequence) it.next()).seqString();
                if (seen.add(seqString))
                {
                    sequencePoolList.add(seqString);
                }
            }
        }
    }

    /**
     * Tests whether any reference sequence is at least as similar to
     * {@code seq} as {@code numResLike / seq.length()}, using
     * PDZSVMUtils.identity as the similarity measure.
     *
     * @param refSeqList list of reference sequence strings
     * @param seq        the sequence to compare; an empty string never matches
     * @param numResLike number of residues used to derive the cutoff
     * @return true if at least one reference meets or exceeds the cutoff
     */
    public static boolean isLike(List refSeqList, String seq, int numResLike)
    {
        int length = seq.length();
        if (length == 0)
        {
            // Avoid a division by zero producing an infinite/NaN cutoff.
            return false;
        }
        double simCutoff = (double) numResLike / (double) length;
        for (int i = 0; i < refSeqList.size(); i++)
        {
            double sim = PDZSVMUtils.identity((String) refSeqList.get(i), seq);
            if (sim >= simCutoff)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether any reference sequence has a similarity to {@code seq}
     * equal to {@code numResLike / seq.length()}, using PDZSVMUtils.identity
     * as the similarity measure. Equality is checked within a small tolerance
     * so that rounding in the identity computation cannot hide a match.
     *
     * @param refSeqList list of reference sequence strings
     * @param seq        the sequence to compare; an empty string never matches
     * @param numResLike number of residues used to derive the target value
     * @return true if at least one reference hits the target similarity
     */
    public static boolean isExactly(List refSeqList, String seq, int numResLike)
    {
        int length = seq.length();
        if (length == 0)
        {
            // Avoid a division by zero producing an infinite/NaN target.
            return false;
        }
        double simCutoff = (double) numResLike / (double) length;
        for (int i = 0; i < refSeqList.size(); i++)
        {
            double sim = PDZSVMUtils.identity((String) refSeqList.get(i), seq);
            if (Math.abs(sim - simCutoff) < IDENTITY_EPSILON)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the sequence pool with full-length sequences; see
     *         {@link #getSequencePool(int)}
     */
    public List getSequencePool()
    {
        return getSequencePool(0);
    }

    /**
     * Returns the pool of sequence strings, building it from the sequence
     * database on first use. Sequences containing 'Z' or '*' are left out,
     * and the freshly built pool is shuffled.
     *
     * The pool is cached: once it is non-empty, later calls return the same
     * list and {@code numPos} is ignored.
     *
     * @param numPos when greater than zero, only the last {@code numPos}
     *               characters of each sequence are kept and sequences
     *               shorter than that are skipped; otherwise the whole
     *               sequence is kept
     * @return the cached (internal, mutable) list of sequence strings
     */
    public List getSequencePool(int numPos)
    {
        if (!sequencePoolList.isEmpty())
        {
            return sequencePoolList;
        }

        SequenceIterator it = genomeSequenceDB.sequenceIterator();
        while (it.hasNext())
        {
            try
            {
                String seqString = it.nextSequence().seqString();
                if (numPos > 0)
                {
                    if (seqString.length() < numPos)
                    {
                        // Previously surfaced as a caught StringIndexOutOfBoundsException.
                        System.out.println("\tNot including sequence shorter than " + numPos + " residues: " + seqString);
                        continue;
                    }
                    seqString = seqString.substring(seqString.length() - numPos);
                }
                if (seqString.indexOf('Z') < 0 && seqString.indexOf('*') < 0)
                {
                    sequencePoolList.add(seqString);
                }
                else
                {
                    System.out.println("\tNot including sequence containing Z or *: " + seqString);
                }
            }
            catch (Exception e)
            {
                // Best effort: report the problem and carry on with the next sequence.
                System.out.println("Exception: " + e);
            }
        }
        System.out.println("\tRead " + sequencePoolList.size() + " sequences");
        Collections.shuffle(sequencePoolList);
        return sequencePoolList;
    }

    /**
     * @return the sequence database read from file (same object as
     *         {@link #getSequenceDB()})
     */
    public HashSequenceDB getGenomeSequenceDB()
    {
        return genomeSequenceDB;
    }

    /**
     * Selects the proteome file for the organism and loads it into the
     * sequence database. An unrecognised organism leaves the database empty.
     *
     * @param organism one of the organism keys defined in Constants
     */
    private void makeGenomeDB(String organism)
    {
        System.out.println("\tMaking sequence pool from "+ organism + " genome...");
        String genomeFile = null;
        if (organism.equals(Constants.HUMAN) || organism.equals(Constants.HUMAN_MUTANT))
        {
            genomeFile = HUMAN_GENOME_FILE;
        }
        else if (organism.equals(Constants.HUMAN_REDUCED))
        {
            genomeFile = HUMAN_RED_GENOME_FILE;
        }
        else if (organism.equals(Constants.MOUSE))
        {
            genomeFile = MOUSE_GENOME_FILE;
        }
        else if (organism.equals(Constants.MOUSE_REDUCED))
        {
            genomeFile = MOUSE_RED_GENOME_FILE;
        }
        else if (organism.equals(Constants.WORM))
        {
            genomeFile = WORM_GENOME_FILE;
        }
        else if (organism.equals(Constants.FLY))
        {
            genomeFile = FLY_GENOME_FILE;
        }

        if (genomeFile == null)
        {
            System.out.println("\tNo genome file known for organism: " + organism);
            return;
        }
        loadSequenceDB(genomeFile);
    }

    /**
     * Selects the peptide file for the organism/method pair and loads it into
     * the sequence database. An unrecognised pair leaves the database empty.
     *
     * @param organism one of the organism keys defined in Constants
     * @param method   one of the experimental method keys defined in Constants
     */
    private void makeExperimentDB(String organism, String method)
    {
        System.out.println("\tMaking sequence pool from "+ organism + "," + method + " ...");

        String peptideFile = null;
        if (method.equals(Constants.PROTEIN_MICROARRAY))
        {
            if (organism.equals(Constants.MOUSE))
            {
                peptideFile = STIFFLER_PEPTIDES_FILE;
            }
        }
        else if (method.equals(Constants.PHAGE_DISPLAY))
        {
            if (organism.equals(Constants.HUMAN))
            {
                peptideFile = SIDHU_HUMAN_PEPTIDES_FILE;
            }
            else if (organism.equals(Constants.WORM))
            {
                peptideFile = SIDHU_WORM_PEPTIDES_FILE;
            }
        }
        else if (method.equals(Constants.HUMLIB))
        {
            peptideFile = HUMLIB_FILE;
        }
        else if (method.equals(Constants.TAP_PSD95))
        {
            peptideFile = TAP_PSD95_MOUSE_PEPTIDES_FILE;
        }

        if (peptideFile == null)
        {
            System.out.println("\tNo peptide file known for: " + organism + "," + method);
            return;
        }
        loadSequenceDB(peptideFile);
    }

    /**
     * Reads the given file into the sequence database. A failure is reported
     * on standard output and leaves the previous database in place.
     *
     * @param sequenceFile path of the file to read
     */
    private void loadSequenceDB(String sequenceFile)
    {
        try
        {
            genomeSequenceDB = PDZSVMUtils.readAlignmentToDB(sequenceFile, numPos);
        }
        catch (Exception e)
        {
            System.out.println("Exception: " + e);
        }
    }

    /**
     * Tells whether the sequence contains at least one of the given characters.
     */
    private static boolean containsAny(String seq, String chars)
    {
        for (int i = 0; i < chars.length(); i++)
        {
            // >= 0 so that a hit at the very first position is also detected.
            if (seq.indexOf(chars.charAt(i)) >= 0)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the human proteome, builds the pool from the last five residues
     * of each sequence, and prints the pool size followed by every pooled
     * sequence that contains one of the letters B, J, O, U, X or Z.
     */
    public static void main(String[] args)
    {
        SequencePoolManager manager = new SequencePoolManager(Constants.HUMAN);
        List seqList = manager.getSequencePool(5);
        System.out.println(seqList.size());
        for (int i = 0; i < seqList.size(); i++)
        {
            String seq = (String) seqList.get(i);
            if (containsAny(seq, "BJOUXZ"))
            {
                System.out.println(seq);
            }
        }
    }
}