package org.baderlab.pdzsvmstruct.analysis;

import org.baderlab.pdzsvmstruct.data.manager.SequencePoolManager;
import org.baderlab.pdzsvmstruct.data.manager.DataFileManager;
import org.baderlab.pdzsvmstruct.data.DataLoader;
import org.baderlab.pdzsvmstruct.utils.BindingSiteUtils;
import org.baderlab.pdzsvmstruct.utils.Constants;
import org.baderlab.pdzsvmstruct.utils.PDZSVMUtils;
import org.baderlab.pdzsvmstruct.predictor.svm.GlobalSVMPredictor;
import org.baderlab.pdzsvmstruct.evaluation.Prediction;
import org.baderlab.brain.ProteinProfile;
import org.biojava.bio.seq.db.HashSequenceDB;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;

import java.util.*;
import java.io.*;
import java.text.DecimalFormat;

import libsvm.svm_parameter;

/**
 * Copyright (c) 2011 University of Toronto
 * Code written by: Shirley Hui
 * Authors: Shirley Hui, Gary Bader
 *
 * This file is part of PDZSVMStruct.
 *
 * PDZSVM is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PDZSVM is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  The software and
 * documentation provided hereunder is on an "as is" basis, and the
 * University of Toronto has no obligations to provide maintenance,
 * support, updates, enhancements or modifications.  In no event shall
 * the University of Toronto be liable to any party for direct, indirect,
 * special, incidental or consequential damages, including lost profits,
 * arising out of the use of this software and its documentation, even if
 * the University of Toronto has been advised of the possibility of such
 * damage. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with PDZSVMStruct.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Command-line driver that trains a structure-encoded {@link GlobalSVMPredictor}
 * and uses it to score every sequence in an organism's sequence pool against a
 * list of domains, writing the positively predicted peptides of each domain to
 * a text file.
 *
 * Usage: {@code ProteomeScan <domain-name-file> <organism>}
 */
public class ProteomeScan {

    /** Maximum number of sequences read from the sequence pool per prediction batch. */
    private static final int BATCH_SIZE = 2000;

    /** Predictor created by {@link #scanStruct(List)}; null until a scan has been started. */
    private GlobalSVMPredictor g;

    /** Organism identifier handed to the sequence pool, binding-site and profile helpers. */
    private final String organism;

    /**
     * Creates a scanner for one organism.
     *
     * @param organism organism identifier, passed unchanged to
     *                 {@link SequencePoolManager} and {@link BindingSiteUtils}
     */
    public ProteomeScan(String organism)
    {
        this.organism = organism;
    }

    /**
     * Trains the predictor on the profiles supplied by {@link DataLoader} and
     * then scans the organism's sequence pool once per domain name.
     *
     * @param domainNameList list of {@code String} domain names; the text after
     *                       the last '-' of each name must parse as an integer
     *                       domain number
     * @throws NumberFormatException if a domain name does not end in an integer
     */
    public void scanStruct(List domainNameList)
    {
        DataLoader dl = new DataLoader();
        dl.loadMousePDBTrain();
        dl.loadSidhuHumanPDBTrain(Constants.SIDHU_HUMAN_G_PDB, Constants.PHAGE_DISPLAY);

        List posTrainProfileList = dl.getPosTrainProfileList();
        List negTrainProfileList = dl.getNegTrainProfileList();
        g = trainSVMStruct(posTrainProfileList, negTrainProfileList);

        SequencePoolManager s = new SequencePoolManager(organism);

        HashSequenceDB genSeqDB = s.getSequenceDB();
        int totalSeqs = genSeqDB.ids().size();
        System.out.println("\tNumber of genomic sequences: " + totalSeqs);
        System.out.println("\tNumber of domains to scan: " + domainNameList.size());

        for (int i = 0; i < domainNameList.size(); i++)
        {
            scanDomain((String) domainNameList.get(i), genSeqDB, totalSeqs);
        }
    }

    /**
     * Scans all sequences of {@code genSeqDB} for one domain, in batches of at
     * most {@link #BATCH_SIZE} sequences, and writes the accumulated predictions
     * with {@link #printPredictions}.
     */
    private void scanDomain(String domainName, HashSequenceDB genSeqDB, int totalSeqs)
    {
        System.out.println("\t=============================================================");
        System.out.println("\tRunning scan for: " + domainName);

        // The domain number is whatever follows the last '-' in the name.
        int ix = domainName.lastIndexOf("-");
        int domainNum = Integer.parseInt(domainName.substring(ix + 1, domainName.length()));

        BindingSiteUtils bs = new BindingSiteUtils(organism);
        String domainSeq = bs.getDomainSequence(domainName);
        System.out.println("\tDomain Name: " + domainName);
        System.out.println("\tDomain Num: " + domainNum);
        System.out.println("\tOrganism: " + organism);
        System.out.println("\tDomain Seq: " + domainSeq);

        List prediction3List = new ArrayList();
        SequenceIterator it = genSeqDB.sequenceIterator();

        int numSeqs = 0;            // sequences successfully placed in a batch so far
        int numAttempts = 0;        // read attempts so far, successful or not
        int numPredStructHits = 0;  // running total of positive predictions

        // Bounding the loop by read attempts (not by successfully added
        // sequences) guarantees termination even if some sequences fail to load.
        while (it.hasNext() && numAttempts < totalSeqs)
        {
            HashSequenceDB testSeqDB = new HashSequenceDB();
            numAttempts += fillBatch(it, testSeqDB, Math.min(BATCH_SIZE, totalSeqs - numAttempts));

            int numSubSeqs = testSeqDB.ids().size();
            numSeqs = numSeqs + numSubSeqs;
            System.out.println("\tNum sequences: " + numSubSeqs);
            System.out.println("\tRunning num sequences: "+numSeqs+" out of "+totalSeqs);

            if (numSubSeqs == 0)
            {
                // Every read in this batch failed; there is nothing to predict on.
                continue;
            }

            ProteinProfile testProfile = PDZSVMUtils.makeProfile(
                    domainName,
                    domainNum,
                    domainSeq,
                    PDZSVMUtils.organismShortToLongForm(organism),
                    testSeqDB);
            testProfile.setExperimentalMethod(Constants.PHAGE_DISPLAY);

            List testProfileList = new ArrayList();
            testProfileList.add(testProfile);

            List batchPredictionList = g.predict(testProfileList, new ArrayList());
            prediction3List.addAll(batchPredictionList);

            // Only the new batch is counted; the printed value is still the
            // cumulative number of hits for this domain.
            numPredStructHits += countHits(batchPredictionList);
            System.out.println("\tNumber struct predicted hits: " + numPredStructHits);
        }

        if (numSeqs < totalSeqs)
        {
            System.out.println("\tWarning: only " + numSeqs + " of " + totalSeqs + " sequences could be scanned");
        }

        printPredictions(g.getPredictorName(), prediction3List, domainName, domainNum, domainSeq, organism);
    }

    /**
     * Moves sequences from {@code it} into {@code batch} until the iterator is
     * exhausted or {@code maxAttempts} reads have been attempted. A sequence
     * that cannot be read or added is reported and skipped.
     *
     * @return the number of read attempts made, including failed ones
     */
    private static int fillBatch(SequenceIterator it, HashSequenceDB batch, int maxAttempts)
    {
        int attempts = 0;
        while (attempts < maxAttempts && it.hasNext())
        {
            attempts++;
            try
            {
                Sequence seqi = it.nextSequence();
                batch.addSequence(seqi);
            }
            catch (Exception e)
            {
                System.out.println("Exception: " + e);
            }
        }
        return attempts;
    }

    /** Counts the entries of {@code predictions} whose predicted label is 1.0. */
    private static int countHits(List predictions)
    {
        int hits = 0;
        for (int i = 0; i < predictions.size(); i++)
        {
            Prediction pred = (Prediction) predictions.get(i);
            if (pred.getPrediction() == 1.0)
            {
                hits++;
            }
        }
        return hits;
    }

    /**
     * Writes the positive predictions (label 1.0) of one domain, sorted by
     * decreasing decision value, one {@code <decValue>\t<peptide>} line each, to
     * {@code OUTPUT_ROOT_DIR/ScanTest/Predictions/STRUCT/<domainName>.predictions.txt}.
     * Predictions that share a decision value are all written, in their
     * original relative order. Failures are reported on standard output and
     * not rethrown.
     *
     * NOTE(review): the file name is adjusted based on the instance's organism;
     * the {@code domainOrganism} argument is not consulted.
     *
     * @param predictorName  not used by this method
     * @param predictionList list of {@link Prediction} objects
     * @param domainName     domain name used to build the output file name; for
     *                       the mouse organism, "-M" is appended to LRRC7-1,
     *                       PDZK1-1 and SHANK3-1
     * @param domainNum      not used by this method
     * @param domainSeq      not used by this method
     * @param domainOrganism not used by this method
     */
    public void printPredictions(String predictorName, List predictionList, String domainName, int domainNum, String domainSeq, String domainOrganism)
    {
        String dir = "STRUCT";

        // Keep score and peptide together so that equal scores cannot
        // overwrite one another (as they would when used as map keys).
        List<ScoredPeptide> hits = new ArrayList<ScoredPeptide>();
        for (int i = 0; i < predictionList.size(); i++)
        {
            Prediction pred = (Prediction) predictionList.get(i);
            if (pred.getPrediction() == 1.0)
            {
                hits.add(new ScoredPeptide(pred.getDecValue(), pred.peptideSeq));
            }
        }
        // Stable sort, highest decision value first.
        Collections.sort(hits);

        BufferedWriter bw = null;
        try
        {
            DecimalFormat format = new DecimalFormat("#.###");
            StringBuilder outString = new StringBuilder();
            for (int i = 0; i < hits.size(); i++)
            {
                ScoredPeptide hit = hits.get(i);
                outString.append(format.format(hit.decValue)).append("\t").append(hit.peptideSeq).append("\n");
            }
            if (organism.equals(Constants.MOUSE))
            {
                if (domainName.equals("LRRC7-1") ||
                    domainName.equals("PDZK1-1") ||
                    domainName.equals("SHANK3-1"))
                {
                    domainName = domainName + "-M";
                }
            }
            bw = new BufferedWriter(new FileWriter(new File(DataFileManager.OUTPUT_ROOT_DIR+"/ScanTest/Predictions/"+dir+"/"+domainName+".predictions.txt")));
            bw.write(outString.toString());
        }
        catch(Exception e)
        {
            System.out.println("Exception: " + e);
        }
        finally
        {
            close(bw);
        }
        System.out.println("\tNumber of predictions: "+hits.size());
    }

    /**
     * Builds and trains the structure-encoded SVM predictor.
     *
     * @param posTrainProfileList positive training profiles
     * @param negTrainProfileList negative training profiles
     * @return the trained predictor
     */
    private GlobalSVMPredictor trainSVMStruct(List posTrainProfileList, List negTrainProfileList)
    {
        System.out.println("\tTraining SVM Struct predictor...");

        svm_parameter svmParams = new svm_parameter();
        svmParams.setDefaults();
        svmParams.data_encoding = svm_parameter.STRUCT;
        double C = 4; double g = 3; // 83 mouse + human + svm negs

        System.out.println("\tStructure encoding...");
        System.out.println("\t[g,C] = ["+g+","+C+"])");

        // C and g are exponents: cost = e^C, gamma = e^(-ln 2 - g) = e^(-g) / 2.
        svmParams.C = Math.exp(C);
        svmParams.gamma = Math.exp(-Math.log(2)-g);

        GlobalSVMPredictor p = new GlobalSVMPredictor(
                posTrainProfileList,
                negTrainProfileList,
                svmParams);
        p.train();
        return p;
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of a whitespace-separated file of
     *             domain names, {@code args[1]} is the organism identifier
     */
    public static void main(String[]args)
    {
        if (args == null || args.length < 2)
        {
            System.out.println("Usage: ProteomeScan <domain-name-file> <organism>");
            return;
        }
        String filename = args[0];
        String organism = args[1];

        ProteomeScan p = new ProteomeScan(organism);
        List domainNameList = readDomainNames(filename);
        p.scanStruct(domainNameList);
    }

    /**
     * Reads domain names from a file. Each line is split on whitespace; empty
     * tokens and tokens beginning with '#' are skipped. If reading fails, the
     * error is reported and the names collected so far are returned.
     */
    private static List readDomainNames(String filename)
    {
        List domainNameList = new ArrayList();
        BufferedReader br = null;
        try
        {
            br = new BufferedReader(new FileReader(new File(filename)));
            String line;
            while ((line = br.readLine()) != null)
            {
                String[] splitLine = line.split("\\s++");
                for (int i = 0; i < splitLine.length; i++)
                {
                    String token = splitLine[i];
                    if (!token.equals("") && !token.startsWith("#"))
                    {
                        domainNameList.add(token);
                    }
                }
            }
        }
        catch(Exception e)
        {
            System.out.println("Exception: " + e);
        }
        finally
        {
            close(br);
        }
        return domainNameList;
    }

    /** Closes {@code c} if it is non-null, reporting (not rethrowing) any failure. */
    private static void close(Closeable c)
    {
        if (c == null)
        {
            return;
        }
        try
        {
            c.close();
        }
        catch (IOException e)
        {
            System.out.println("Exception: " + e);
        }
    }

    /** A positively predicted peptide with its decision value; orders by decreasing decision value. */
    private static final class ScoredPeptide implements Comparable<ScoredPeptide>
    {
        final double decValue;
        final String peptideSeq;

        ScoredPeptide(double decValue, String peptideSeq)
        {
            this.decValue = decValue;
            this.peptideSeq = peptideSeq;
        }

        public int compareTo(ScoredPeptide other)
        {
            // Arguments are swapped to obtain descending order.
            return Double.compare(other.decValue, this.decValue);
        }
    }
}
