#!/usr/bin/env python
# -*- coding: utf-8
"""A DIAMOND and MCL-based pangenome workflow"""

import sys

import anvio
import anvio.panops as panops
import anvio.terminal as terminal
import anvio.constants as constants

from anvio.errors import ConfigError, FilesNPathsError, HDF5Error


__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2016, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


run = terminal.Run()
progress = terminal.Progress()


if __name__ == '__main__':
    run.warning('If you publish results from this workflow, please do not forget to cite DIAMOND\
                 (doi:10.1038/nmeth.3176), unless you use it with --use-ncbi-blast flag, and MCL\
                 (http://micans.org/mcl/ and doi:10.1007/978-1-61779-361-5_15)', lc = 'yellow')

    import argparse

    parser = argparse.ArgumentParser(description="A DIAMOND and MCL-based anvi'o pangenome workflow. You provide genomes from anywhere\
                                                  (whether they are external genomes, or anvi'o genome bins in collections), and it\
                                                  gives you back a pangenome analysis.")

    groupA = parser.add_argument_group('GENOMES', "The very fancy genomes storage file. This file is generated by the program `anvi-genomes-storage`.\
                                                   Please see the online tutorial on pangenomic workflow if you don't know how to generate one.")
    groupA.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage', {'required': True}))
    groupA.add_argument(*anvio.A('genomes-names'), **anvio.K('genomes-names'))

    groupC = parser.add_argument_group("PARAMETERS", "Important stuff Tom never pays attention (but you should).")
    groupC.add_argument('--skip-alignments', default = False, action = 'store_true', help = "By default, anvi'o attempts to align amino acid\
                                sequences in each protein cluster using multiple sequnce alignment via muscle. You can use this flag to skip\
                                that step and be upset later.")
    groupC.add_argument('--exclude-partial-gene-calls', default = False, action = 'store_true', help = "By default, anvi'o includes all partial\
                                gene calls from the analysis, which, in some cases, may inflate the number of protein clusters identified and\
                                introduce extra heterogeneity within those protein clusters. Using this flag, you can request anvi'o to exclude\
                                partial gene calls from the analysis (whether a gene call is partial or not is an information that comes directly\
                                from the gene caller used to identify genes during the generation of the contigs database).")
    groupC.add_argument('--use-ncbi-blast', default = False, action = 'store_true', help = "This program uses DIAMOND by default, however,\
                                if you like, you can use good ol' blastp from NCBI instead.")
    groupC.add_argument('--maxbit', type = float, default = 0.5, metavar = "MAXBIT", help = "The maxbit heuristic provides a mean to set a\
                                to eliminate weak matches between two protein sequences. We learned it from ITEP (Benedict MN et al,\
                                doi:10.1186/1471-2164-15-8), which is a comprehensive pan genome analysis workflow, and decided to use it in\
                                the anvi'o pan genomic workflow as well. If you have two protein sequences 'A' and 'B', the maxbit is defined\
                                as BITSCORE(A, B) / MIN(BITSCORE(A, A), BITSCORE(B, B)). The maxbit score between two sequences goes to 1 if\
                                they are very similar over their entire length, and goes to 0 if they match over a very short stretch compared\
                                to their entire length (regardless of how similar they are at the aligned region), or their sequence identity\
                                is low. The default is %(default)g.")
    groupC.add_argument('--mcl-inflation', type = float, default = 2.0, metavar = "INFLATION", help = "MCL inflation parameter, that defines\
                                the sensitivity of the algorithm during the identification of the protein clusters. More information on this\
                                parameter and it's effect on cluster granularity is here: (http://micans.org/mcl/man/mclfaq.html#faq7.2).\
                                The default is %(default)g.")
    groupC.add_argument('--min-occurrence', type = int, default = 1, metavar = 'NUM_OCCURRENCE', help = "Do you not want singletons?\ You don't?\
                                Well, this parameter will help you get rid of them (along with doubletons, if you want). Anvi'o will remove\
                                protein clusters that occur less than the number you set using this parameter from the analysis. The default\
                                is %(default)d, which means everything will be kept. If you want to remove singletons, set it to 2, if you want to\
                                remove doubletons as well, set it to 3, and so on.")
    groupC.add_argument('--min-percent-identity', type = float, default = 0.0, metavar = "PERCENT", help = "Minimum percent identity\
                                between the two protein sequences for them to have an edge for MCL analysis. This value will be used\
                                to filter hits from Diamond search results. Because percent identity is not a predictor of a good match (since\
                                it does not communicate many other important factors such as the alignment length between the two sequences and\
                                its proportion to the entire length of those involved), we suggest you rely on 'maxbit' parameter. But you know\
                                what? Maybe you shouldn't listen to anyone, and experiment on your own! The default is %(default)g percent.")
    groupC.add_argument(*anvio.A('sensitive'), **anvio.K('sensitive'))

    groupD = parser.add_argument_group("OTHERS", "Sweet parameters of convenience.")
    groupD.add_argument(*anvio.A('project-name'), **anvio.K('project-name'))
    groupD.add_argument(*anvio.A('description'), **anvio.K('description'))
    groupD.add_argument(*anvio.A('output-dir'), **anvio.K('output-dir'))
    groupD.add_argument(*anvio.A('overwrite-output-destinations'), **anvio.K('overwrite-output-destinations'))
    groupD.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads'))
    groupD.add_argument(*anvio.A('debug'), **anvio.K('debug'))

    groupE = parser.add_argument_group("ORGANIZING PCs", "These are stuff that will change the clustering dendrogram of your PCs.")
    groupE.add_argument(*anvio.A('distance'), **anvio.K('distance', {'default': None, 'help':
                      'The distance metric for the clustering of PCs. If you do not use this flag,\
                       the default distance metric will be used for each clustering configuration\
                       which is "%s".' % constants.distance_metric_default}))
    groupE.add_argument(*anvio.A('linkage'), **anvio.K('linkage', {'default': None, 'help':
                      'The same story with the `--distance`, except, the system default for this one\
                       is %s.' % constants.linkage_method_default}))
    args = parser.parse_args()

    try:
        pan = panops.Pangenome(args, run, progress)
        pan.process()
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-2)
    except HDF5Error as e:
        print(e)
        sys.exit(-2)
