#!/srv/sw/python/2.7.4/bin/python
###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################

__author__ = "Donovan Parks"
__copyright__ = "Copyright 2015"
__credits__ = ["Donovan Parks"]
__license__ = "GPL3"
__maintainer__ = "Donovan Parks"
__email__ = "donovan.parks@gmail.com"
__status__ = "Development"

import os
import sys
import ntpath
import logging
import argparse

from genometreetk.main import OptionsParser
from biolib.misc.custom_help_formatter import CustomHelpFormatter
from biolib.common import make_sure_path_exists
from biolib.taxonomy import Taxonomy


def version():
    """Read program version from file."""
    import genometreetk
    version_file = open(os.path.join(genometreetk.__path__[0], 'VERSION'))
    return version_file.read().strip()


def print_help():
    """Help menu."""

    print ''
    print '                ...::: GenomeTreeTk v' + version() + ' :::...'''
    print '''\

    Infer lineage-specific genome tree:
      trusted     -> Determine trusted genomes to use for marker gene inference
      dereplicate -> Dereplicate genomes based on taxonomy
      markers     -> Determine phylogenetically informative marker genes
      infer       -> Infer genome tree

    Infer 16S tree for GTDB genomes:
      ssu_tree -> Infer a 16S tree spanning GTDB genomes

    Assess stability of tree:
      bootstrap  -> Bootstrap multiple sequence alignment
      jk_markers -> Jackknife marker genes
      jk_taxa    -> Jackknife ingroup taxa
      combine    -> Combine all support values into a single tree

    Reroot tree:
      midpoint -> Reroot tree at midpoint
      outgroup -> Reroot tree with outgroup

    Select representative genomes:
      refseq_reps -> Determine representative genomes in RefSeq
      reps        -> Determine additional representatives genomes
      aai_cluster -> Cluster genomes based on AAI of concatenated alignment

    Taxonomy verification and manipulation:
      validate   -> Check taxonomy file is formatted as expected
      fill_ranks -> Ensure all taxonomy strings contain all 7 canonical ranks
      binomial   -> Ensure species are designated using binomial nomenclature
      propagate  -> Propagate labels from representatives to all genomes in a cluster
      strip      -> Remove taxonomic labels from a tree (useful for re-decorating)
      pull       -> Create taxonomy file from a decorated tree
      diff       -> Determine differences between two taxonomy files

    Other:
      pd -> Calculate phylogenetic diversity (branch length) of named groups


  Use: genometreetk <command> -h for command specific help.

  Feature requests or bug reports can be sent to Donovan Parks (donovan.parks@gmail.com)
    or posted on GitHub (https://github.com/dparks1134/GenomeTreeTk).
    '''


def logger_setup(output_dir, silent):
    """Set logging for application.

    Parameters
    ----------
    output_dir : str
        Output directory for log file.
    silent : boolean
        Flag indicating if output to stdout should be suppressed.
    """

    # setup general properties of logger
    logger = logging.getLogger('')
    logger.setLevel(logging.DEBUG)
    log_format = logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s",
                                   datefmt="%Y-%m-%d %H:%M:%S")

    # setup logging to console
    if not silent:
        stream_logger = logging.StreamHandler(sys.stdout)
        stream_logger.setFormatter(log_format)
        stream_logger.setLevel(logging.DEBUG)
        logger.addHandler(stream_logger)

    if output_dir:
        make_sure_path_exists(output_dir)
        file_logger = logging.FileHandler(os.path.join(output_dir, 'genometreetk.log'), 'a')
        file_logger.setFormatter(log_format)
        logger.addHandler(file_logger)

    logger.info('GenomeTreeTk v%s' % version())
    logger.info(ntpath.basename(sys.argv[0]) + ' ' + ' '.join(sys.argv[1:]))


if __name__ == '__main__':

    # initialize the options parser
    parser = argparse.ArgumentParser(add_help=False)
    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    # determine trusted genomes
    trusted_parser = subparsers.add_parser('trusted',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Determine trusted genomes to use for marker gene inference.')
    trusted_parser.add_argument('metadata_file', help="metadata file from GTDB with CheckM estimates for all genomes in RefSeq")
    trusted_parser.add_argument('trusted_genomes_file', help="output file listing trusted genomes")
    trusted_parser.add_argument('--trusted_comp', help='minimum completeness to trust genome for marker set inference [0, 100]', type=float, default=95)
    trusted_parser.add_argument('--trusted_cont', help='maximum contamination to trust genome for marker set inference [0, 100]', type=float, default=5)
    trusted_parser.add_argument('--max_contigs', help='maximum number of contigs to trust genome for marker set inference', type=int, default=200)
    trusted_parser.add_argument('--min_N50', help='minimum N50 of contigs to trust genome for marker set inference', type=int, default=20000)
    trusted_parser.add_argument('--refseq_rep', help='limit selection to RefSeq representative and reference genomes', action='store_true')
    trusted_parser.add_argument('--silent', help="suppress output", action='store_true')

    # dereplicate genomes
    dereplicate_parser = subparsers.add_parser('dereplicate',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Dereplicate genomes based on taxonomy.')
    dereplicate_parser.add_argument('derep_genome_file', help="output file listing dereplicated genomes")
    dereplicate_parser.add_argument('--max_species', help='maximum number of genomes of the same species to retain', type=int, default=2)
    dereplicate_parser.add_argument('--trusted_genomes_file', help='limit selected genomes to those marked as trusted', default=None)
    dereplicate_parser.add_argument('--silent', help="suppress output", action='store_true')

    # determine marker genes
    markers_parser = subparsers.add_parser('markers',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Determine phylogenetically informative marker genes.')
    markers_parser.add_argument('ingroup_file', help="unique ids of genomes within the ingroup")
    markers_parser.add_argument('output_dir', help="output directory")
    markers_parser.add_argument('--redundancy', help='threshold for declaring HMMs redundant', type=float, default=0.50)
    markers_parser.add_argument('--ubiquity', help='ubiquity threshold for defining marker genes', type=float, default=0.95)
    markers_parser.add_argument('--single_copy', help='single-copy threshold for defining marker genes', type=float, default=0.95)

    markers_parser.add_argument('--min_support', help='minimum jackknife support of split during LGT filtering', type=float, default=0.8)
    markers_parser.add_argument('--min_per_taxa', help='minimum percentage of taxa to consider a split during LGT filtering', type=float, default=0.01)
    markers_parser.add_argument('--perc_markers', help='percentage of markers to keep during marker jackknifing', type=float, default=0.7)

    markers_parser.add_argument('--restict_marker_list', help='restrict marker set to genes within this list', default=None)
    markers_parser.add_argument('-c', '--cpus', help='number of cpus to use', type=int, default=16)
    markers_parser.add_argument('--silent', help="suppress output", action='store_true')

    # infer tree
    infer_parser = subparsers.add_parser('infer',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Infer genome tree.')
    infer_parser.add_argument('genome_id_file', help="unique ids of genomes to include in tree")
    infer_parser.add_argument('marker_id_file', help="unique ids of marker genes to use for inference")
    infer_parser.add_argument('output_dir', help="output directory")
    infer_parser.add_argument('-m', '--model', choices=['wag', 'jtt'], help="model of evolution to use", default='wag')
    infer_parser.add_argument('-c', '--cpus', help='number of cpus', type=int, default=1)
    infer_parser.add_argument('--silent', help="suppress output", action='store_true')

    # infer 16S tree across GTDB genomes
    ssu_tree_parser = subparsers.add_parser('ssu_tree',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Infer 16S tree spanning GTDBgenomes.')
    ssu_tree_parser.add_argument('gtdb_metadata_file', help="metadata file from GTDB")
    ssu_tree_parser.add_argument('gtdb_dir_file', help="file indicating path to each GTDB genome")
    ssu_tree_parser.add_argument('output_dir', help="output directory")
    ssu_tree_parser.add_argument('--ncbi_rep_only', help="include only NCBI representative and reference genomes", action='store_true')
    ssu_tree_parser.add_argument('--user_genomes', help="include User genomes (default is NCBI only)", action='store_true')
    ssu_tree_parser.add_argument('--genome_list', help="explict list of genomes to use (ignores --ncbi_rep_only and --user_genomes)")
    ssu_tree_parser.add_argument('--min_ssu_length', help='minimum length of 16S sequence to be include in tree', type=int, default=1200)
    ssu_tree_parser.add_argument('--min_ssu_contig', help='minimum length of contig containing 16S sequence to be include in tree', type=int, default=5000)
    ssu_tree_parser.add_argument('--min_quality', help='minimum quality (completeness - contamination) for a genome to be included in tree [0, 100]', type=float, default=80)
    ssu_tree_parser.add_argument('--max_contigs', help='maximum contigs comprising a genome for it to be included in tree', type=int, default=200)
    ssu_tree_parser.add_argument('--min_N50', help='minimum N50 of contigs for a genome to be include in tree', type=int, default=20000)
    ssu_tree_parser.add_argument('--disable_tax_filter', help="filter sequences with incongruent taxonomy", action='store_true')
    ssu_tree_parser.add_argument('-c', '--cpus', help='number of cpus', type=int, default=1)
    ssu_tree_parser.add_argument('--silent', help="suppress output", action='store_true')

    # assess robustness of genome tree using classic bootstrapping
    bootstrap_parser = subparsers.add_parser('bootstrap',
                                        formatter_class=CustomHelpFormatter,
                                        description='Bootstrap multiple sequence alignment.')
    bootstrap_parser.add_argument('input_tree', help="tree inferred from original data")
    bootstrap_parser.add_argument('msa_file', help="file containing multiple sequence alignment")
    bootstrap_parser.add_argument('output_dir', help="output directory")
    bootstrap_parser.add_argument('-b', '--base_type', choices=['nt', 'prot'], help="indicates if bases are nucleotides or amino acids", default='prot')
    bootstrap_parser.add_argument('-m', '--model', choices=['wag', 'jtt'], help="model of evolution to use", default='wag')
    bootstrap_parser.add_argument('-r', '--num_replicates', help="number of bootstrap replicates to perform", type=int, default=100)
    bootstrap_parser.add_argument('-f', '--fraction', help="fraction of alignment to subsample", type=float, default=1.0)
    bootstrap_parser.add_argument('-c', '--cpus', help='number of cpus', type=int, default=1)
    bootstrap_parser.add_argument('--silent', help="suppress output", action='store_true')

    # assess robustness of genome tree by jackknifing marker genes
    jk_markers_parser = subparsers.add_parser('jk_markers',
                                        formatter_class=CustomHelpFormatter,
                                        description='Jackknife marker genes.')
    jk_markers_parser.add_argument('input_tree', help="tree inferred from original data")
    jk_markers_parser.add_argument('msa_file', help="file containing multiple sequence alignment")
    jk_markers_parser.add_argument('gene_length_file', help="file indicating length of each gene in concatenated alignment")
    jk_markers_parser.add_argument('output_dir', help="output directory)")
    jk_markers_parser.add_argument('-m', '--model', choices=['wag', 'jtt'], help="model of evolution to use", default='wag')
    jk_markers_parser.add_argument('-p', '--perc_markers', help="percentage of markers to keep", type=float, default=0.5)
    jk_markers_parser.add_argument('-r', '--num_replicates', help="number of jackknife replicates to perform", type=int, default=100)
    jk_markers_parser.add_argument('-c', '--cpus', help='number of cpus', type=int, default=1)
    jk_markers_parser.add_argument('--silent', help="suppress output", action='store_true')

    # assess robustness of genome tree by jackknifing ingroup taxa
    jk_taxa_parser = subparsers.add_parser('jk_taxa',
                                        formatter_class=CustomHelpFormatter,
                                        description='Jackknife ingroup taxa.')
    jk_taxa_parser.add_argument('input_tree', help="tree inferred from original data")
    jk_taxa_parser.add_argument('msa_file', help="file containing multiple sequence alignment")
    jk_taxa_parser.add_argument('output_dir', help="output directory")
    jk_taxa_parser.add_argument('--outgroup_ids', help="file indicating outgroup taxa", default=None)
    jk_taxa_parser.add_argument('-m', '--model', choices=['wag', 'jtt'], help="model of evolution to use", default='wag')
    jk_taxa_parser.add_argument('-p', '--perc_taxa', help="percentage of taxa to keep", type=float, default=0.5)
    jk_taxa_parser.add_argument('-r', '--num_replicates', help="number of jackknife replicates to perform", type=int, default=100)
    jk_taxa_parser.add_argument('-c', '--cpus', help='number of cpus', type=int, default=1)
    jk_taxa_parser.add_argument('--silent', help="suppress output", action='store_true')

    # assess robustness of genome tree by jackknifing ingroup taxa
    combine_parser = subparsers.add_parser('combine',
                                        formatter_class=CustomHelpFormatter,
                                        description='Combine all support values into a single tree.')
    combine_parser.add_argument('bootstrap_tree', help="tree with bootstrap support values")
    combine_parser.add_argument('jk_marker_tree', help="tree with jackknife marker support values")
    combine_parser.add_argument('jk_taxa_tree', help="tree with jackknife taxa support values")
    combine_parser.add_argument('output_tree', help="output tree")
    combine_parser.add_argument('-s', '--support_type', choices=['average', 'minimum'], help="type of support values to compute", default='average')
    combine_parser.add_argument('--silent', help="suppress output", action='store_true')

    # reroot tree at midpoint
    midpoint_parser = subparsers.add_parser('midpoint',
                                        formatter_class=CustomHelpFormatter,
                                        description='Reroot tree at midpoint.')
    midpoint_parser.add_argument('input_tree', help="tree to reroot")
    midpoint_parser.add_argument('output_tree', help="output tree")
    midpoint_parser.add_argument('--silent', help="suppress output", action='store_true')

    # reroot tree with outgroup
    outgroup_parser = subparsers.add_parser('outgroup',
                                        formatter_class=CustomHelpFormatter,
                                        description='Reroot tree with outgroup.')
    outgroup_parser.add_argument('input_tree', help="tree to reroot")
    outgroup_parser.add_argument('taxonomy_file', help="file indicating taxonomy string for genomes")
    outgroup_parser.add_argument('outgroup_taxon', help="taxon to use as outgroup (e.g., d__Archaea)")
    outgroup_parser.add_argument('output_tree', help="output tree")
    outgroup_parser.add_argument('--silent', help="suppress output", action='store_true')

    # representative genomes
    refseq_rep_parser = subparsers.add_parser('refseq_reps',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Determine representative genomes in RefSeq.')
    refseq_rep_parser.add_argument('metadata_file', help="metadata file from GTDB with CheckM estimates for all genomes in RefSeq")
    refseq_rep_parser.add_argument('rep_genome_file', help="output file listing RefSeq representative genomes")
    refseq_rep_parser.add_argument('--max_species', help='maximum number of genomes of the same species to retain', type=int, default=2)
    refseq_rep_parser.add_argument('--min_rep_comp', help='minimum completeness for a genome to be a representative [0, 100]', type=float, default=90)
    refseq_rep_parser.add_argument('--max_rep_cont', help='maximum contamination for a genome to be a representative [0, 100]', type=float, default=10)
    refseq_rep_parser.add_argument('--min_quality', help='minimum genome quality (comp - 4*cont) to be a representative [0, 100]', type=float, default=60)
    refseq_rep_parser.add_argument('--max_contigs', help='maximum number of contigs for a genome to be a representative', type=int, default=300)
    refseq_rep_parser.add_argument('--min_N50', help='minimum N50 of contigs for a genome to be a representative', type=int, default=20000)
    refseq_rep_parser.add_argument('--max_ambiguous', help='maximum number of ambiguous bases for a genome to be a representative', type=int, default=10000)
    refseq_rep_parser.add_argument('--strict_filtering', help='apply filtering to all genomes; default is to apply lenient filtering to genomes where the chromosome and plasmids are reported as complete', action='store_true')
    refseq_rep_parser.add_argument('--silent', help="suppress output", action='store_true')

    # representative genomes
    rep_parser = subparsers.add_parser('reps',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Determine additional representatives genomes.')
    rep_parser.add_argument('ar_msa_file', help="file containing canonical archaeal multiple sequence alignment")
    rep_parser.add_argument('bac_msa_file', help="file containing canonical bacterial multiple sequence alignment")
    rep_parser.add_argument('refseq_representatives', help="file listing RefSeq representative genomes")
    rep_parser.add_argument('metadata_file', help="metadata file from GTDB with CheckM estimates for all genomes in RefSeq")
    rep_parser.add_argument('rep_genome_file', help="output file listing representative genomes")
    rep_parser.add_argument('--trusted_user_file', help='file specifying trusted User genomes that should be treated as being in GenBank')
    rep_parser.add_argument('--threshold', help='AAI threshold for identifying new representatives', type=float, default=0.995)
    rep_parser.add_argument('--min_rep_comp', help='minimum completeness for a genome to be a representative [0, 100]', type=float, default=90)
    rep_parser.add_argument('--max_rep_cont', help='maximum contamination for a genome to be a representative [0, 100]', type=float, default=10)
    rep_parser.add_argument('--min_quality', help='minimum genome quality (comp - 4*cont) to be a representative [0, 100]', type=float, default=60)
    rep_parser.add_argument('--max_contigs', help='maximum number of contigs for a genome to be a representative', type=int, default=300)
    rep_parser.add_argument('--min_N50', help='minimum N50 of contigs for a genome to be a representative', type=int, default=20000)
    rep_parser.add_argument('--max_ambiguous', help='maximum number of ambiguous bases for a genome to be a representative', type=int, default=10000)
    rep_parser.add_argument('--silent', help="suppress output", action='store_true')

    # determine trusted genomes
    aai_cluster_parser = subparsers.add_parser('aai_cluster',
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                        description='Cluster genomes based on AAI of concatenated alignment.')
    aai_cluster_parser.add_argument('ar_msa_file', help="file containing canonical archaeal multiple sequence alignment")
    aai_cluster_parser.add_argument('bac_msa_file', help="file containing canonical bacterial multiple sequence alignment")
    aai_cluster_parser.add_argument('representatives', help="list of representative genomes")
    aai_cluster_parser.add_argument('metadata_file', help="metadata file for all genomes in the GTDB")
    aai_cluster_parser.add_argument('cluster_file', help='output file indicating genome clusters')
    aai_cluster_parser.add_argument('--trusted_user_file', help='file specifying trusted User genomes that should be treated as being in GenBank')
    aai_cluster_parser.add_argument('--threshold', help='AAI threshold for assigning a genome to a representative', type=float, default=0.995)
    aai_cluster_parser.add_argument('-c', '--cpus', help='number of cpus', type=int, default=1)
    aai_cluster_parser.add_argument('--silent', help="suppress output", action='store_true')

    # validate taxonomy file
    validate_parser = subparsers.add_parser('validate',
                                        formatter_class=CustomHelpFormatter,
                                        description='Check taxonomy file is formatted as expected.')
    validate_parser.add_argument('input_taxonomy', help='input taxonomy file')
    validate_parser.add_argument('--silent', help="suppress output", action='store_true')

    # fill all 7 taxonomic ranks
    fill_ranks_parser = subparsers.add_parser('fill_ranks',
                                        formatter_class=CustomHelpFormatter,
                                        description='Ensure taxonomy strings contain all 7 canonical ranks.')
    fill_ranks_parser.add_argument('input_taxonomy', help='input taxonomy file')
    fill_ranks_parser.add_argument('output_taxonomy', help='output taxonomy file')
    fill_ranks_parser.add_argument('--silent', help="suppress output", action='store_true')

    # ensure species names use binomial nomenclature
    binomial_parser = subparsers.add_parser('binomial',
                                        formatter_class=CustomHelpFormatter,
                                        description='Ensure species are designated using binomial nomenclature.')
    binomial_parser.add_argument('input_taxonomy', help='input taxonomy file')
    binomial_parser.add_argument('output_taxonomy', help='output taxonomy file')
    binomial_parser.add_argument('--silent', help="suppress output", action='store_true')

    # ensure species names use binomial nomenclature
    propagate_parser = subparsers.add_parser('propagate',
                                        formatter_class=CustomHelpFormatter,
                                        description='Propagate labels to all genomes in a cluster.')
    propagate_parser.add_argument('input_taxonomy', help='input taxonomy file')
    propagate_parser.add_argument('metadata_file', help="metadata file for all genomes in the GTDB")
    propagate_parser.add_argument('output_taxonomy', help='output taxonomy file')
    propagate_parser.add_argument('--silent', help="suppress output", action='store_true')

    # strip taxonomic labels from tree
    strip_parser = subparsers.add_parser('strip',
                                        formatter_class=CustomHelpFormatter,
                                        description='Remove taxonomic labels from a tree.')
    strip_parser.add_argument('input_tree', help="tree to reroot")
    strip_parser.add_argument('output_tree', help="output tree")
    strip_parser.add_argument('--silent', help="suppress output", action='store_true')
    
    # pull taxonomy from a tree
    pull_parser = subparsers.add_parser('pull',
                                        formatter_class=CustomHelpFormatter,
                                        description='Create taxonomy file from a decorated tree.')
    pull_parser.add_argument('input_tree', help='decorated tree')
    pull_parser.add_argument('output_taxonomy', help='output taxonomy file')
    pull_parser.add_argument('--silent', help="suppress output", action='store_true')

    # difference between two taxonomy files
    diff_parser = subparsers.add_parser('diff',
                                        formatter_class=CustomHelpFormatter,
                                        description='Determine differences between two taxonomy files.')
    diff_parser.add_argument('input_taxonomy1', help='first taxonomy file')
    diff_parser.add_argument('input_taxonomy2', help='second taxonomy file')
    diff_parser.add_argument('rank', help='taxonomic rank to compare', choices=Taxonomy.rank_labels, default='genus')
    diff_parser.add_argument('--report_missing_taxa', help="report taxa not present in both files", action='store_true')
    diff_parser.add_argument('--report_missing_ranks', help="report taxa with empty ranks", action='store_true')
    diff_parser.add_argument('--silent', help="suppress output", action='store_true')

    # difference between two taxonomy files
    pd_parser = subparsers.add_parser('pd',
                                        formatter_class=CustomHelpFormatter,
                                        description='Calculate phylogenetic diversity of named groups.')
    pd_parser.add_argument('decorated_tree', help='tree with labeled internal nodes')
    pd_parser.add_argument('--silent', help="suppress output", action='store_true')

    # get and check options
    args = None
    if(len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv == '--help'):
        print_help()
        sys.exit(0)
    else:
        args = parser.parse_args()

    try:
        logger_setup(args.output_dir, args.silent)
    except:
        logger_setup(None, args.silent)

    # do what we came here to do
    try:
        parser = OptionsParser()
        if(False):
            # import pstats
            # p = pstats.Stats('prof')
            # p.sort_stats('cumulative').print_stats(10)
            # p.sort_stats('time').print_stats(10)
            import cProfile
            cProfile.run('parser.parse_options(args)', 'prof')
        elif False:
            import pdb
            pdb.run(parser.parse_options(args))
        else:
            parser.parse_options(args)
    except SystemExit:
        print "\n  Controlled exit resulting from an unrecoverable error or warning."
    except:
        print "\nUnexpected error:", sys.exc_info()[0]
        raise
