#!/usr/bin/env python
# -*- coding: utf-8

import sys

import anvio
import anvio.fastalib as u
import anvio.utils as utils
import anvio.terminal as terminal
import anvio.filesnpaths as filesnpaths

from anvio.errors import ConfigError, FilesNPathsError

__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "a.murat.eren@gmail.com"


run = terminal.Run()
progress = terminal.Progress()


def reformat_FASTA(args):
    if args.keep_ids and args.exclude_ids:
        raise ConfigError("You can't use`--exclude-ids and --keep-ids together :/")

    filesnpaths.is_output_file_writable(args.output_file)
    filesnpaths.is_file_exists(args.contigs_fasta)

    report_file = open(args.report_file, 'w') if args.report_file and args.simplify_names else None
    prefix = args.prefix if args.prefix else None

    if prefix:
        utils.is_this_name_OK_for_database('contig name prefix', prefix)

    if args.exclude_ids:
        filesnpaths.is_file_exists(args.exclude_ids)
        exclude_ids = set([l.split('\t')[0].strip() for l in open(args.exclude_ids, 'rU').readlines()])
        run.info('Input IDs to remove', '%d found' % len(exclude_ids))
    else:
        exclude_ids = set([])

    if args.keep_ids:
        filesnpaths.is_file_exists(args.keep_ids)
        keep_ids = set([l.split('\t')[0].strip() for l in open(args.keep_ids, 'rU').readlines()])
        run.info('Input IDs to consider', '%d found' % len(keep_ids))
    else:
        keep_ids = set([])


    output = u.FastaOutput(args.output_file)
    fasta = u.SequenceSource(args.contigs_fasta)

    run.info('Input', args.contigs_fasta)
    run.info('Output', args.output_file)
    run.info('Minimum length', args.min_len)

    total_num_nucleotides = 0
    total_num_contigs = 0
    total_num_nucleotides_removed = 0
    total_num_contigs_removed = 0

    while next(fasta):

        l = len(fasta.seq)

        total_num_nucleotides += l
        total_num_contigs += 1

        if keep_ids and fasta.id.split()[0] not in keep_ids:
            total_num_nucleotides_removed += l
            total_num_contigs_removed += 1
            continue

        if exclude_ids and fasta.id.split()[0] in exclude_ids:
            total_num_nucleotides_removed += l
            total_num_contigs_removed += 1
            continue

        if len(fasta.seq) < args.min_len:
            total_num_nucleotides_removed += l
            total_num_contigs_removed += 1
            continue

        if args.simplify_names:
            if prefix:
                defline = '%s_%012d' % (prefix, fasta.pos)
            else:
                defline = 'c_%012d' % fasta.pos

            output.write_id(defline)
            output.write_seq(fasta.seq, split = False)

            if report_file:
                report_file.write('%s\t%s\n' % (defline, fasta.id))
        else:
            output.store(fasta, split = False)

    if report_file:
        report_file.close()

    fasta.close()
    output.close()

    run.info('Total num contigs', total_num_contigs)
    run.info('Total num nucleotides', total_num_nucleotides)
    run.info('Contigs removed', '%d (%.2f%% of all)' % (total_num_contigs_removed, total_num_contigs_removed * 100.0 / total_num_contigs), mc='green')
    run.info('Nucleotides removed', '%d (%.2f%% of all)' % (total_num_nucleotides_removed, total_num_nucleotides_removed * 100.0 / total_num_nucleotides), mc='green')
    run.info('Deflines simplified', args.simplify_names, mc='green')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="Reformat FASTA file (remove contigs based on length, or based on a given list of\
                                                  deflines, and/or generate an output with simpler names)")

    parser.add_argument('contigs_fasta')
    parser.add_argument('-l', '--min-len', type=int, default=0, metavar='MIN_LENGTH',
                        help="Minimum length of contigs to keep (contigs shorter than this value\
                              will not be included in the output file). The default is %(default)d,\
                              so nothing will be removed if you do not declare a minimum size.")
    parser.add_argument('-i', '--exclude-ids', required=False, metavar='TXT FILE',
                        help="IDs to remove from the FASTA file. You cannot provide both\
                              --keep-ids and --exclude-ids.")
    parser.add_argument('-I', '--keep-ids', required=False, metavar='TXT FILE',
                        help="If provided, all IDs not in this file will be excluded from the\
                              reformatted FASTA file. Any additional filters (such as --min-len)\
                              will still be applied to the IDs in this file. You cannot provide both\
                              --exclude-ids and --keep-ids.")
    parser.add_argument('-o', '--output-file', required=True, metavar='FASTA FILE',
                        help="Output file path.")
    parser.add_argument('--simplify-names', default=False, action="store_true",
                        help="Edit deflines to make sure they contigs have simple names.")
    parser.add_argument('--prefix', default=None, metavar="PREFIX",
                        help="Use this parameter if you would like to add a prefix to your contig\
                              names while simplifying them. The prefix must be a single word (you\
                              can use underscor character, but nothing more!).")
    parser.add_argument('-r', '--report-file', required=False, metavar='REPORT FILE',
                        help="Report file path. When you run this program with `--simplify-names`\
                              flag, all changes to deflines will be reported in this file\
                              in case you need to go back to this information later. It is not\
                              mandatory to declare one, but it is a very good idea to have it.")

    args = anvio.get_args(parser)

    try:
        reformat_FASTA(args)
    except ConfigError as e:
        print(e)
        sys.exit(-1)
    except FilesNPathsError as e:
        print(e)
        sys.exit(-2)
