#!/usr/bin/env python

from __future__ import print_function

from collections import OrderedDict
import argparse
import os
import atexit
from subprocess import call

import pandas as pd

from epic.version import __version__

parser = argparse.ArgumentParser( description="""Use external control files (for
    example ChIP-files from another species) to find bins with non-specific
    binding. A blacklist is then computed for regions according to a Poisson
    model).

(Visit github.com/endrebak/epic for examples and help.)

    """,
    prog=os.path.basename(__file__))

parser.add_argument(
    '--infiles',
    '-i',
    required=True,
    type=str,
    nargs='+',
    help='''ChIP files to count reads in (bed/bedpe format).''')


parser.add_argument(
    '--number-cores',
    '-cpu',
    required=False,
    default=1,
    type=int,
    help=
    '''Number of cpus to use. Can use at most one per chromosome. Default: 1.''')


parser.add_argument(
    '--keep-duplicates',
    '-k',
    required=False,
    default=False,
    action='store_true',
    help=
    '''Keep reads mapping to the same position on the same strand within a library. Default is to remove all but the first duplicate.''')


parser.add_argument(
    '--window-size',
    '-w',
    required=False,
    default=200,
    type=int,
    help=
    '''Size of the windows to scan the genome. WINDOW_SIZE is the smallest possible island. Default 200.''')

parser.add_argument(
    '--fragment-size',
    '-fs',
    required=False,
    default=150,
    type=int,
    help=
    '''(Single end reads only) Size of the sequenced fragment. The center of the the fragment will be taken as half the fragment size. Default 150.''')


parser.add_argument('--genome',
                    '-gn',
                    required=False,
                    default="hg19",
                    type=str,
                    help='''Which genome to analyze. Default: hg19. If --chromsizes flag is given, --genome is not required.''')

parser.add_argument(
    '--chromsizes',
    '-cs',
    required=False,
    type=str,
    help=
    '''Set the chromosome lengths yourself in a file with two columns: chromosome names and sizes. Useful to analyze custom genomes, assemblies or simulated data. Only chromosomes included in the file will be analyzed.''')


parser.add_argument(
    '--bonferroni',
    '-b',
    required=False,
    default=0.05,
    type=str,
    help=
    '''The bonferroni-value to consider a bin having too many reads in it (Default: 0.05).''')

parser.add_argument(
    '--effective-genome-fraction',
    '-egf',
    required=False,
    type=float,
    help=
    '''Use a different effective genome fraction than the one included in epic. The default value depends on the genome and readlength, but is a number between 0 and 1.''')


parser.add_argument(
    '--outfile',
    '-o',
    required=True,
    type=str,
    help=
    '''File to write gzipped count matrix to.''')


if __name__ == "__main__":

    args = parser.parse_args()

    import logging
    from sys import argv

    from epic.config import logging_settings
    from epic.run.run_epic import multiple_files_count_reads_in_windows, _merge_files
    from epic.config.genomes import (create_genome_size_dict,
                                     create_genome_size_dict_custom_genome)
    from epic.matrixes.matrixes import put_dfs_in_chromosome_dict

    from epic.utils.find_readlength import (find_readlength,
                                            get_closest_readlength)

    from epic.config.genomes import (get_effective_genome_length,
                                     create_genome_size_dict, create_genome_size_dict_custom_genome)

    from epic.blacklist.compute_poisson import compute_poisson

    most_paired_end = sum([f.endswith(".bedpe") for f in args.infiles]) / float(len(args.infiles)) > 0.5

    # in case the user entered no effective genome size
    if not args.effective_genome_fraction and not most_paired_end:
        estimated_readlength = find_readlength(args)
        closest_readlength = get_closest_readlength(estimated_readlength)
        args.effective_genome_fraction = get_effective_genome_length(
            args.genome, closest_readlength)
    # for paired end the effective genome size is max
    elif not args.effective_genome_fraction and most_paired_end:
        logging.info("Using paired end so setting readlength to 100.")
        args.effective_genome_fraction = get_effective_genome_length(args.genome,
                                                                 100)

    if not args.chromsizes:
        args.chromosome_sizes = create_genome_size_dict(args.genome)
    else:
        args.chromosome_sizes = create_genome_size_dict_custom_genome(args.chromsizes)
        total_genome_length = sum(args.chromosome_sizes.values())
        args.effective_genome_fraction = total_genome_length * args.effective_genome_fraction

    windows = multiple_files_count_reads_in_windows(args.infiles, args)
    windows_merged = _merge_files(windows.values(), args.number_cores)

    d = put_dfs_in_chromosome_dict(windows_merged)

    df = pd.concat(d.values()).fillna(0)
    df = df.set_index("Chromosome Bin".split())

    logging.info("Computing poisson scores and writing result to " + args.outfile)
    compute_poisson(df, args).to_csv(args.outfile, sep="\t", index=False, header=False)
