#!/usr/bin/env python

import argparse
import os
import sys
import subprocess
from apricotlib.folder_setup import setup_analysis_folders
from apricotlib.uniprot_taxid import select_taxids
from apricotlib.create_query_files import *
from apricotlib.uniprot_proteome_table import *
from apricotlib.create_keyword_files import *
from apricotlib.query_annotation_data import CollectUniprotInformation
from apricotlib.keyword_based_domain_selection import KeywordBasedDomainSelection
from apricotlib.compile_selected_domains import merge_domain_data
from apricotlib.filter_keyword_selected_result import FilterPredictedDomains
from apricotlib.annotate_selected_data import SelectedProteinTable
from apricotlib.annotate_selected_data_without_uniprot import SelectedProteinTableWithoutUniprot
from apricotlib.classify_selected_data import ProteinClassifier
from apricotlib.calculate_annotation_score import ComputeCompositionDistance
from apricotlib.raptorx_secstr import RaptorxSecstrAnalysis
from apricotlib.literature_secstr import LiteratureSecstrAnalysis
from apricotlib.psortb_localization import PsortbSubcellularLocalization
from apricotlib.list_similar_pdbstr import PdbHomologyAnalysis
from apricotlib.create_analysis_summary import CreateAnalysisSummary
from apricotlib.apricot_visualization import VizApricotAnalysis
from apricotlib.convert_csv_format import *

# Package metadata reported by the command line tool (see ``--version``).
__author__ = "Malvika Sharan <malvikasharan@uni-wuerzburg.de>"
__copyright__ = "2016 by Malvika Sharan <malvikasharan@uni-wuerzburg.de>"
__license__ = "ISC license"
# Plain address only: the stray trailing '>' made this an invalid e-mail.
__email__ = "malvikasharan@uni-wuerzburg.de"
__version__ = "1.2.3"

def main():
    """Build the APRICOT command line interface and dispatch the chosen subcommand.

    Every subcommand registers its handler through ``set_defaults(func=...)``;
    after parsing, the selected handler is called with the parsed namespace.
    ``--version`` prints the version, and calling the program without a
    subcommand prints the help text.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--version", "-v", default=False, action="store_true",
        help="show version")
    subparsers = parser.add_subparsers(help="APRICOT commands - Refer documentation for detail")

    # ------------------------------------------------------------------
    # default: all required subcommands in one run; the options below
    # mirror the options of the individual subcommands.
    # ------------------------------------------------------------------
    minimum_default_pipeline = subparsers.add_parser(
        "default", help="Analysis using all the required subcommands at their default parameters")
    minimum_default_pipeline.set_defaults(func=minimum_subcommands)
    minimum_default_pipeline.add_argument("--analysis_path", "-ap",
        default='APRICOT_analysis',
        help='Creates APRICOT_analysis folder for anlysis unless other name/path is provided')
    minimum_default_pipeline.add_argument("--uids", "-i", "-ui",
        help="Comma separated UniProt IDs")
    minimum_default_pipeline.add_argument("--taxid", "-tx",
        help="Select taxonomy id for query species")
    minimum_default_pipeline.add_argument("--geneids", "-gi",
        help="Comma separated query genes")
    minimum_default_pipeline.add_argument("--proteome", "-P",
        default=False, action="store_true",
        help="Analyze entire proteome")
    minimum_default_pipeline.add_argument("--kw_domain", "-kw",
        help="Comma separated keywords for domain selection")
    minimum_default_pipeline.add_argument("--cdd", '-C',
        default=False, action="store_true",
        help="Uses only CDD")
    minimum_default_pipeline.add_argument("--ipr", '-I',
        default=False, action="store_true",
        help="Uses only InterPro")
    minimum_default_pipeline.add_argument("--force", "-F",
        default=False, action="store_true",
        help="force flag for the current analysis, removes already existing predictions")
    minimum_default_pipeline.add_argument("--skip_select", '-skip_select',
        default=False, action="store_true",
        help="Skips running the subcommand 'select'")
    minimum_default_pipeline.add_argument("--classify", "-cl",
        help="Optional comma separated keyword for result classification")
    minimum_default_pipeline.add_argument("--db_root", "-dr",
        default='source_files',
        help="Uses to get absolute path of domain annoation files, keyword selected domains")
    minimum_default_pipeline.add_argument("--similarity", "-sim", default=24,
        help="Percent similarity of prediction with reference")
    minimum_default_pipeline.add_argument("--coverage", "-cov", default=30,
        help="Percent coverage of reference domain in prediction")
    minimum_default_pipeline.add_argument("--identity", "-iden",
        help="Percent identity of prediction with reference")
    minimum_default_pipeline.add_argument("--evalue", "-eval", default=1,
        help="Evalue of the domain prediction")
    minimum_default_pipeline.add_argument("--gap", "-gap", default=None,
        help="Percent gap in predicted domain")
    minimum_default_pipeline.add_argument("--bit", "-bit", default=None,
        help="Bit score in predicted domain")
    minimum_default_pipeline.add_argument("--needle_dir", "-nd",
        default="needle",
        help="path for the locally configured EMBOSS suite")
    # NOTE: the default is already True, so passing --html changes nothing.
    minimum_default_pipeline.add_argument("--html", "-HT",
        default=True, action="store_true")
    minimum_default_pipeline.add_argument("--xlsx", "-XL",
        default=False, action="store_true")
    # The query step opens args.fasta as a file, so an optional path is
    # accepted; the bare flag still yields True as it did before.
    minimum_default_pipeline.add_argument("--fasta", "-fa",
        nargs='?', const=True, default=False,
        help="Analyze fasta sequences")
    minimum_default_pipeline.add_argument("--query_path", "-qp",
        default='input/query_proteins/query_to_uids.txt',
        help="Get proteome table from UniProt")
    minimum_default_pipeline.add_argument("--proteome_path", "-pp",
        default='input/uniprot_reference_table/query_uids_reference.tab',
        help="Get proteome table from UniProt")
    minimum_default_pipeline.add_argument("--xml_path", "-xp",
        default='input/mapped_query_annotation/xml_path_mapped_query',
        help="Get proteome table from UniProt")
    minimum_default_pipeline.add_argument("--fasta_path", "-fp",
        default='APRICOT_analysis/input/mapped_query_annotation/fasta_path_mapped_query',
        help="Provide absolute path of fasta files for query proteins")
    minimum_default_pipeline.add_argument("--feature_table", "-ft",
        default='input/mapped_query_annotation/mapped_protein_xml_info_tables/query_feature_table.csv',
        help="Get proteome table from UniProt")
    minimum_default_pipeline.add_argument("--dom_kw", "-dk",
        default='source_files/domain_data/keywords_for_domain_selection.txt',
        help="Absolute path of keyword files")
    minimum_default_pipeline.add_argument("--cdd_db", "-cdb",
        default='source_files/reference_db_files/cdd/Cdd',
        help="Provide absolute path of CDD databases based on the chosen method")
    minimum_default_pipeline.add_argument("--ipr_db", "-idb",
        default='source_files/reference_db_files/interpro/interproscan',
        help="Provide absolute path of InterPro databases based on the chosen method")
    minimum_default_pipeline.add_argument("--predicted", "-predp",
        default='output/0_predicted_domains',
        help="Provide output path for domain prediction files")
    minimum_default_pipeline.add_argument("--domain_description_file", "-dd",
        default='source_files/domain_data/all_keyword_selected_domain_data.tab',
        help='Description table of the selected domains')
    minimum_default_pipeline.add_argument("--go_path",  "-gp",
        default='source_files/reference_db_files/go_mapping/',
        help='Go mapping data from fixed database reference files')
    minimum_default_pipeline.add_argument("--pred_path", "-pred",
        default='output/0_predicted_domains',
        help="Raw files of domain prediction")
    minimum_default_pipeline.add_argument("--up_table", "-ref",
        default='input/uniprot_reference_table',
        help="Uniprot proteome table from UniProt")
    minimum_default_pipeline.add_argument("--xml_info", "-feat",
        default='input/mapped_query_annotation/mapped_protein_xml_info_tables/query_feature_table.csv',
        help="Uniprot proteome table from UniProt")
    minimum_default_pipeline.add_argument("--compile_out", "-co",
        default='output/1_compiled_domain_information',
        help="Data with annotation after filtering")
    minimum_default_pipeline.add_argument("--selected", "-sel",
        default="output/2_selected_domain_information/combined_data/annotation_extended_for_selected.csv",
        help="Selected data file (from select) with annotations")
    minimum_default_pipeline.add_argument("--class_kw", "-ck",
        default='source_files/domain_data/keywords_for_result_classification.txt',
        help="Path for keyword files")
    minimum_default_pipeline.add_argument("--classified", "-c",
        default="output/2_selected_domain_information/classified_data",
        help="Classification of selected data based on provided keywords")
    minimum_default_pipeline.add_argument("--cdd_pred", '-cp',
        default='output/0_predicted_domains/cdd_analysis',
        help='Raw files obtained from CDD based domain prediction')
    minimum_default_pipeline.add_argument("--scored", "-sco",
        default="output/3_annotation_scoring",
        help="Output path for annotation scoring files")
    minimum_default_pipeline.add_argument("--query_map", '-q',
        default='input/query_proteins/query_to_uids.txt',
        help='query_to_uids.txt file created by APRICOT to save query mapping information')
    minimum_default_pipeline.add_argument("--domains", '-d',
        default='source_files/domain_data/all_keyword_selected_domain_data.tab',
        help='File containing all the keyword selected_domains of interest')
    minimum_default_pipeline.add_argument("--unfilter_path", '-uf',
        default='output/1_compiled_domain_information/unfiltered_data',
        help='Directory with the unfiltered domain data from output-1 (unfiltered_data)')
    minimum_default_pipeline.add_argument("--summarized", '-sum',
        default='output/5_analysis_summary/APRICOT_analysis_summary.txt',
        help="Provide output path")
    minimum_default_pipeline.add_argument("--inpath", '-ip',
        default='APRICOT_analysis/output',
        help="Choose folder from analysis to be converted")
    minimum_default_pipeline.add_argument("--formatted", "-form",
        default="output/format_output_data",
        help="Output path for files with different file formats")

    # ------------------------------------------------------------------
    # taxid: registered exactly once. A second add_parser("taxid") raises
    # "conflicting subparser" on Python 3.11+ and breaks the whole CLI.
    # ------------------------------------------------------------------
    select_taxonomy_id_parser = subparsers.add_parser(
        "taxid",
        help="Download taxonomy ids from UniProt for the user provided query species")
    select_taxonomy_id_parser.set_defaults(
        func=select_uniprot_taxid)
    select_taxonomy_id_parser.add_argument("db_path")
    select_taxonomy_id_parser.add_argument("--species", "-s",
        help="Species name (comma separated if more than one) for taxonomy id retreival")

    # ------------------------------------------------------------------
    # create
    # ------------------------------------------------------------------
    create_folders_parser = subparsers.add_parser(
        "create",
        help="Create analysis folders")
    create_folders_parser.set_defaults(func=create_folders)
    # nargs='?' makes the positional optional so that the default applies.
    create_folders_parser.add_argument("analysis_path",
        nargs='?', default='APRICOT_analysis',
        help='Creates APRICOT_analysis folder for anlysis unless other name/path is provided')

    # ------------------------------------------------------------------
    # query
    # ------------------------------------------------------------------
    query_parser = subparsers.add_parser(
        "query",
        help="Map user provided comma separated queries to UniProt ids")
    query_parser.set_defaults(func=query_set)
    query_parser.add_argument("--analysis_path", "-ap",
        default='APRICOT_analysis',
        help='Main analysis path')
    query_parser.add_argument("--uids", "-i",
        help="Comma separated UniProt IDs")
    query_parser.add_argument("--taxid", "-tx",
        help="Select taxonomy id for query species")
    query_parser.add_argument("--geneids", "-gi",
        help="Comma separated query genes")
    query_parser.add_argument("--proteome", "-P",
        default=False, action="store_true",
        help="Analyze entire proteome")
    # query_set opens args.fasta as a file, so an optional path is accepted;
    # the bare flag still yields True as it did before.
    query_parser.add_argument("--fasta", "-fa",
        nargs='?', const=True, default=False,
        help="Analyze fasta sequences")
    query_parser.add_argument("--query_path", "-qp",
        default='input/query_proteins/query_to_uids.txt',
        help="Get proteome table from UniProt")
    query_parser.add_argument("--proteome_path", "-pp",
        default='input/uniprot_reference_table/query_uids_reference.tab',
        help="Get proteome table from UniProt")
    query_parser.add_argument("--xml_path", "-xp",
        default='input/mapped_query_annotation/xml_path_mapped_query',
        help="Get proteome table from UniProt")
    query_parser.add_argument("--fasta_path", "-fp",
        default='APRICOT_analysis/input/mapped_query_annotation/fasta_path_mapped_query',
        help="Get proteome table from UniProt")
    query_parser.add_argument("--feature_table", "-ft",
        default='input/mapped_query_annotation/mapped_protein_xml_info_tables/query_feature_table.csv',
        help="Get proteome table from UniProt")

    # ------------------------------------------------------------------
    # keywords
    # ------------------------------------------------------------------
    keyword_input_parser = subparsers.add_parser(
        "keywords",
        help="Save user provided keywords for domain selection (required) and analysis classification (-cl)")
    keyword_input_parser.set_defaults(func=keyword_input)
    keyword_input_parser.add_argument("kw_domain",
        help="Comma separated keywords for domain selection")
    keyword_input_parser.add_argument("--classify", "-cl",
        help="Optional comma separated keyword for result classification")
    keyword_input_parser.add_argument("--db_root", "-dr",
        default='source_files',
        help="Path for keyword files")

    # ------------------------------------------------------------------
    # select
    # ------------------------------------------------------------------
    domain_data_parser = subparsers.add_parser(
        "select",
        help="Select functional domains of interest (specified by keywords) from CDD (-C) and InterPro (-I) by default")
    domain_data_parser.set_defaults(func=select_domain_data)
    domain_data_parser.add_argument("--cdd", '-C',
        default=False, action="store_true",
        help="Selects functional domains of interest from CDD")
    domain_data_parser.add_argument("--ipr", '-I',
        default=False, action="store_true",
        help="Selects functional domains of interest from InterPro")
    domain_data_parser.add_argument("--skip_select", '-skip_select',
        default=False, action="store_true",
        help="Skips running the subcommand 'select'")
    domain_data_parser.add_argument("--dom_kw", "-dk",
        default='source_files/domain_data/keywords_for_domain_selection.txt',
        help="Absolute path of keyword files")
    domain_data_parser.add_argument("--db_root", "-dr",
        default='source_files',
        help="Uses to get absolute path of domain annoation files, keyword selected domains")

    # ------------------------------------------------------------------
    # predict
    # ------------------------------------------------------------------
    domain_analysis_parser = subparsers.add_parser(
        "predict",
        help="Predict functional domains in the queries based on CDD (-C) and InterPro (-I) databases by default")
    domain_analysis_parser.set_defaults(func=run_domain_analysis)
    domain_analysis_parser.add_argument("--analysis_path", "-ap",
        default='APRICOT_analysis',
        help="Provide output path for the analysis result of the chosen method")
    # Leave both flags off for the default: prediction by both CDD and InterPro.
    domain_analysis_parser.add_argument("--cdd", "-C",
        default=False, action="store_true",
        help="domain prediction based on CDD only")
    domain_analysis_parser.add_argument("--ipr", "-I",
        default=False, action="store_true",
        help="domain prediction based on InterProScan only")
    domain_analysis_parser.add_argument("--force", "-F",
        default=False, action="store_true",
        help="force flag for the current analysis, removes already existing predictions")
    domain_analysis_parser.add_argument("--cdd_db", "-cdb",
        default='source_files/reference_db_files/cdd/Cdd',
        help="Provide absolute path of CDD databases based on the chosen method")
    domain_analysis_parser.add_argument("--ipr_db", "-idb",
        default='source_files/reference_db_files/interpro/interproscan',
        help="Provide absolute path of InterPro databases based on the chosen method")
    domain_analysis_parser.add_argument("--predicted", "-pred",
        default='output/0_predicted_domains',
        help="Provide output path for domain prediction files")
    domain_analysis_parser.add_argument("--fasta_path", "-fp",
        default='APRICOT_analysis/input/mapped_query_annotation/fasta_path_mapped_query',
        help="Provide absolute path of fasta files for query proteins")

    # ------------------------------------------------------------------
    # filter
    # ------------------------------------------------------------------
    filter_domain_table = subparsers.add_parser(
        "filter",
        help="Filter queries predicted with domains of interest (and optional parameter thresholds) and extend their annotations")
    filter_domain_table.set_defaults(func=filter_domain)
    filter_domain_table.add_argument("--analysis_path", "-ap",
        default='APRICOT_analysis', help="Provide analysis path")
    filter_domain_table.add_argument("--cdd", "-C",
        default=False, action="store_true",
        help="Filter of domain prediction based on CDD only")
    filter_domain_table.add_argument("--ipr", "-I",
        default=False, action="store_true",
        help="Filter of domain prediction based on InterProScan only")
    filter_domain_table.add_argument("--domain_description_file", "-dd",
        default='source_files/domain_data/all_keyword_selected_domain_data.tab',
        help='Description table of the selected domains')
    filter_domain_table.add_argument("--similarity", "-sim", default=30,
        help="Percent similarity of prediction with reference")
    filter_domain_table.add_argument("--coverage", "-cov", default=40,
        help="Percent coverage of reference domain in prediction")
    filter_domain_table.add_argument("--identity", "-iden",
        help="Percent identity of prediction with reference")
    filter_domain_table.add_argument("--evalue", "-eval", default=1,
        help="Evalue of the domain prediction")
    filter_domain_table.add_argument("--gap", "-gap", default=None,
        help="Percent gap in predicted domain")
    filter_domain_table.add_argument("--bit", "-bit", default=None,
        help="Bit score in predicted domain")
    filter_domain_table.add_argument("--go_path",  "-gp",
        default='source_files/reference_db_files/go_mapping/',
        help='Go mapping data from fixed database reference files')
    filter_domain_table.add_argument("--pred_path", "-predp",
        default='output/0_predicted_domains',
        help="Raw files of domain prediction")
    filter_domain_table.add_argument("--up_table", "-ref",
        default='input/uniprot_reference_table',
        help="Uniprot proteome table from UniProt")
    filter_domain_table.add_argument("--xml_info", "-feat",
        default='input/mapped_query_annotation/mapped_protein_xml_info_tables/query_feature_table.csv',
        help="Uniprot proteome table from UniProt")
    filter_domain_table.add_argument("--compile_out", "-co",
        default='output/1_compiled_domain_information',
        help="Data with annotation after filtering")
    filter_domain_table.add_argument("--selected", "-sel",
        default="output/2_selected_domain_information/combined_data/annotation_extended_for_selected.csv",
        help="output path for the selected data with annotations")

    # ------------------------------------------------------------------
    # classify
    # ------------------------------------------------------------------
    classify_selected_output = subparsers.add_parser(
        "classify",
        help="Optional classification of selected prediction in smaller groups by class keywords")
    classify_selected_output.set_defaults(func=classify_output)
    classify_selected_output.add_argument("--analysis_path", '-ap',
        default='APRICOT_analysis', help="Provide analysis path")
    classify_selected_output.add_argument("--selected", "-sel",
        default="output/2_selected_domain_information/combined_data/annotation_extended_for_selected.csv",
        help="Selected data file (from select) with annotations")
    classify_selected_output.add_argument("--class_kw", "-ck",
        default='source_files/domain_data/keywords_for_result_classification.txt',
        help="Path for keyword files")
    classify_selected_output.add_argument("--classify", "-cl",
        help="Optional comma separated keyword for result classification")
    classify_selected_output.add_argument("--classified", "-c",
        default="output/2_selected_domain_information/classified_data",
        help="Classification of selected data based on provided keywords")
    classify_selected_output.add_argument("--db_root", "-dr",
        default='source_files', help="Path for keyword files")

    # ------------------------------------------------------------------
    # annoscore
    # ------------------------------------------------------------------
    get_annotation_scoring = subparsers.add_parser(
        "annoscore",
        help="Score and rank predicted data by 'annotation scoring'")
    get_annotation_scoring.set_defaults(func=annotation_scoring)
    get_annotation_scoring.add_argument("--analysis_path", '-ap',
        default='APRICOT_analysis', help="Provide analysis path")
    get_annotation_scoring.add_argument("--selected", "-sel",
        default='output/2_selected_domain_information/combined_data/annotation_extended_for_selected.csv',
        help='Provided selected protein table')
    get_annotation_scoring.add_argument("--cdd_pred", '-cp',
        default='output/0_predicted_domains/cdd_analysis',
        help='Raw files obtained from CDD based domain prediction')
    get_annotation_scoring.add_argument("--scored", "-sco",
        default="output/3_annotation_scoring",
        help="Output path for annotation scoring files")
    get_annotation_scoring.add_argument("--needle_dir", "-nd",
        default="needle",
        help="path for the locally configured EMBOSS suite")

    # ------------------------------------------------------------------
    # summary
    # ------------------------------------------------------------------
    get_analysis_summary = subparsers.add_parser(
        "summary", help="Summary analysis output")
    get_analysis_summary.set_defaults(func=analysis_summary)
    get_analysis_summary.add_argument("--analysis_path", '-ap',
        default='APRICOT_analysis', help="Provide analysis path")
    get_analysis_summary.add_argument("--query_map", '-q',
        default='input/query_proteins/query_to_uids.txt',
        help='query_to_uids.txt file created by APRICOT to save query mapping information')
    get_analysis_summary.add_argument("--domains", '-d',
        default='source_files/domain_data/all_keyword_selected_domain_data.tab',
        help='File containing all the keyword selected_domains of interest')
    get_analysis_summary.add_argument("--unfilter_path", '-uf',
        default='output/1_compiled_domain_information/unfiltered_data',
        help='Directory with the unfiltered domain data from output-1 (unfiltered_data)')
    get_analysis_summary.add_argument("--summarized", '-sum',
        default='output/5_analysis_summary/APRICOT_analysis_summary.txt',
        help="Provide output path")

    # ------------------------------------------------------------------
    # addanno
    # ------------------------------------------------------------------
    get_additional_annotation = subparsers.add_parser(
        "addanno",
        help="Optional annotation of the selected protein by -PDB, -PSORTB, -RAPTORX or -REFSS (see addanno -h)")
    get_additional_annotation.set_defaults(func=additional_annotation)
    get_additional_annotation.add_argument("--force", "-F",
        default=False, action="store_true",
        help="force flag for the current analysis, removes already existing predictions")
    get_additional_annotation.add_argument("--pdb", "-PDB",
        default=False, action="store_true",
        help="Optional annotation of the selected protein by PDB structure homolog")
    get_additional_annotation.add_argument("--psortb", "-PSORTB",
        default=False, action="store_true",
        help="Optional annotation of the selected protein by localization using PsortB")
    get_additional_annotation.add_argument("--raptorx", "-RAPTORX",
        default=False, action="store_true",
        help="Optional annotation of the selected protein by secondary structure using RaptorX")
    get_additional_annotation.add_argument("--refss", "-REFSS",
        default=False, action="store_true",
        help="Optional annotation of the selected protein by secondary structure using literature reference")
    get_additional_annotation.add_argument("--analysis_path", '-ap',
        default='APRICOT_analysis', help="Provide analysis path")
    get_additional_annotation.add_argument("--fasta_path", "-fp",
        default='APRICOT_analysis/input/mapped_query_annotation/fasta_path_mapped_query',
        help="Provide absolute path of fasta files for query proteins")
    get_additional_annotation.add_argument("--selected", "-sel",
        default='output/2_selected_domain_information/combined_data/annotation_extended_for_selected.csv',
        help='Provided selected protein table')
    get_additional_annotation.add_argument("--add_out", "-ao",
        default="output/4_additional_annotations",
        help="Output path for additional annotation data")
    get_additional_annotation.add_argument("--pdb_path", "-pdb_path",
        default="source_files/reference_db_files/pdb/pdb_sequence/pdb_sequence.txt",
        help="Provide absolute path of APRICOT formatted pdb database ~pdb/pdb_sequence/pdb_sequence.txt")
    get_additional_annotation.add_argument("--psortb_path", '-psortb_path',
        default='psort', help="Provide absolute path of APRICOT installed psortb")
    get_additional_annotation.add_argument("--raptorx_path", '-raptorx_path',
        default='run_raptorx-ss8.pl',
        help="Provide absolute path of APRICOT installed raptorx till the perl script run_raptorx-ss8.pl")

    # ------------------------------------------------------------------
    # vis
    # ------------------------------------------------------------------
    visualize_analysis_result = subparsers.add_parser(
        "vis", help="Visualize analysis results (see vis -h) for detail")
    visualize_analysis_result.set_defaults(func=visualize_result)
    visualize_analysis_result.add_argument("--analysis_path", '-ap',
        default='APRICOT_analysis', help="Provide analysis path")
    visualize_analysis_result.add_argument("--ann_score", '-an',
        default='output/3_annotation_scoring/annotation_scoring_of_selected_data.csv',
        help="Provide annotation score file")
    visualize_analysis_result.add_argument("--add_anno", '-ad',
        default='output/4_additional_annotations',
        help="Provide path to additional annotation")
    visualize_analysis_result.add_argument("--selected", "-sel",
        default='output/2_selected_domain_information/combined_data/annotation_extended_for_selected.csv',
        help='Provided selected protein table')
    # NOTE: the default is already True, so passing --domain changes nothing.
    visualize_analysis_result.add_argument("--domain", '-D',
        default=True, action='store_true',
        help="Visualizes predicted domains on the query by highlighting")
    visualize_analysis_result.add_argument("--annoscore", '-A',
        default=False, action="store_true",
        help="Visualizes overview of prediction statistics")
    visualize_analysis_result.add_argument("--secstr", '-S',
        default=False, action="store_true",
        help="Visualizes secondary structures predicted by RaptorX")
    visualize_analysis_result.add_argument("--localiz", '-L',
        default=False, action="store_true",
        help="Visualizes subcellular localization predcited by PsortB")
    visualize_analysis_result.add_argument("--msa", '-M',
        default=False, action="store_true",
        help="Visualizes Multiple Sequence Alignments of homologous sequences from PDB")
    visualize_analysis_result.add_argument("--complete", '-C',
        default=False, action="store_true",
        help="Visualizes all the possible features")
    visualize_analysis_result.add_argument("--visualized", "-vi",
        default="output/visualization_files",
        help="Output path for visualization files")

    # ------------------------------------------------------------------
    # format
    # ------------------------------------------------------------------
    create_output_files = subparsers.add_parser(
        "format", help="Optional output file format as html or excel")
    create_output_files.set_defaults(func=output_format)
    create_output_files.add_argument("--analysis_path", '-ap',
        default='APRICOT_analysis', help="Provide analysis path")
    create_output_files.add_argument("--inpath", '-ip',
        default='APRICOT_analysis/output',
        help="Choose folder from analysis to be converted")
    create_output_files.add_argument("--html", "-HT",
        default=False, action="store_true")
    create_output_files.add_argument("--xlsx", "-XL",
        default=False, action="store_true")
    create_output_files.add_argument("--formatted", "-form",
        default="output/format_output_data",
        help="Output path for files with different file formats")

    args = parser.parse_args()
    if args.version:
        print("Apricot version " + __version__)
    elif hasattr(args, "func"):
        # A subcommand was selected: run its registered handler.
        args.func(args)
    else:
        parser.print_help()

def create_folders(args):
    """Create the analysis folder structure and report where it was created.

    ``args.analysis_path`` is handed to ``setup_analysis_folders`` and then
    echoed in a short banner on standard output.
    """
    target = args.analysis_path
    setup_analysis_folders(target)
    rule = "----------------------------------------------------------------\n"
    print("\n" + rule +
          "APRICOT analysis folders are created at %s" % target)
    print(rule)

def select_uniprot_taxid(args):
    """Select the taxonomy ids that belong to the requested species.

    Reads the species list ``all_taxids/speclist.txt`` below ``args.db_path``
    and passes every comma separated entry of ``args.species`` to
    ``select_taxids`` together with the output file
    ``selected_taxonomy_ids.txt`` (also below ``args.db_path``).

    Raises:
        SystemExit: if no species was given (``--species`` is optional for
            the argument parser, but nothing can be selected without it).
    """
    if not args.species:
        raise SystemExit(
            "No species given: use --species/-s with one or more "
            "comma separated species names")
    reference_taxonomy_file = os.path.join(args.db_path, 'all_taxids/speclist.txt')
    selected_taxonomy_file = os.path.join(args.db_path, 'selected_taxonomy_ids.txt')
    # split(',') returns a one-element list when there is no comma, so a
    # single species needs no separate branch.
    for each_species in args.species.split(','):
        select_taxids(each_species, reference_taxonomy_file, selected_taxonomy_file)
    print("\n----------------------------------------------------------------\n"
          "Selected taxonomy ids associated with the query species %s" % args.species)
    # Report the file that was actually passed to select_taxids instead of
    # a fixed path.
    print("Please refer to %s for the selected taxonomy ids."
          "\n----------------------------------------------------------------\n"
          % selected_taxonomy_file)

def _split_fasta_by_uid(fasta_file, out_dir):
    """Write every record of *fasta_file* to ``<out_dir>/<uid>.fasta``.

    The uid is the second '|'-separated field of the header line (as in
    ``>sp|P12345|NAME``); headers without '|' fall back to their first
    whitespace separated token. Blank fragments are ignored.
    """
    with open(fasta_file, 'r') as in_fh:
        records = in_fh.read().split('>')
    for entry in records:
        if not entry.strip():
            # Text before the first '>' or stray blank lines: not a record.
            continue
        header = entry.split('\n')[0]
        fields = header.split('|')
        if len(fields) > 1:
            uid = fields[1]
        else:
            tokens = header.split()
            if not tokens:
                print("Skipping a FASTA record without header in %s" % fasta_file)
                continue
            uid = tokens[0]
        with open(os.path.join(out_dir, uid + '.fasta'), 'w') as out_fh:
            out_fh.write(">%s" % entry)


def query_set(args):
    """Map the user's queries to UniProt entries and collect their annotation.

    The query, reference and feature files below ``args.analysis_path`` are
    emptied first. Depending on the options that are set:

    * ``args.uids``: the reference table is filled per UniProt id and the
      XML/FASTA data and feature table are collected.
    * ``args.taxid``: the reference table is filled for the taxonomy id;
      with ``args.geneids`` the genes are mapped, with ``args.proteome`` the
      whole proteome is downloaded.
    * ``args.fasta``: path of a multi-FASTA file that is split into one
      file per record in ``args.fasta_path``.

    Raises:
        SystemExit: if ``args.fasta`` is the bare flag value ``True``, which
            carries no file to read.
    """
    xml_path = os.path.join(args.analysis_path, args.xml_path)
    # Used as given: unlike the other paths it is not joined to analysis_path.
    fasta_path = args.fasta_path
    feature_file = os.path.join(args.analysis_path, args.feature_table)
    reference_file = os.path.join(args.analysis_path, args.proteome_path)
    query_file = os.path.join(args.analysis_path, args.query_path)
    # Truncate (or create) the files that are appended to below.
    for path in (query_file, reference_file, feature_file):
        with open(path, 'w'):
            pass
    # NOTE(review): these are the legacy www.uniprot.org query URLs - confirm
    # that they are still served before relying on the download steps.
    url_part1 = \
    "http://www.uniprot.org/uniprot/?sort=score&desc=&compress=no&query=Taxonomy:"
    url_part2 = "&force=no&preview=true&format="
    uniprot_url_query = "http://www.uniprot.org/uniprot/?format=tab&query="
    uniprot_url_annotation = \
    "&columns=id,entry+name,reviewed,protein+names,genes,organism,length,go"
    header = '\t'.join(['Entry', 'Entry name', 'Status',
                'Protein names', 'Gene names',
                'Organism', 'Length', 'Gene ontology (GO)'])+'\n'
    if args.uids:
        print("UniProt query ids: %s" % args.uids)
        create_uid_query_file(args.uids, query_file)
        with open(reference_file, 'a') as proteome_table:
            proteome_table.write(header)
            # split(',') also covers a single id (one-element list).
            for each_uid in args.uids.split(','):
                uniprot_url = ''.join([uniprot_url_query,
                    "Accession:", each_uid, uniprot_url_annotation])
                format_uniprot_table(proteome_table, uniprot_url)
        collect_uniprot_information = CollectUniprotInformation(
            query_file, xml_path, fasta_path, feature_file)
        collect_uniprot_information.get_uniprot_xml_and_fasta()
        collect_uniprot_information.create_feature_table()
    if args.taxid:
        uniprot_url = ''.join([uniprot_url_query,
            "Taxonomy:", args.taxid, uniprot_url_annotation])
        with open(reference_file, 'a') as proteome_table:
            proteome_table.write(header)
            format_uniprot_table(proteome_table, uniprot_url)
        if args.geneids:
            print("Query genes: %s" % args.geneids)
            create_gene_query_file(args.geneids, query_file, reference_file)
            collect_uniprot_information = CollectUniprotInformation(
                query_file, xml_path, fasta_path, feature_file)
            collect_uniprot_information.get_uniprot_xml_and_fasta()
            collect_uniprot_information.create_feature_table()
        elif args.proteome:
            print("\n----------------------------------------------------------------\n"
                  "Starting the full proteome analysis for the taxonomy id: %s" %
                  args.taxid)
            print("\n----------------------------------------------------------------\n")
            xml_url = url_part1 + args.taxid + url_part2 + 'xml'
            fasta_url = url_part1 + args.taxid + url_part2 + 'fasta'
            create_proteome_query_file('proteome', query_file, reference_file)
            collect_uniprot_information = CollectUniprotInformation(
                query_file, xml_path, fasta_path, feature_file)
            collect_uniprot_information.download_xml(xml_url, args.taxid)
            collect_uniprot_information.download_fasta(fasta_url, args.taxid)
            collect_uniprot_information.create_feature_table()
    if args.fasta:
        if args.fasta is True:
            # A bare boolean flag names no file; open(True) would try to
            # read from file descriptor 1 instead of a FASTA file.
            raise SystemExit(
                "--fasta needs the path of a FASTA file with the query sequences")
        _split_fasta_by_uid(args.fasta, args.fasta_path)

def keyword_input(args):
    """Hand the user-supplied keywords to the keyword-file helpers.

    The domain-selection keywords (``args.kw_domain``) are passed to
    ``keyword_domains_files``. The classification keywords passed to
    ``keyword_class_files`` are ``args.classify`` when that is set and
    fall back to ``args.kw_domain`` otherwise. Both helpers receive the
    ``domain_data`` directory below ``args.db_root``.
    """
    keyword_dir = os.path.join(args.db_root, 'domain_data')
    path_banner = (
        "\n----------------------------------------------------------------\n"
        "Keyword file path: %s")
    print(path_banner % keyword_dir)

    print("Keywords for domain selection: %s" % args.kw_domain)
    keyword_domains_files(keyword_dir, args.kw_domain)

    # Classification falls back to the domain keywords when none are given.
    class_keywords = args.classify if args.classify else args.kw_domain
    print("Keywords for classification of domain prediction: %s"
          % class_keywords)
    keyword_class_files(keyword_dir, class_keywords)

    print("----------------------------------------------------------------\n")

def select_domain_data(args):
    """Select keyword-matching domains and merge the CDD/InterPro selections.

    Does nothing but print a notice when ``args.skip_select`` is set.
    Otherwise a KeywordBasedDomainSelection is built from the reference
    tables below ``args.db_root`` and, depending on ``args.cdd`` /
    ``args.ipr``, asked to select CDD domains, InterPro domains, or both.
    Finally ``merge_domain_data`` is called on the two per-database
    selection files to write ``all_keyword_selected_domain_data.tab``.
    """
    if args.skip_select:
        print("\n----------------------------------------------------------------\n"
              "Skip running the subcommand 'select' due to the given flag '--skip_select'"
              "\n----------------------------------------------------------------\n")
        return

    db_root = args.db_root
    domain_path = os.path.join(db_root, 'domain_data')
    cdd_domain_path = os.path.join(domain_path, 'cdd')
    ipr_domain_path = os.path.join(domain_path, 'interpro')
    selected_domain_path = os.path.join(
        domain_path, 'all_keyword_selected_domain_data.tab')

    cdd_table = os.path.join(
        db_root, 'reference_db_files/cdd/cdd_annotation_data/cddid.tbl')
    ipr_table = os.path.join(
        db_root,
        'reference_db_files/interpro/interpro_annotation_data/interproid.tbl')
    pfam_domain_file = os.path.join(
        db_root, 'reference_db_files/pfam/pfamA.txt')
    interpro_mapped_cdd = os.path.join(
        db_root, 'reference_db_files/interpro/interpro_annotation_data',
        'mapped_interpro_to_cdd_length.csv')

    # Pick the banner text and the selector method for the requested database(s).
    if args.cdd:
        scope_message = "Selecting domains from CDD only"
        selector_name = 'select_cdd_domains'
    elif args.ipr:
        scope_message = "Selecting domains from InterPro only"
        selector_name = 'select_ipr_domains'
    else:
        scope_message = "Selecting domains from both the databases"
        selector_name = 'select_cdd_and_ipr_domains'

    print("\n----------------------------------------------------------------\n"
          + scope_message +
          "\n----------------------------------------------------------------\n")
    domain_selection = KeywordBasedDomainSelection(
        args.dom_kw, pfam_domain_file, cdd_table, ipr_table,
        interpro_mapped_cdd, cdd_domain_path, ipr_domain_path)
    getattr(domain_selection, selector_name)()

    merge_domain_data(
        cdd_domain_path + '/all_keyword_selected_cdd_data.tab',
        ipr_domain_path + '/all_keyword_selected_interpro_data.tab',
        selected_domain_path)

def run_domain_analysis(args):
    """Run the CDD and/or InterPro domain prediction on the query FASTA files.

    The result directories ``cdd_analysis`` and ``ipr_analysis`` are created
    below ``<args.analysis_path>/<args.predicted>`` when they do not exist.
    ``args.cdd`` restricts the run to CDD and ``args.ipr`` to InterPro; with
    neither flag both predictions are run. ``args.force`` switches to the
    ``force_*`` variants, which remove existing result files before
    predicting again.
    """
    analysis_path_cdd = os.path.join(
        args.analysis_path, args.predicted, 'cdd_analysis')
    analysis_path_ipr = os.path.join(
        args.analysis_path, args.predicted, 'ipr_analysis')
    for analysis_dir in (analysis_path_cdd, analysis_path_ipr):
        if not os.path.exists(analysis_dir):
            os.mkdir(analysis_dir)

    if args.force:
        run_cdd, run_ipr = force_cdd_based_pred, force_ipr_based_pred
    else:
        run_cdd, run_ipr = cdd_based_pred, ipr_based_pred

    if args.cdd:
        run_cdd(args.fasta_path, analysis_path_cdd, args.cdd_db)
    elif args.ipr:
        # The forced InterPro run previously received the CDD output
        # directory and the CDD database path; it must use the InterPro ones.
        run_ipr(args.fasta_path, analysis_path_ipr, args.ipr_db)
    else:
        run_cdd(args.fasta_path, analysis_path_cdd, args.cdd_db)
        run_ipr(args.fasta_path, analysis_path_ipr, args.ipr_db)

def cdd_based_pred(fasta_path, outpath, cdd_db):
    """Run rpsblast for each file in ``fasta_path`` that has no result yet.

    The result of ``<name>.<ext>`` is expected at ``<outpath>/<name>.txt``,
    where ``<name>`` is the file name up to its first dot. Files whose
    result already exists are reported and skipped; for the others an
    ``rpsblast`` shell command against ``<cdd_db>/Cdd`` is run and waited on.
    """
    for fasta_file in os.listdir(fasta_path):
        query_name = fasta_file.split('.')[0]
        result_file = outpath + '/' + query_name + '.txt'
        if os.path.exists(result_file):
            print("CDD based analysis result for %s exists." % query_name)
            continue
        print("CDD based domain prediction for file %s..." % query_name)
        command = "rpsblast -i %s/%s -d %s/Cdd -o %s/%s.txt" % (
            fasta_path, fasta_file, cdd_db, outpath, query_name)
        subprocess.Popen([command], shell=True).wait()
            
def force_cdd_based_pred(fasta_path, outpath, cdd_db):
    """Run rpsblast for every file in ``fasta_path``, replacing old results.

    An existing ``<outpath>/<name>.txt`` (``<name>`` being the file name up
    to its first dot) is deleted first; then an ``rpsblast`` shell command
    against ``<cdd_db>/Cdd`` is run and waited on.
    """
    for fasta_file in os.listdir(fasta_path):
        query_name = fasta_file.split('.')[0]
        result_file = outpath + '/' + query_name + '.txt'
        if os.path.exists(result_file):
            print("Removing the existing file %s" % query_name)
            os.remove(result_file)
        print("CDD based domain prediction for file %s..." % query_name)
        command = "rpsblast -i %s/%s -d %s/Cdd -o %s/%s.txt" % (
            fasta_path, fasta_file, cdd_db, outpath, query_name)
        subprocess.Popen([command], shell=True).wait()
            
def ipr_based_pred(fasta_path, outpath, ipr_db):
    """Run InterProScan for each file in ``fasta_path`` without a result.

    The result of ``<name>.<ext>`` is expected at ``<outpath>/<name>.tsv``,
    where ``<name>`` is the file name up to its first dot. Files whose
    result already exists are reported and skipped; for the others
    ``<ipr_db>/interproscan.sh`` is run through bash and waited on.
    """
    for fasta_file in os.listdir(fasta_path):
        query_name = fasta_file.split('.')[0]
        result_file = outpath + '/' + query_name + '.tsv'
        if os.path.exists(result_file):
            print("IPR based analysis result for %s exists." % query_name)
            continue
        command = (
            "bash %s/interproscan.sh -i %s/%s -b %s/%s -f tsv -dp -goterms"
            % (ipr_db, fasta_path, fasta_file, outpath, query_name))
        subprocess.Popen([command], shell=True).wait()
            
def force_ipr_based_pred(fasta_path, outpath, ipr_db):
    """Run InterProScan for every file in ``fasta_path``, replacing results.

    An existing ``<outpath>/<name>.tsv`` (``<name>`` being the file name up
    to its first dot) is deleted first; then ``<ipr_db>/interproscan.sh``
    is run through bash and waited on.
    """
    for fasta_file in os.listdir(fasta_path):
        query_name = fasta_file.split('.')[0]
        result_file = outpath + '/' + query_name + '.tsv'
        if os.path.exists(result_file):
            print("Removing the existing file %s.tsv" % query_name)
            os.remove(result_file)
        command = (
            "bash %s/interproscan.sh -i %s/%s -b %s/%s -f tsv -dp -goterms"
            % (ipr_db, fasta_path, fasta_file, outpath, query_name))
        subprocess.Popen([command], shell=True).wait()

def filter_domain(args):
    """Filter the predicted domains and build the selected-protein table.

    The cut-offs read from ``args`` (similarity, identity, evalue, gap,
    coverage, bit) are handed to FilterPredictedDomains as ``name:value``
    strings; a cut-off that is not set (falsy) is passed as ``NA``.

    With ``args.cdd`` or ``args.ipr`` only that method's predictions are
    filtered (its analysis directory is created when missing) and the
    table is built by SelectedProteinTableWithoutUniprot. Otherwise both
    methods are filtered and SelectedProteinTable is used, which also
    receives the UniProt table and XML information paths.
    """
    domain_description_file = args.domain_description_file
    go_path = args.go_path
    uniprot_table = os.path.join(args.analysis_path, args.up_table)
    xml_info = os.path.join(args.analysis_path, args.xml_info)
    compile_dir = os.path.join(args.analysis_path, args.compile_out)
    filtered_result_path = os.path.join(compile_dir, 'selected_data')
    all_prediction_output = os.path.join(compile_dir, 'unfiltered_data')
    selected_dir = os.path.join(args.analysis_path, args.selected)

    # (label used in the parameter string, attribute name on args)
    cutoff_fields = (
        ('similarity', 'similarity'),
        ('identity', 'identity'),
        ('evalue', 'evalue'),
        ('gaps', 'gap'),
        ('coverage', 'coverage'),
        ('bit', 'bit'),
    )
    parameter_list = []
    for label, attribute in cutoff_fields:
        cutoff = getattr(args, attribute)
        parameter_list.append("%s:%s" % (label, cutoff if cutoff else 'NA'))

    if args.cdd or args.ipr:
        # Single-method run; CDD takes precedence when both flags are set.
        method = 'cdd' if args.cdd else 'ipr'
        analysis_result_path = '%s/%s/%s_analysis' % (
            args.analysis_path, args.pred_path, method)
        if not os.path.exists(analysis_result_path):
            os.mkdir(analysis_result_path)
        FilterPredictedDomains(
            method, analysis_result_path, domain_description_file, go_path,
            filtered_result_path, all_prediction_output, parameter_list
        ).streamline_filter_predicted_domains()
        SelectedProteinTableWithoutUniprot(
            filtered_result_path, selected_dir
        ).streamline_selected_protein_table_without_uniprot()
        return

    print(
        "\n----------------------------------------------------------------\n"
        "Filtering of the predicted data using both CDD and InterPro databases\n"
        "----------------------------------------------------------------\n")
    for method in ('cdd', 'ipr'):
        analysis_result_path = '%s/%s/%s_analysis' % (
            args.analysis_path, args.pred_path, method)
        FilterPredictedDomains(
            method, analysis_result_path, domain_description_file, go_path,
            filtered_result_path, all_prediction_output, parameter_list
        ).streamline_filter_predicted_domains()
    SelectedProteinTable(
        filtered_result_path, uniprot_table, xml_info, selected_dir
    ).streamline_selected_protein_table()

def classify_output(args):
    """Classify the selected domain data with ProteinClassifier.

    When ``args.classify`` is given, those keywords are first passed to
    ``keyword_class_files`` together with ``<args.db_root>/domain_data``.
    The classifier is then built from the selected-data directory,
    ``args.class_kw`` and the output directory, and run.
    """
    keyword_dir = '/'.join((args.db_root, 'domain_data'))
    if args.classify:
        print("Keywords for classification of domain prediction: %s"
              % args.classify)
        keyword_class_files(keyword_dir, args.classify)

    banner = (
        "\n----------------------------------------------------------------\n"
        "Classifying selected domain data using classification keywords:\n"
        "source_files/domain_data/keywords_for_result_classification.txt\n"
        "----------------------------------------------------------------\n")
    print(banner)

    selected_dir = '/'.join((args.analysis_path, args.selected))
    classified_dir = '/'.join((args.analysis_path, args.classified))
    ProteinClassifier(
        selected_dir, args.class_kw, classified_dir
    ).streamline_protein_classification()

def annotation_scoring(args):
    """Compute annotation scores with ComputeCompositionDistance.

    The scorer receives the selected-data directory, the CDD prediction
    directory and the output directory (each relative to
    ``args.analysis_path``) plus ``args.needle_dir``, and is then run.
    """
    banner = (
        "\n----------------------------------------------------------------\n"
        "Calculating annotation scores for the selected domain data\n"
        "This might take some time, please be patient ..."
        "\n----------------------------------------------------------------\n")
    print(banner)

    scored_dir = '/'.join((args.analysis_path, args.scored))
    needle_dir = args.needle_dir
    selected_dir = '/'.join((args.analysis_path, args.selected))
    cdd_prediction_dir = '/'.join((args.analysis_path, args.cdd_pred))
    ComputeCompositionDistance(
        selected_dir, cdd_prediction_dir, scored_dir, needle_dir
    ).streamline_annotation_scoring()

def analysis_summary(args):
    """Create the analysis summary with CreateAnalysisSummary.

    The summary builder receives the query map, ``args.domains``, the
    unfiltered-data directory and the summary output directory (paths are
    relative to ``args.analysis_path``). Afterwards the location of the
    summary file is printed.
    """
    # e.g. output/5_analysis_summary
    summary_dir = '/'.join((args.analysis_path, args.summarized))
    query_map = '/'.join((args.analysis_path, args.query_map))
    # e.g. output/2_selected_domain_information/unfiltered_data
    unfiltered_dir = '/'.join((args.analysis_path, args.unfilter_path))

    CreateAnalysisSummary(
        query_map, args.domains, unfiltered_dir, summary_dir
    ).streamline_create_analysis_summary()

    overview_message = (
        "\n----------------------------------------------------------------\n"
        "Please check the analysis overview: "
        "%s/output/5_analysis_summary/APRICOT_analysis_summary.txt")
    print(overview_message % args.analysis_path)
    print("\n----------------------------------------------------------------\n")
    
def _annotation_result_exists(outpath, result_file):
    """Return True when ``outpath/result_file`` exists with more than 3 lines."""
    result_path = outpath + '/' + result_file
    if not os.path.isfile(result_path):
        return False
    with open(result_path) as result_fh:
        for line_number, _line in enumerate(result_fh, 1):
            if line_number > 3:
                return True
    return False

def additional_annotation(args):
    """Run the requested additional annotations for the selected proteins.

    Each of ``args.pdb``, ``args.psortb``, ``args.raptorx`` and
    ``args.refss`` requests one annotation. A requested annotation is run
    unless its result file already exists with more than three lines, in
    which case a hint to use ``--force`` is printed instead. With
    ``args.force`` every requested annotation is run regardless of
    existing results.

    Previously the checks were chained with ``elif``, so without
    ``--force`` no annotation was ever started and only the first
    requested annotation was checked; a missing output directory also
    raised an error from ``os.listdir``.
    """
    selected_proteins = args.analysis_path + '/' + args.selected
    annotation_root = args.analysis_path + '/' + args.add_out
    pdb_out = annotation_root + '/pdb_sequence_prediction'
    psortb_out = annotation_root + '/protein_localization'
    secstr_out = annotation_root + '/protein_secondary_structure'

    # (requested?, output dir, result file, "already available" message,
    #  runner). Runners are lambdas so tool-specific arguments are only
    #  read when the annotation is really started.
    annotation_tasks = (
        (args.pdb, pdb_out, 'pdb_analysis.txt',
         "PDB based structural homologs are available in the path %s",
         lambda: annotation_pdb(
             args.pdb_path, args.fasta, selected_proteins, pdb_out)),
        (args.psortb, psortb_out, 'psortb_analysis.txt',
         "PsortB based localization predictions are available in the path %s",
         lambda: annotation_localize(
             args.psortb_path, args.fasta, selected_proteins, psortb_out)),
        (args.raptorx, secstr_out, 'raptorx_analysis.txt',
         "RaptorX calculated secondary structures are available in the path %s",
         lambda: annotation_secstr_raptorx(
             args.raptorx_path, args.fasta, selected_proteins, secstr_out)),
        (args.refss, secstr_out, 'literature_ss_analysis.txt',
         "Literature based 3-state secondary structures are available in the path %s",
         lambda: annotation_secstr_literature(
             args.fasta, selected_proteins, secstr_out)),
    )
    for requested, outpath, result_file, message, run_annotation in annotation_tasks:
        if not requested:
            continue
        if not args.force and _annotation_result_exists(outpath, result_file):
            print(message % outpath)
            print("Please use the --force (-F) flag to reanalyze.")
            continue
        run_annotation()
    
def annotation_pdb(pdb_path, fasta_path, selected_proteins, outpath):
    """Build a PdbHomologyAnalysis from the given paths and run it."""
    PdbHomologyAnalysis(
        selected_proteins, pdb_path, fasta_path, outpath
    ).streamline_pdb_homology_analysis()
        
def annotation_localize(psortb_path, fasta_path, selected_proteins, outpath):
    """Build a PsortbSubcellularLocalization from the given paths and run it."""
    PsortbSubcellularLocalization(
        selected_proteins, psortb_path, fasta_path, outpath
    ).streamline_psortb_localization_analysis()
    
def annotation_secstr_raptorx(raptorx_path, fasta_path, selected_proteins, outpath):
    """Build a RaptorxSecstrAnalysis from the given paths and run it."""
    RaptorxSecstrAnalysis(
        selected_proteins, raptorx_path, fasta_path, outpath
    ).streamline_raptorx_secstr_analysis()
    
def annotation_secstr_literature(fasta_path, selected_proteins, outpath):
    """Build a LiteratureSecstrAnalysis from the given paths and run it."""
    LiteratureSecstrAnalysis(
        selected_proteins, fasta_path, outpath
    ).streamline_literature_secstr_analysis()
    
def visualize_result(args):
    """Create the visualization files with VizApricotAnalysis.

    The visualizer is built from the annotation-score, selected-data,
    additional-annotation and visualization directories below
    ``args.analysis_path``. The domain data are always visualized; each of
    the flags ``domain``, ``annoscore``, ``secstr``, ``localiz``, ``msa``
    and ``complete`` on ``args`` triggers one further visualizer method.
    """
    visualizer = VizApricotAnalysis(
        '/'.join((args.analysis_path, args.ann_score)),
        '/'.join((args.analysis_path, args.selected)),
        '/'.join((args.analysis_path, args.add_anno)),
        '/'.join((args.analysis_path, args.visualized)))
    print("\n----------------------------------------------------------------\n"
          "Running visualization module..."
          "\n----------------------------------------------------------------\n")
    visualizer.viz_domain_data()

    # (flag on args, visualizer method to call when the flag is set)
    optional_views = (
        ('domain', 'domain_highlight'),
        ('annoscore', 'viz_annotation_scoring'),
        ('secstr', 'viz_secondary_structure'),
        ('localiz', 'viz_subcellular_localization'),
        ('msa', 'viz_homologous_pdb_msa'),
        ('complete', 'viz_all_the_visualization_files'),
    )
    for flag_name, method_name in optional_views:
        if getattr(args, flag_name):
            getattr(visualizer, method_name)()
    print("\n----------------------------------------------------------------\n")
        
def output_format(args):
    """Convert the analysis output folders to HTML and/or xlsx files.

    For ``args.html`` / ``args.xlsx`` every entry of ``args.inpath`` whose
    name contains neither ``format_`` nor ``visualization`` is converted
    with ``csv_to_html`` / ``csv_to_xlsx`` into a same-named folder below
    ``html_files`` / ``excel_files`` in ``<analysis_path>/<formatted>``;
    the target folder is created when missing.
    """
    formatted_dir = os.path.join(args.analysis_path, args.formatted)
    inpath = args.inpath
    html_out = os.path.join(formatted_dir, 'html_files')
    excel_out = os.path.join(formatted_dir, 'excel_files')

    def convertible(entries):
        # Skip the formatted output itself and the visualization folders.
        return [entry for entry in entries
                if 'format_' not in entry and 'visualization' not in entry]

    if args.html:
        for entry in convertible(os.listdir(inpath)):
            target = html_out + '/' + entry
            if not os.path.exists(target):
                os.mkdir(target)
            csv_to_html(inpath + '/' + entry, target)
        html_message = (
            "\n----------------------------------------------------------------\n"
            "The output files are now generated in HTML format: "
            "%s/output/format_output_data/")
        print(html_message % args.analysis_path)
        print("\n----------------------------------------------------------------\n")

    if args.xlsx:
        for entry in convertible(os.listdir(inpath)):
            target = excel_out + '/' + entry
            if not os.path.exists(target):
                os.mkdir(target)
            csv_to_xlsx(inpath + '/' + entry, target)
        xlsx_message = (
            "\n-------------- --------------------------------------------------\n"
            "The output files are now generated in excel (.xlsx) format: "
            "%s/output/format_output_data/")
        print(xlsx_message % args.analysis_path)
        print("\n----------------------------------------------------------------\n")
        
def minimum_subcommands(args):
    """Run the subcommand handlers of the default pipeline one after another.

    Every handler receives the same ``args`` object; the order is folder
    setup, query retrieval, domain selection, domain prediction, filtering,
    classification, annotation scoring, summary and output formatting.
    """
    pipeline_steps = (
        create_folders,
        query_set,
        select_domain_data,
        run_domain_analysis,
        filter_domain,
        classify_output,
        annotation_scoring,
        analysis_summary,
        output_format,
    )
    for run_step in pipeline_steps:
        run_step(args)

# Start the command-line interface. The call is not guarded by
# ``if __name__ == "__main__"``, so it also runs when this file is imported.
main()
