#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
#
# Copyright (C) 2019 Satria A. Kautsar
# Wageningen University & Research
# Bioinformatics Group
"""the main script of bigslice"""

import argparse
from argparse import RawTextHelpFormatter
from os import getpid, path, makedirs, remove, sched_getaffinity
from sys import argv
from tempfile import TemporaryDirectory
import glob
import re
import shutil
from multiprocessing import Pool
import bigslice  # for referencing module folders
from bigslice.modules.data.database import Database
from bigslice.modules.data.bgc import BGC
from bigslice.modules.data.taxonomy import Taxonomy
from bigslice.modules.data.hmm import HMMDatabase
from bigslice.modules.data.run import Run
from bigslice.modules.data.hsp import HSP
from bigslice.modules.data.features import Features
from bigslice.modules.clustering.birch import BirchClustering
from bigslice.modules.clustering.membership import Membership
from bigslice.modules.utils import reversed_fp_iter, get_chunk
from bigslice.modules.utils import copy_output_template
from bigslice.modules.utils import store_pickle
from bigslice.modules.output.csv import export_tsv_to_folder
from time import time
from datetime import datetime
import subprocess
from typing import List
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyhmmer import easel, hmmer, plan7


_last_timestamp = None

def run_subpfam_scan(tuples):
    top_k = 3
    in_fasta_path, hmm_path, sub_pfam_ids = tuples
    sequences = list(easel.SequenceFile(in_fasta_path, digital=True, alphabet=easel.Alphabet.amino()))
    with plan7.HMMFile(hmm_path) as hmm_file:
        parsed = {}
        for top_hits in hmmer.hmmsearch(
                hmm_file, sequences,
                cpus=1, T=20, domT=20
            ):
            for hit in top_hits:
                if hit.best_domain.score < top_hits.domT:
                    continue
                target_name = hit.name.decode()
                hmm_name = top_hits.query_name.decode()
                alignment = hit.best_domain.alignment

                bgc_id, cds_id, parent_hsp_id, locs = target_name.split("|")
                bgc_id = int(bgc_id.split("bgc:")[-1])
                cds_id = int(cds_id.split("cds:")[-1])
                parent_hsp_id = int(parent_hsp_id.split("hsp:")[-1])
                locs = tuple(map(int, locs.split("-")))

                if target_name not in parsed:
                    parsed[target_name] = []
                parsed[target_name].append({
                    "cds_id": cds_id,
                    "hmm_id": sub_pfam_ids[hmm_name],
                    "parent_hsp_id": parent_hsp_id,
                    "bitscore": int(hit.best_domain.score)
                })

        results = []
        for target_name, hsps in parsed.items():
            for i, hsp in enumerate(
                sorted(hsps, key=lambda x: x["bitscore"], reverse=True)
            ):
                if i >= top_k:
                    break
                adjusted_bitscore = 255 - int((255 / top_k) * i)
                hsp["bitscore"] = adjusted_bitscore
                results.append(hsp)

        return results


def run_subpfam_scan_hmmscan(tuples):
    top_k = 3
    in_fasta_path, hmm_path, sub_pfam_ids = tuples

    with easel.SequenceFile(in_fasta_path, digital=True, alphabet=easel.Alphabet.amino()) as sequencefile:
        results = []
        pipeline = plan7.Pipeline(easel.Alphabet.amino(), T=20)

        for sequence in sequencefile:
            with plan7.HMMFile(hmm_path) as hmm_file:
                for i, hit in enumerate(pipeline.scan_seq(sequence, hmm_file)):
                    # this will return a sorted high-to-low hits, so no need to re-sort
                    if i >= top_k:
                        break
                    target_name = sequence.name.decode()
                    hmm_name = hit.name.decode()

                    bgc_id, cds_id, parent_hsp_id, locs = target_name.split("|")
                    bgc_id = int(bgc_id.split("bgc:")[-1])
                    cds_id = int(cds_id.split("cds:")[-1])
                    parent_hsp_id = int(parent_hsp_id.split("hsp:")[-1])
                    locs = tuple(map(int, locs.split("-")))

                    results.append({
                        "cds_id": cds_id,
                        "hmm_id": sub_pfam_ids[hmm_name],
                        "parent_hsp_id": parent_hsp_id,
                        "bitscore": int(255 - int((255 / top_k) * i))
                    })

        return results


def get_elapsed():
    # not so fancy, just for a quickie
    global _last_timestamp
    if not _last_timestamp:
        _last_timestamp = time()
        return 0
    else:
        now = time()
        elapsed = now - _last_timestamp
        _last_timestamp = now
        return elapsed


def fetch_pool(num_threads: int):
    available_cpu_ids = list(sched_getaffinity(0))
    pool = Pool(processes=num_threads)
    try:
        # set cores for the multiprocessing pools
        all_cpu_ids = set()
        for i, p in enumerate(pool._pool):
            cpu_id = str(available_cpu_ids[len(available_cpu_ids) - (i % len(available_cpu_ids)) - 1])
            subprocess.run(["taskset",
                            "-p", "-c",
                            cpu_id,
                            str(p.pid)], check=True)
            all_cpu_ids.add(cpu_id)

        # set core for the main python script
        subprocess.run(["taskset",
                        "-p", "-c",
                        ",".join(all_cpu_ids),
                        str(getpid())], check=True)

    except FileNotFoundError:
        pass  # running in OSX?
    return pool


def run_hmmscan(arguments: tuple):
    fasta_path, hmm_path, result_path, cutoff, make_alignment = arguments
    if path.exists(result_path):
        with open(result_path, "r") as fp:
            if next(reversed_fp_iter(fp)).rstrip() == "[ok]":
                # already hmmscanned
                return 1
            else:
                # file is broken, remove
                remove(result_path)

    command = [
        "hmmscan",
        "--acc",
        "--cpu", "1",
        "-o", result_path,
        hmm_path,
        fasta_path
    ]

    if not make_alignment:
        command.insert(1, "--noali")

    if cutoff == "ga":
        command.insert(1, "--cut_ga")
    elif isinstance(cutoff, float) or isinstance(cutoff, int):
        command = [command[0]] + [
            "--domT", str(cutoff),
            "--incdomT", str(cutoff)
        ] + command[1:]

    try:
        ret = subprocess.run(command, check=True)
        if ret.returncode != 0 and path.exists(result_path):
            remove(result_path)
        return ret.returncode == 0
    except subprocess.CalledProcessError as e:
        print("hmmscan error: {}".format(e.stderr))
        pass

    return False


def update_bgc_status(bgc_id, run_id, status, database):
    return database.update("run_bgc_status",
                           {"status": status},
                           "WHERE run_id=? AND bgc_id=?",
                           (run_id, bgc_id))


def subpfam_scan(arguments: tuple):
    # TODO: implement this
    return -1


def feature_extraction(arguments: tuple):
    # TODO: implement this
    return -1


def check_pfams_exist(pfam_ids: List[int], database: Database):
    result = {}
    for row in database.select(
        "hsp,cds",
        "WHERE hsp.cds_id=cds.id" +
        " AND hsp.hmm_id IN (" + ",".join(["?" for i in pfam_ids]) + ")",
        parameters=tuple(pfam_ids),
        props=["bgc_id", "hmm_id"],
        distinct=True
    ):
        if row["bgc_id"] not in result:
            result[row["bgc_id"]] = set()
        result[row["bgc_id"]].add(row["hmm_id"])
    return result


def check_hmmscanned_in_db(bgc_ids: List[int],
                           md5_biosyn_pfam: str, database: Database):
    rows = database.select(
        "run_bgc_status,run,hmm_db",
        "WHERE run_bgc_status.run_id=run.id AND " +
        "run.hmm_db_id=hmm_db.id AND " +
        "hmm_db.md5_biosyn_pfam=? AND " +
        "run_bgc_status.status>=2 AND " +
        "bgc_id IN (" + ",".join(map(str, bgc_ids)) + ")",
        parameters=(md5_biosyn_pfam,),
        props=["bgc_id"])

    return([row["bgc_id"] for row in rows])


def check_subpfam_scanned_in_db(bgc_ids: List[int],
                                md5_sub_pfam: str, database: Database):
    rows = database.select(
        "run_bgc_status,run,hmm_db",
        "WHERE run_bgc_status.run_id=run.id AND " +
        "run.hmm_db_id=hmm_db.id AND " +
        "hmm_db.md5_sub_pfam=? AND " +
        "run_bgc_status.status>=3 AND " +
        "bgc_id IN (" + ",".join(map(str, bgc_ids)) + ")",
        parameters=(md5_sub_pfam,),
        props=["bgc_id"])

    return([row["bgc_id"] for row in rows])


def check_features_extracted_in_db(bgc_ids: List[int],
                                   hmm_db_id: int, database: Database):
    rows = database.select(
        "run_bgc_status,run,hmm_db",
        "WHERE run_bgc_status.run_id=run.id AND " +
        "run.hmm_db_id=? AND " +
        "run_bgc_status.status>=4 AND " +
        "bgc_id IN (" + ",".join(map(str, bgc_ids)) + ")",
        parameters=(hmm_db_id,),
        props=["bgc_id"])

    return([row["bgc_id"] for row in rows])


def check_gbk_exists(dataset_id: int, orig_gbk_path: str, database: Database):
    existing = database.select("bgc",
                               "WHERE dataset_id=? AND orig_folder=? AND orig_filename=?",
                               parameters=(dataset_id, path.dirname(
                                   orig_gbk_path), path.basename(orig_gbk_path)),
                               props=["id"])
    if len(existing) > 0:
        return [row["id"] for row in existing]
    else:
        return []


def process_input_folder(folder_path: str, database: Database,
                         mp_pool: Pool):

    # load metadata file
    metadata_file = path.join(folder_path, "datasets.tsv")
    datasets = {}
    if not path.exists(metadata_file):
        raise Exception(
            "Can't find metadata file ({})".format(metadata_file))
    with open(metadata_file, "r") as metadata_handle:
        for line in metadata_handle:
            if line.startswith("#"):  # comments
                continue
            line = line.rstrip()
            ds_name, ds_path, ds_taxonomy_path, ds_desc = line.split("\t")

            # dataset name should be unique
            if ds_name in datasets:
                raise Exception(
                    "Dataset name should be unique! " +
                    "duplicated name found: " + ds_name)

            # check if dataset folder exists
            dataset_folder_path = path.join(
                folder_path,
                ds_path)
            if not path.exists(dataset_folder_path):
                raise Exception(
                    "Can't find dataset folder: {}".format(
                        dataset_folder_path))

            datasets[ds_name] = {
                "path": ds_path,
                "taxonomy_path": ds_taxonomy_path,
                "desc": ds_desc
            }

    # define eligible regexes for clustergbks
    eligible_regexes = [re.compile(rgx) for rgx in [
        "^BGC[0-9]{7}$",  # MIBiG
        "^.+\\.cluster[0-9]+$",  # antiSMASH4 clustergbks
        "^.+\\.region[0-9]+$",  # antiSMASH5 clustergbks
    ]]

    # process datasets
    dataset_bgc_ids = {}
    for dataset_name, dataset_meta in datasets.items():

        # This only store bgcs from gbk files
        # exist within the folder
        # e.g. if a dataset has been parsed before,
        # then a folder is deleted, the previous BGCs
        # won't be included here
        dataset_bgc_ids[dataset_name] = set()

        # check if dataset exists in database
        queried_dataset = database.select(
            "dataset",
            "WHERE name=?",
            parameters=(dataset_name,),
            props=["id"])
        assert len(queried_dataset) <= 1

        if len(queried_dataset) > 0:
            # dataset exists, use entries from db
            # note:
            # in the future, we would like to match
            # between what's in the database
            # and what's in the folder, and adjust
            # the included BGCs accordingly
            # i.e. if there's a new file to parse
            dataset_id = queried_dataset[0]["id"]
            dataset_bgc_ids[dataset_name] = [
                row["id"] for row in database.select(
                    "bgc",
                    "WHERE dataset_id=?",
                    parameters=(dataset_id,),
                    props=["id"])]
            print("Found {} BGCs in the database.".format(
                len(dataset_bgc_ids[dataset_name])))
            print("[{}s] processing dataset: {}".format(
                get_elapsed(), dataset_name))
            continue

        else:  # create a new dataset entry
            dataset_id = database.insert(
                "dataset",
                {
                    "name": dataset_name,
                    "orig_folder": dataset_meta["path"],
                    "description": dataset_meta["desc"]
                }
            )

        get_elapsed()
        print("processing dataset: {}...".format(dataset_name))
        new_bgcs_count = 0
        files_to_process = []
        count_gbk_exists = 0

        # fetch gbk files
        dataset_folder_path = path.join(
            folder_path,
            dataset_meta["path"])
        for gbk_full_path in glob.iglob(path.join(
                dataset_folder_path,
                "**/*.gbk"),
                recursive=True):

            # first check: only take antiSMASH/MIBiG-derived GBKs
            gbk_name = path.splitext(path.basename(gbk_full_path))[0]
            eligible_file = False
            for eligible_pattern in eligible_regexes:
                if eligible_pattern.match(gbk_name):
                    eligible_file = True
                    break

            # second check: see if already exists in db
            if eligible_file:
                gbk_path = path.join(path.dirname(gbk_full_path).split("/")[-1], path.basename(gbk_full_path))
                # note: this check is turned off for now,
                # see above note
                # check_gbk_exists(dataset_id, gbk_path, database)
                bgc_ids = []
                if len(bgc_ids) < 1:
                    files_to_process.append((gbk_path, gbk_full_path))
                else:
                    count_gbk_exists += 1
                    dataset_bgc_ids[dataset_name].update(bgc_ids)

        print("Found {} BGCs from {} GBKs, another {} to be parsed.".format(
            len(dataset_bgc_ids[dataset_name]),
            count_gbk_exists,
            len(files_to_process)))

        # parse and insert new GBKs #
        print("Parsing and inserting {} GBKs...".format(len(files_to_process)))
        pool_results = [x for x in tqdm(mp_pool.imap(parse_input_gbk, files_to_process), total=len(files_to_process), mininterval=1)]
        for file_path, bgcs in pool_results:
            for bgc in bgcs:
                bgc.save(dataset_id, database)
                new_bgcs_count += 1
                dataset_bgc_ids[dataset_name].add(bgc.id)
        database.commit_inserts()
        print("Inserted {} new BGCs.".format(new_bgcs_count))

        # parse and insert taxonomy information #
        if len(dataset_meta["taxonomy_path"]) > 0:
            taxonomy_file_path = path.join(
                folder_path, dataset_meta["taxonomy_path"])
            if not path.exists(taxonomy_file_path):
                raise Exception("Can't find taxonomy file: " +
                                taxonomy_file_path)
            with open(taxonomy_file_path, "r") as fp:
                print("Parsing and inserting taxonomy information...")
                columns = [
                    "path_startswith",
                    "Kingdom",
                    "Phylum",
                    "Class",
                    "Order",
                    "Family",
                    "Genus",
                    "Species",
                    "Organism"
                ]
                total_bgcs_assigned = 0
                for line in tqdm(fp, mininterval=1):
                    line = line.rstrip("\n")
                    if not line.startswith("#"):
                        tax_dict = {
                            columns[i]: value for i,
                            value in enumerate(line.split("\t"))
                        }
                        tax_dict["dataset_id"] = dataset_id
                        total_bgcs_assigned += len(
                            Taxonomy(tax_dict).save(database)
                        )
                database.commit_inserts()
                print("Added taxonomy info for {} BGCs...".format(
                    total_bgcs_assigned))

        print("[{}s] processing dataset: {}".format(
            get_elapsed(), dataset_name))

    return dataset_bgc_ids


def parse_input_gbk(arguments: tuple):
    orig_gbk_path, file_path = arguments
    return (file_path, BGC.parse_gbk(file_path, orig_gbk_path=orig_gbk_path))


def query_mode(report_name, input_folder, input_run_id, pool,
               program_db_folder, source_db_path,
               n_ranks, reports_folder, cache_folder, normalize_feature=True):
    # !! this is just a quick prototype to meet the paper deadline
    # should be properly refactored later on !!

    if not path.exists(reports_folder):
        makedirs(reports_folder)

    index_db_path = path.join(
        reports_folder, "reports.db")
    if not path.exists(index_db_path):  # prepare index db
        with sqlite3.connect(index_db_path) as con:
            cur = con.cursor()
            sql_schema = open(path.join(path.dirname(
                path.abspath(bigslice.__file__)),
                "modules", "data",
                "schema_reports_index.sql"), "r").read()
            cur.executescript(sql_schema)

    # check index file
    with sqlite3.connect(index_db_path) as con:
        cur = con.cursor()
        if cur.execute((
            "SELECT count(id)"
            " FROM reports"
            " WHERE type LIKE 'query'"
            " AND name LIKE ?"
        ), (report_name, )).fetchall()[0][0] > 0:
            print("report with this name exists! please use a different name.")
            return 1
        else:
            cur.execute((
                "INSERT INTO reports"
                "(type, name, creation_date)"
                " VALUES(?, ?, ?)"
            ), ("query", report_name, datetime.now()))
            if cur.lastrowid > 0:
                reports_id = cur.lastrowid
                query_folder = path.join(reports_folder, str(reports_id))
            else:
                print("Failed to save reports index!")
                return 1

    with TemporaryDirectory() as tmpdir:
        with Database(source_db_path) as source_db:
            tmp_db_path = path.join(tmpdir, "data.db")
            final_db_path = path.join(query_folder, "data.db")

            # create sqlite3 database
            query_db = Database(tmp_db_path, for_query_mode=True)

            # check schema versions
            source_schema_ver = source_db.select("schema", "WHERE 1")[0]["ver"]
            query_schema_ver = query_db.select("schema", "WHERE 1")[
                0]["parent_schema_ver"]
            if source_schema_ver != query_schema_ver:
                print(
                    ("BiG-SLiCE schema version did not match!"
                     " please use schema ver. {}").format(query_schema_ver))
                return 1

            # fetch run details
            print("Fetching run details...")
            hmm_db = HMMDatabase.load_folder(program_db_folder, source_db)
            if hmm_db.id < 1:
                print("Can't find a matching hmm library in the database!")
                return 1
            if input_run_id < 0:
                run = Run.get_latest(hmm_db.id, source_db, min_status=5)
                if not run:
                    print(("Can't find latest run with"
                           " clustering data available!"))
                    return 1
                else:
                    print(("--run_id not specified,"
                           " using Run#{}...").format(run.id))
            else:
                run = Run.fetch(input_run_id, source_db)

            # check run status
            if run.status < 5:
                print("Run does not have clustering data available yet!")
                return 1

            # add run_report
            with sqlite3.connect(index_db_path) as con:
                cur = con.cursor()
                cur.execute((
                    "INSERT INTO reports_run"
                    "(report_id, run_id)"
                    " VALUES(?, ?)"
                ), (reports_id, run.id))

            # parse input folder
            file_paths = set()
            eligible_regexes = [re.compile(rgx) for rgx in [
                "^BGC[0-9]{7}$",  # MIBiG
                "^.+\\.cluster[0-9]+$",  # antiSMASH4 clustergbks
                "^.+\\.region[0-9]+$",  # antiSMASH5 clustergbks
            ]]
            for gbk_full_path in glob.iglob(path.join(
                    input_folder,
                    "**/*.gbk"),
                    recursive=True):

                # only take antiSMASH/MIBiG-derived GBKs
                gbk_name = path.splitext(path.basename(gbk_full_path))[0]
                for eligible_pattern in eligible_regexes:
                    if eligible_pattern.match(gbk_name):
                        file_paths.add(gbk_full_path)
                        break
            print("Parsing & inserting {} GBKs...".format(len(file_paths)))
            bgc_ids = []
            for bgcs in pool.map(BGC.parse_gbk, file_paths):
                for bgc in bgcs:
                    # insert bgc
                    bgc.id = query_db.insert(
                        "bgc",
                        {
                            "name": bgc.name,
                            "type": bgc.type,
                            "on_contig_edge": bgc.on_contig_edge,
                            "length_nt": bgc.length_nt,
                            "orig_folder": bgc.orig_folder,
                            "orig_filename": bgc.orig_filename
                        }
                    )
                    bgc_ids.append(bgc.id)
                    # insert classes
                    for cc in bgc.chem_subclasses:
                        chem_subclass_object = BGC.ChemSubclass.search(
                            source_db, cc, bgc.type)
                        chem_subclass_object.bgc_id = bgc.id
                        chem_subclass_object.__save__(query_db)
                    # insert cds
                    for cds in bgc.cds:
                        cds.bgc_id = bgc.id
                        cds.__save__(query_db)
            query_db.commit_inserts()
            print("Inserted {} BGCs!".format(len(bgc_ids)))

            # perform biosyn_pfam scan
            print("Preparing fasta files for hmmscans...")
            hmmscan_queues = set()
            hmm_path = path.join(program_db_folder,
                                 "biosynthetic_pfams",
                                 "Pfam-A.biosynthetic.hmm")
            for chunk, chunk_name in get_chunk(
                    bgc_ids,
                    pool._processes,
                    5):
                pass
                fasta_sequences = BGC.get_all_cds_fasta(
                    chunk, query_db)
                # write down all CDSes multifasta to be hmmscanned
                in_fasta_path = path.join(tmpdir, "bio_" + chunk_name + ".fa")
                out_result_path = path.join(
                    tmpdir, "bio_" + chunk_name + ".hmmtxt")

                if not path.exists(in_fasta_path):
                    with open(in_fasta_path, "w") as fa:
                        fa.write(fasta_sequences)
                        hmmscan_queues.add(
                            (in_fasta_path, hmm_path,
                                out_result_path, "ga", True))

            print("Running hmmscans in parallel...")
            hmm_ids = {
                hmm.accession: hmm.id for hmm in hmm_db.biosyn_pfams}

            pbar = tqdm(total=len(hmmscan_queues), mininterval=1)
            for chunk, chunk_name in get_chunk(
                    bgc_ids,
                    pool._processes,
                    5):

                in_fasta_path = path.join(tmpdir, "bio_" + chunk_name + ".fa")
                sequences = list(easel.SequenceFile(in_fasta_path, digital=True, alphabet=easel.Alphabet.amino()))

                with plan7.HMMFile(hmm_path) as hmm_file:
                    for top_hits in hmmer.hmmsearch(
                        hmm_file, sequences,
                        cpus=1, bit_cutoffs="gathering"
                        ):
                        for hit in top_hits:
                            if hit.best_domain.score < top_hits.domT:
                                continue

                            target_name = hit.name.decode()
                            hmm_name = top_hits.query_accession.decode()
                            alignment = hit.best_domain.alignment

                            bgc_id, cds_id, parent_hsp_id, locs = target_name.split("|")
                            bgc_id = int(bgc_id.split("bgc:")[-1])
                            cds_id = int(cds_id.split("cds:")[-1])
                            parent_hsp_id = int(parent_hsp_id.split("hsp:")[-1])
                            locs = tuple(map(int, locs.split("-")))

                            hsp_alignment = {
                                "model_start": alignment.hmm_from - 1,
                                "model_end": alignment.hmm_to,
                                "model_gaps": [i for i, c
                                               in enumerate(alignment.hmm_sequence)
                                               if c == '.'],
                                "cds_start": alignment.target_from + locs[0] - 1,
                                "cds_end": alignment.target_to + locs[0],
                                "cds_gaps": [i for i, c
                                             in enumerate(str(alignment.target_sequence))
                                             if c == '-']
                            }
                            HSP({
                                "cds_id": cds_id,
                                "hmm_id": hmm_ids[hmm_name],
                                "parent_hsp_id": parent_hsp_id,
                                "bitscore": hit.best_domain.score,
                                "alignment": hsp_alignment
                            }).save(query_db)
                            pbar.update(1)
            pbar.close()
            query_db.commit_inserts()

            # perform subpfam_scan
            core_pfam_ids = {}
            core_pfam_accs = {}
            for parent_hmm_acc in hmm_db.sub_pfams:
                for hmm_obj in hmm_db.biosyn_pfams:
                    if hmm_obj.accession == parent_hmm_acc:
                        core_pfam_ids[hmm_obj.accession] = hmm_obj.id
                        core_pfam_accs[hmm_obj.id] = hmm_obj.accession
                        break
            chunks_and_pfam_ids = {}
            print("Preparing fasta files for subpfam_scans...")
            hmmscan_queues = set()
            for chunk, chunk_name in get_chunk(
                    bgc_ids,
                    pool._processes,
                    5):
                fasta_sequences = BGC.get_all_aligned_hsp(
                    chunk, core_pfam_ids.values(), query_db)
                chunks_and_pfam_ids[chunk_name] = set(
                    fasta_sequences.keys())
                for parent_hmm_id in fasta_sequences:
                    in_fasta_path = path.join(
                        tmpdir, "sub_bgc_{}_hmm_{}.fa".format(
                            chunk_name, parent_hmm_id))
                    hmm_path = path.join(program_db_folder,
                                         "sub_pfams",
                                         "hmm",
                                         core_pfam_accs[
                                             parent_hmm_id] +
                                         ".subpfams.hmm")
                    out_result_path = path.join(
                        tmpdir, "sub_bgc_{}_hmm_{}.hmmtxt".format(
                            chunk_name, parent_hmm_id))
                    if not path.exists(in_fasta_path):
                        with open(in_fasta_path, "w") as fa:
                            fa.write(fasta_sequences[parent_hmm_id])
                    hmmscan_queues.add(
                        (in_fasta_path,
                            hmm_path,
                            out_result_path,
                            20,  # for subpfam_scan
                            False
                         ))
            print("Running subpfam_scans in parallel...")
            subpfam_queues = []
            for chunk, chunk_name in get_chunk(
                    bgc_ids,
                    pool._processes,
                    5):
                chunk_parent_pfams = chunks_and_pfam_ids[chunk_name]
                for parent_hmm_id in chunk_parent_pfams:
                    in_fasta_path = path.join(
                        tmpdir, "sub_bgc_{}_hmm_{}.fa".format(
                            chunk_name, parent_hmm_id))
                    sub_pfam_ids = {
                        hmm_obj.name: hmm_obj.id
                        for hmm_obj in
                        hmm_db.sub_pfams[core_pfam_accs[parent_hmm_id]]
                    }
                    hmm_path = path.join(program_db_folder,
                                         "sub_pfams",
                                         "hmm",
                                         core_pfam_accs[
                                             parent_hmm_id] +
                                         ".subpfams.hmm")
                    subpfam_queues.append((in_fasta_path, hmm_path, sub_pfam_ids))

            for results in tqdm(
                pool.imap(run_subpfam_scan, subpfam_queues
            ), total=len(subpfam_queues), mininterval=1):
                for hsp in results:
                    HSP(hsp).save(query_db)

            query_db.commit_inserts()

            # perform features extraction
            print("Extracting features...")
            for chunk, chunk_name in get_chunk(
                    bgc_ids,
                    pool._processes,
                    5):
                for feature in Features.extract(
                    chunk, hmm_db.id, query_db,
                    hmm_db_source=source_db
                ):
                    feature.save(query_db)
            query_db.commit_inserts()

            # perform membership assignment
            print("Assigning GCF membership...")
            for membership in Membership.assign(
                run.id,
                source_db,
                top_hits=n_ranks,
                threshold=None,
                bgc_database=query_db,
                cache_folder=cache_folder,
                normalize_feature=normalize_feature
            ):
                membership.save(query_db)
            query_db.commit_inserts()

            # move files to final output folder
            if not path.exists(query_folder):
                makedirs(query_folder)
            shutil.copy(tmp_db_path, final_db_path)

            print(("Query success! please check results"
                   " via the output visualization"
                   " (under 'Reports->View all reports')"))
            return 0

    return 1


def main():
    # bigscape version
    version = "2.0.0"

    # TODO: check requirements

    # a dirty workaround, but alas
    if "--version" in argv:
        import sys
        sys.argv = [argv[0]]
        sys.argv.extend(["--version", "."])

    # program parameters
    parser = argparse.ArgumentParser(
        description=(
            "                            _________\n"
            " ___    _____|\\____|\\____|\\ \\____    \\\n"
            "|   \\  |       }     }     }  ___)_ _/__\n"
            "| >  | _--/--|/----|/----|/__/  __||  __|\n"
            "|   < | ||  __/ (  (| | \\___/  /__/|  _|\n"
            "| >  || || |_ | _)  ) |_ | |\\  \\__ | |__\n"
            "|____/|_| \\___/|___/|___||_| \\____||____|"
            " [ Version " + version + " ]\n"
            "\n"
            "Biosynthetic Gene clusters - Super Linear Clustering Engine\n"
            "(https://github.com/medema-group/bigslice)"
        ),
        formatter_class=RawTextHelpFormatter,
        epilog=("_"),
        add_help=False
    )
    parser.add_argument(
        "output_folder", type=str, metavar='<output_folder_path>',
        help=(
            "[Mandatory] the path to the (newly created"
            " or existing) output folder."
        ))

    # [Mode 1] Clustering analysis
    arg_group_main = parser.add_argument_group(
        "[Mode 1] Clustering analysis", (
            "Parse input datasets, then perform"
            " GCF models building (BIRCH clustering)"
            " and membership assignment according to"
            " the specified threshold (T)"
        ))
    arg_group_main.add_argument(
        "-i", "--input_folder", type=str, metavar='<folder_path>',
        help=("[Mode-mandatory] Path to input folder containing 'datasets.tsv'"
              " file and dataset subfolders."))
    arg_group_main.add_argument(
        "--resume", action='store_true',
        help=("Continue the last clustering run (do not specifiy "
              "--input_folder in combination with this parameter)."))
    arg_group_main.add_argument(
        "--complete", action="store_true",
        help=("When building GCF models, use only complete BGCs "
              "(antiSMASH > 4.2 BGCs annotated with 'on_contig_edge'"
              " = False)."))
    arg_group_main.add_argument(
        "--threshold", default=0.4, type=float, metavar="T",
        help=(
            "Clustering threshold (T) used in GCF models building"
            " (BIRCH algorithm) and membership assignment [0-1.2]."
            " Mutually exclusive with --threshold_pct, use '-1' to turn off"
            " this parameter (default: %(default)s)."))
    arg_group_main.add_argument(
        "--threshold_pct", default=-1, type=float, metavar="<N>",
        help=("Calculate clustering threshold (T) based on a random"
              " sampling of pairwise distances between the data, taking"
              " the N-th percentile value as the threshold. Mutually"
              " exclusive with --threshold, use '-1' to turn off"
              " this parameter (default: %(default)s)."))

    # [Mode 2] GCF queries
    arg_group_query = parser.add_argument_group(
        "[Mode 2] GCF queries", (
            "Given existing GCF models from [Mode 1],"
            " perform features extraction and membership"
            " assignment from a set of BGC genbank files"
            " in the input folder"
        ))
    arg_group_query.add_argument(
        "--query", type=str, metavar='<folder_path>',
        help=("[Mode-mandatory] Path to input folder containing all"
              "cluster genbank files (needs to be either clusterXXX.gbk"
              " of antiSMASH4, regionXXX.gbk of antiSMASH5, or"
              " BGCXXXXXXX.gbk of MIBiG >= 2.0 files)."))
    arg_group_query.add_argument(
        "--query_name",
        default=datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S"), type=str, metavar="<name>",
        help=("Give a unique name to the query run so that it will"
              " be easier to trace within the output visualization."))
    arg_group_query.add_argument(
        "--no_normalize_feature",
        action="store_false",
        dest="normalize_feature",
        help=("Do not normalize the feature when querying. This is "
              "useful to keep the same distance value defined in "
              "previous version of bigslice, for example querying "
              "against the BiG-FAM database generated by BiG-SLICE "
              "v1.0.0."))

    # [Mode 1 + Mode 2]
    arg_group_combine = parser.add_argument_group(
        "[Mode 1+2]", (
            "Parameters relevant for both the Clustering"
            " and Query mode"
        ))
    arg_group_combine.add_argument(
        "--run_id", default=-1, type=int, metavar="<id>",
        help=("Rather than taking the latest run, perform query"
              " (or resume clustering) on the specific run id"
              " (you can check the list of run ids in the output"
              " visualization)."))
    arg_group_combine.add_argument(
        "--n_ranks", default=1, type=int,
        help=("Takes N-best GCF hits for each BGC's membership assignment"
              " procedure (default: %(default)s)."))

    # [Misc] Resource management"
    arg_group_perf = parser.add_argument_group(
        "[Misc] Resource management", (
            "CPU and RAM usage optimization parameters"
        ))
    arg_group_perf.add_argument(
        "-t", "--num_threads",
        default=len(sched_getaffinity(0)), type=int, metavar="<N>",
        help=("The number of parallel jobs to run (default: %(default)s)."))
    arg_group_perf.add_argument(
        "--hmmscan_chunk_size",
        default=100, type=int, metavar="<N>",
        help=("Split biosyn_pfam scanning into chunks of N BGCs"
              " (default: %(default)s)"))
    arg_group_perf.add_argument(
        "--subpfam_chunk_size", metavar="<N>",
        default=100, type=int,
        help=("Split sub_pfam scanning into chunks of N BGCs"
              " (default: %(default)s)"))
    arg_group_perf.add_argument(
        "--extraction_chunk_size",
        default=100, type=int,
        help=("Split features extraction into chunks of N BGCs"
              " (default: %(default)s)"))
    arg_group_perf.add_argument(
        "--scratch", action="store_true",
        help=("Don't load the Sqlite3 database into memory"
              " (lower RAM usage, but potentially slower)."))
    arg_group_perf.add_argument(
        "--early_dumping", action="store_true",
        help=("Dump the in-memory Sqlite3 db into a file after"
                " finishing every phase (will enable resume in"
                " case of crashes, but will increase overall "
                "runtime)."))

    # [Misc] Other optional parameters
    arg_group_misc = parser.add_argument_group(
        "[Misc] Other optional parameters",
        " ")
    arg_group_misc.add_argument(
        "-h", "--help", action="help",
        help="Show this help message.")
    arg_group_misc.add_argument(
        "--export-tsv", type=str, metavar='<folder_path>',
        help=("Export existing pre-calculated output data into"
            " TSVs (specify the target folder path)"))
    arg_group_misc.add_argument(
        "--program_db_folder",
        default=path.join(path.dirname(
            path.realpath(__file__)),
            "bigslice-models"), type=str,
        help=("Path to the HMM libraries (default: %(default)s)."))
    arg_group_misc.add_argument(
        "--version", action="store_true",
        help=("Show BiG-SLiCE version."))

    args = parser.parse_args()

    # define variables
    prog_params = " ".join(argv[1:])
    output_folder = path.abspath(args.output_folder)
    result_folder = path.join(output_folder, "result")
    reports_folder = path.join(output_folder, "reports")
    cache_folder = path.join(result_folder, "cache")
    tmp_folder = path.join(result_folder, "tmp")
    data_db_path = path.join(result_folder, "data.db")
    program_db_folder = path.abspath(args.program_db_folder)
    resume = args.resume
    input_run_id = args.run_id
    use_memory = not args.scratch
    early_dumping = args.early_dumping

    # show --version
    if args.version:
        # fetch hmm database md5s
        try:
            md5_biosyn_pfam = open(path.join(
                program_db_folder, "biosynthetic_pfams",
                "biopfam.md5sum"), "r").readline().rstrip()
            md5_sub_pfam = open(path.join(
                program_db_folder, "sub_pfams",
                "corepfam.md5sum"), "r").readline().rstrip()
        except FileNotFoundError:
            print("Can't find HMM model libraries!")
            return 1
        # try fetching HMM database models version
        db_folder_info = HMMDatabase.fetch_db_folder_info(program_db_folder)
        # print
        print((
            "==============\n"
            "BiG-SLiCE version {}\n"
            "HMM databases version: {}\n"
            "Biosynthetic-pfam md5: {}\n"
            "Sub-pfam md5: {}\n"
            "==============\n"
        ).format(
            version,
            "custom" if db_folder_info == None\
            else db_folder_info["name"],
            md5_biosyn_pfam,
            md5_sub_pfam
        ))
        return 0

    # first gate keeping -- conflicting modes
    if resume and args.input_folder:
        print("--resume is selected but --input_folder is specified, " +
              "please select either!")
        return 1
    if args.query and args.input_folder:
        print("--query is selected but --input_folder is specified, " +
              "please select either!")
        return 1
    if args.query and resume:
        print("--query is selected but --resume is also selected, " +
              "please select either!")
        return 1

    if resume or args.query:
        if not path.exists(result_folder):
            print("Output folder didn't exists!")
            return 1
        if not path.exists(cache_folder):
            makedirs(cache_folder)

    # prepare runs
    pool = fetch_pool(args.num_threads)

    # check if query mode, goto its specific workflow
    if args.query:
        return query_mode(args.query_name, args.query, input_run_id,
                          pool, program_db_folder, data_db_path,
                          args.n_ranks, reports_folder,
                          cache_folder, normalize_feature=args.normalize_feature)

    # check if export-tsv mode
    if args.export_tsv:
        return export_tsv_to_folder(output_folder, args.export_tsv)

    # check if specified folders exist
    if args.input_folder:
        input_folder = path.abspath(args.input_folder)
        if not path.exists(input_folder):
            print("Can't find {}!".format(input_folder))
            return 1

    # create output folder if not exists
    if not path.exists(output_folder):
        print("creating output folder...")
        makedirs(output_folder)
        # copy template from flask_app
        copy_output_template(output_folder)
        # make result folder
        makedirs(result_folder)
        # make cache folder
        makedirs(cache_folder)
    else:
        if not resume:
            user_input = input("Folder " + output_folder +
                               " exists! continue running program (Y/[N])? ")
            if user_input not in ["Y", "y"]:
                print("Cancelled.")
                return 1
            del user_input

    # check if database backup exists
    if use_memory:
        db_backup_path = data_db_path + ".bak"
        if path.exists(db_backup_path):
            user_input = input("File " + db_backup_path +
                               " exists! it will get overwritten," +
                               " continue (Y/[N])?")
            if user_input not in ["Y", "y"]:
                print("Cancelled.")
                return 1
            del user_input
        del db_backup_path

    # load/create SQLite3 database
    if use_memory:
        print("Loading database into memory (this can take a while)...")
    get_elapsed()
    with Database(data_db_path, use_memory) as output_db:
        print("[{}s] loading sqlite3 database".format(get_elapsed()))

        # check/verify HMM databases folder
        db_folder_info = HMMDatabase.fetch_db_folder_info(program_db_folder)
        if db_folder_info == None:
            print(("[!!] Using customized HMM databases"
                " (if this is not intended, please reinstall"
                " your HMM databases)"
                ))
        else:
            print(("Using HMM database version '{}'"
                " (built using antiSMASH version {})"
                ).format(
                    db_folder_info["name"],
                    db_folder_info["antismash_ver"]
                ))

        # load HMM databases
        print("Loading HMM databases...")
        get_elapsed()
        hmm_db = HMMDatabase.load_folder(program_db_folder, output_db)
        md5_biosyn_pfam = hmm_db.md5_biosyn_pfam
        md5_sub_pfam = hmm_db.md5_sub_pfam
        if hmm_db.id < 1:
            hmm_db.save(output_db)
            output_db.commit_inserts()
        print("[{}s] loading hmm databases".format(get_elapsed()))

        if resume:
            if input_run_id > -1:
                # fetch specific run_id
                run = Run.fetch(input_run_id, output_db)
                if run.hmm_db_id != hmm_db.id:
                    print("Resumed run used different " +
                          "HMM database source! exiting..")
                    return 1
            else:
                # fetch latest run data
                run = Run.get_latest(hmm_db.id, output_db)
            if not run:
                print("No run data to resume, exiting..")
                return 1
            else:
                print("Resuming run #{}".format(
                    run.id
                ))

                # re-load old params
                resumed_params = run.prog_params.split(" ")
                for i, param in enumerate(resumed_params):
                    if param == "--complete":
                        print("using --complete" + " (from resumed run)")
                        args.complete = True
                    elif param == "--threshold":
                        value = float(resumed_params[i + 1])
                        print("using --threshold {}".format(value) +
                              " (from resumed run)")
                        args.threshold = value
                    elif param == "--threshold_pct":
                        value = float(resumed_params[i + 1])
                        print("using --threshold_pct {}".format(value) +
                              " (from resumed run)")
                        args.threshold_pct = value
                    elif param == "--n_ranks":
                        value = int(resumed_params[i + 1])
                        print("using --n_ranks {}".format(value) +
                              " (from resumed run)")
                        args.n_ranks = value
                    elif param == "--hmmscan_chunk_size":
                        value = int(resumed_params[i + 1])
                        print(
                            "using --hmmscan_chunk_size {}".format(value) +
                            " (from resumed run)")
                        args.hmmscan_chunk_size = value
                    elif param == "--subpfam_chunk_size":
                        value = int(resumed_params[i + 1])
                        print(
                            "using --subpfam_chunk_size {}".format(value) +
                            " (from resumed run)")
                        args.subpfam_chunk_size = value
                    elif param == "--extraction_chunk_size":
                        value = int(resumed_params[i + 1])
                        print(
                            "using --extraction_chunk_size {}".format(value) +
                            " (from resumed run)")
                        args.extraction_chunk_size = value

                run.log("run resumed " + str(len(run.bgcs)) + " BGCs")
        else:
            # read input folder
            all_bgc_ids = set()
            datasets_count = 0

            for dataset_name, dataset_bgc_ids in process_input_folder(
                    args.input_folder, output_db, pool).items():
                datasets_count += 1
                all_bgc_ids.update(dataset_bgc_ids)
            print("Found {} BGC(s) from {} dataset(s)".format(
                len(all_bgc_ids),
                datasets_count))

            # create new run data
            run = Run.create(all_bgc_ids, hmm_db.id, prog_params, output_db)
            output_db.commit_inserts()
            run.log("run created " + str(len(run.bgcs)) + " BGCs")

            if use_memory and early_dumping and early_dumping:
                output_db.dump_db_file()

        # set up parameters (that are re-load-able from run if --resume is on)
        hmmscan_chunk_size = args.hmmscan_chunk_size
        subpfam_chunk_size = args.subpfam_chunk_size
        extraction_chunk_size = args.extraction_chunk_size
        clustering_threshold = args.threshold
        clustering_threshold_percentile = args.threshold_pct
        clustering_complete_only = args.complete
        n_ranks = args.n_ranks
        if n_ranks < 1:
            print("--n_ranks can't be lower than 1.")
            return 1

        # check run_bgc status, bin into a list of statuses
        get_elapsed()
        bgc_status_bin = {
            1: set(),  # RUN_STARTED
            2: set(),  # BIOSYN_SCANNED
            3: set(),  # SUBPFAM_SCANNED
            4: set(),  # FEATURES_EXTRACTED
            5: set(),  # CLUSTERING_FINISHED
            6: set(),  # MEMBERSHIPS_ASSIGNED
            7: set()   # RUN_FINISHED
        }
        print("Checking run status of {} BGCs...".format(len(run.bgcs)))
        for bgc_id, bgc_status in tqdm(run.bgcs.items()):
            if bgc_status > run.status:
                print("bgc_id {} run_status is {} while the whole run is \
                    {}. the run data might be corrupted!".format(
                    bgc_id, bgc_status, run.status))
                return 1
            bgc_status_bin[bgc_status].add(bgc_id)
        print("[{}s] checking run status".format(get_elapsed()))

        # phase 2: HMM scanning (BIOSYNC_SCANNED)
        if len(bgc_status_bin[1]) > 0:
            print("Doing biosyn_pfam scan on {} BGCs...".format(
                len(bgc_status_bin[1])))
            run.log("biosyn_pfam start " +
                    str(len(bgc_status_bin[1])) + " BGCs")
            get_elapsed()
            hmm_scanned = set()

            # check if already hmmscanned in previous run
            run.log("check existing hmmscan results...")
            pbar = tqdm(total=len(bgc_status_bin[1]), mininterval=1)
            for chunk, chunk_name in get_chunk(
                    list(bgc_status_bin[1]), args.num_threads, 1000):
                for bgc_id in check_hmmscanned_in_db(
                        chunk, md5_biosyn_pfam, output_db):
                    if update_bgc_status(bgc_id, run.id, 2, output_db) == 1:
                        hmm_scanned.add(bgc_id)
                    else:
                        print("Failed to update bgc status " +
                              "(bgc_id: " + str(bgc_id) + ") " +
                              "(run_id: " + str(run.id) + ")")
                        return 1
                pbar.update(len(chunk))
            pbar.close()

            print("{} BGCs are already scanned in previous run".format(
                len(hmm_scanned)))

            # prepare for hmmscan
            biosyn_tmp_folder = path.join(tmp_folder, md5_biosyn_pfam)
            if not path.exists(biosyn_tmp_folder):
                makedirs(biosyn_tmp_folder)

            def fasta_path(chunk_name):
                return path.join(
                    biosyn_tmp_folder, "bgc_{}.fa".format(chunk_name))

            def hmm_result_path(chunk_name):
                return path.join(
                    biosyn_tmp_folder, "bgc_{}.hmmtxt".format(chunk_name))

            to_be_hmmscanned = list(bgc_status_bin[1] - hmm_scanned)
            if len(to_be_hmmscanned) > 0:

                print("Preparing fasta files for hmmscans...")
                run.log("hmmscan prepare " +
                        str(len(to_be_hmmscanned)) + " BGCs")

                hmmscan_queues = set()
                hmm_path = path.join(program_db_folder,
                                     "biosynthetic_pfams",
                                     "Pfam-A.biosynthetic.hmm")
                pbar = tqdm(total=len(to_be_hmmscanned), mininterval=1)
                for chunk, chunk_name in get_chunk(
                        to_be_hmmscanned,
                        args.num_threads,
                        hmmscan_chunk_size):

                    fasta_sequences = BGC.get_all_cds_fasta(
                        chunk, output_db)

                    # write down all CDSes multifasta to be hmmscanned
                    in_fasta_path = fasta_path(chunk_name)
                    out_result_path = hmm_result_path(chunk_name)

                    if not path.exists(in_fasta_path):
                        with open(in_fasta_path, "w") as fa:
                            fa.write(fasta_sequences)
                            hmmscan_queues.add(
                                (in_fasta_path, hmm_path,
                                    out_result_path, "ga", True))
                    pbar.update(len(chunk))
                pbar.close()

                print("Running hmmsearch in parallel..." +
                        str(len(to_be_hmmscanned)) + " BGCs" +
                        " (in " + str(len(hmmscan_queues)) + " chunks)")
                hmm_ids = {
                    hmm.accession: hmm.id for hmm in hmm_db.biosyn_pfams}
                pbar = tqdm(total=len(to_be_hmmscanned), mininterval=1)
                for chunk, chunk_name in get_chunk(
                        to_be_hmmscanned,
                        args.num_threads,
                        hmmscan_chunk_size):

                    in_fasta_path = fasta_path(chunk_name)

                    sequences = list(easel.SequenceFile(in_fasta_path, digital=True, alphabet=easel.Alphabet.amino()))
                    with plan7.HMMFile(hmm_path) as hmm_file:
                        for top_hits in hmmer.hmmsearch(
                            hmm_file, sequences,
                            cpus=args.num_threads, bit_cutoffs="gathering"
                            ):
                            for hit in top_hits:
                                if hit.best_domain.score < top_hits.domT:
                                    continue

                                target_name = hit.name.decode()
                                hmm_name = top_hits.query_accession.decode()
                                alignment = hit.best_domain.alignment

                                bgc_id, cds_id, parent_hsp_id, locs = target_name.split("|")
                                bgc_id = int(bgc_id.split("bgc:")[-1])
                                cds_id = int(cds_id.split("cds:")[-1])
                                parent_hsp_id = int(parent_hsp_id.split("hsp:")[-1])
                                locs = tuple(map(int, locs.split("-")))

                                hsp_alignment = {
                                    "model_start": alignment.hmm_from - 1,
                                    "model_end": alignment.hmm_to,
                                    "model_gaps": [i for i, c
                                                   in enumerate(alignment.hmm_sequence)
                                                   if c == '.'],
                                    "cds_start": alignment.target_from + locs[0] - 1,
                                    "cds_end": alignment.target_to + locs[0],
                                    "cds_gaps": [i for i, c
                                                 in enumerate(str(alignment.target_sequence))
                                                 if c == '-']
                                }
                                HSP({
                                    "cds_id": cds_id,
                                    "hmm_id": hmm_ids[hmm_name],
                                    "parent_hsp_id": parent_hsp_id,
                                    "bitscore": hit.best_domain.score,
                                    "alignment": hsp_alignment
                                }).save(output_db)

                    output_db.commit_inserts()

                    for bgc_id in chunk:
                        if update_bgc_status(
                                bgc_id, run.id, 2, output_db) == 1:
                            hmm_scanned.add(bgc_id)
                        else:
                            print("Failed to update bgc status " +
                                  "(bgc_id: " + str(bgc_id) + ") " +
                                  "(run_id: " + str(run.id) + ")")

                    # clean up
                    remove(in_fasta_path)

                    pbar.update(len(chunk))
                pbar.close()

            # update status bin
            for bgc_id in hmm_scanned:
                assert run.bgcs[bgc_id] == 1
                run.bgcs[bgc_id] = 2
            bgc_status_bin[1] = bgc_status_bin[1] - hmm_scanned
            bgc_status_bin[2].update(hmm_scanned)

            if len(to_be_hmmscanned) > 0 and use_memory and early_dumping:
                output_db.dump_db_file()

            # garbage collection
            del to_be_hmmscanned
            del biosyn_tmp_folder
            del fasta_path
            del hmm_result_path

            print("[{}s] biosyn_pfam scan".format(get_elapsed()))
            run.log("biosyn_pfam end")
        if len(bgc_status_bin[1]) == 0:
            if run.status < 2:
                if run.update_status(2) > 0:
                    print("run_status is now BIOSYN_SCANNED")
                else:
                    print("failed to update run_status")
        else:
            print("Encountered errors in hmmscans, " +
                  str(len(bgc_status_bin[1])) +
                  " BGCs are not yet scanned.")
            return 1

        # phase 3: Subpfam scanning
        if len(bgc_status_bin[2]) > 0:
            get_elapsed()
            print("Doing sub_pfam scan on {} BGCs...".format(
                len(bgc_status_bin[2])))
            run.log("sub_pfam start " +
                    str(len(bgc_status_bin[2])) + " BGCs")

            subpfam_scanned = set()

            # check if already subpfam_scanned in previous run
            run.log("check existing hmmscan results...")
            pbar = tqdm(total=len(bgc_status_bin[2]), mininterval=1)
            for chunk, chunk_name in get_chunk(
                    list(bgc_status_bin[2]),
                    args.num_threads, subpfam_chunk_size):
                for bgc_id in check_subpfam_scanned_in_db(
                        chunk, md5_sub_pfam, output_db):
                    if update_bgc_status(bgc_id, run.id, 3, output_db) == 1:
                        subpfam_scanned.add(bgc_id)
                    else:
                        print("Failed to update bgc status " +
                              "(bgc_id: " + str(bgc_id) + ") " +
                              "(run_id: " + str(run.id) + ")")
                        return 1
                pbar.update(len(chunk))
            pbar.close()

            print("{} BGCs are already scanned in previous run".format(
                len(subpfam_scanned)))

            # prepare for subpfam_scan
            subpfam_tmp_folder = path.join(tmp_folder, md5_sub_pfam)
            if not path.exists(subpfam_tmp_folder):
                makedirs(subpfam_tmp_folder)

            def fasta_path(bgcs_md5, hmm_id):
                return path.join(
                    subpfam_tmp_folder, "bgc_{}_hmm_{}.fa".format(
                        bgcs_md5, hmm_id))

            def hmm_result_path(bgcs_md5, hmm_id):
                return path.join(
                    subpfam_tmp_folder, "bgc_{}_hmm_{}.hmmtxt".format(
                        bgcs_md5, hmm_id))

            # fetch core pfam ids
            core_pfam_ids = {}
            core_pfam_accs = {}
            for parent_hmm_acc in hmm_db.sub_pfams:
                for hmm_obj in hmm_db.biosyn_pfams:
                    if hmm_obj.accession == parent_hmm_acc:
                        core_pfam_ids[hmm_obj.accession] = hmm_obj.id
                        core_pfam_accs[hmm_obj.id] = hmm_obj.accession
                        break

            to_be_subpfam_scanned = list(bgc_status_bin[2] - subpfam_scanned)
            if len(to_be_subpfam_scanned) > 0:

                chunks_and_pfam_ids = {}

                print("Preparing fasta files for subpfam_scans...")
                run.log("hmmscan prepare " +
                        str(len(to_be_subpfam_scanned)) + " BGCs")
                hmmscan_queues = set()
                pbar = tqdm(total=len(to_be_subpfam_scanned), mininterval=1)
                for chunk, chunk_name in get_chunk(
                        to_be_subpfam_scanned,
                        args.num_threads,
                        subpfam_chunk_size):

                    fasta_sequences = BGC.get_all_aligned_hsp(
                        chunk, core_pfam_ids.values(), output_db)
                    chunks_and_pfam_ids[chunk_name] = set(
                        fasta_sequences.keys())

                    for parent_hmm_id in fasta_sequences:
                        in_fasta_path = fasta_path(
                            chunk_name, parent_hmm_id)
                        hmm_path = path.join(program_db_folder,
                                             "sub_pfams",
                                             "hmm",
                                             core_pfam_accs[
                                                 parent_hmm_id] +
                                             ".subpfams.hmm")
                        out_result_path = hmm_result_path(
                            chunk_name, parent_hmm_id)

                        if not path.exists(in_fasta_path):
                            with open(in_fasta_path, "w") as fa:
                                fa.write(fasta_sequences[parent_hmm_id])

                        hmmscan_queues.add(
                            (in_fasta_path,
                                hmm_path,
                                out_result_path,
                                20,  # for subpfam_scan
                                False
                             ))
                    pbar.update(len(chunk))
                pbar.close()

                # do hmmscans in parallel
                print("Running subpfam_scans in parallel... " +
                        str(len(to_be_subpfam_scanned)) + " BGCs")
                run.log("hmmscan run " +
                        str(len(to_be_subpfam_scanned)) + " BGCs")

                use_pyhmmer = True
                use_multi_pyhmmer = True
                use_pyhmmer_scan = False
                top_k = 3
                if use_pyhmmer:
                    if use_multi_pyhmmer:
                        subpfam_queues = []
                        for chunk, chunk_name in get_chunk(
                            to_be_subpfam_scanned,
                            args.num_threads,
                            subpfam_chunk_size):
                            chunk_parent_pfams = chunks_and_pfam_ids[chunk_name]
                            for parent_hmm_id in chunk_parent_pfams:
                                in_fasta_path = fasta_path(
                                    chunk_name, parent_hmm_id)
                                sub_pfam_ids = {
                                    hmm_obj.name: hmm_obj.id
                                    for hmm_obj in
                                    hmm_db.sub_pfams[core_pfam_accs[parent_hmm_id]]
                                }
                                hmm_path = path.join(program_db_folder,
                                                     "sub_pfams",
                                                     "hmm",
                                                     core_pfam_accs[
                                                         parent_hmm_id] +
                                                     ".subpfams.hmm")
                                subpfam_queues.append((in_fasta_path, hmm_path, sub_pfam_ids))

                        if use_pyhmmer_scan:
                            parallel_proc_func = run_subpfam_scan_hmmscan
                        else:
                            parallel_proc_func = run_subpfam_scan

                        for results in tqdm(
                            pool.imap(parallel_proc_func, subpfam_queues
                        ), total=len(subpfam_queues), mininterval=1):
                            for hsp in results:
                                HSP(hsp).save(output_db)

                        output_db.commit_inserts()

                        
                        for chunk, chunk_name in get_chunk(
                            to_be_subpfam_scanned,
                            args.num_threads,
                            subpfam_chunk_size):                            
                            for bgc_id in chunk:
                                if update_bgc_status(
                                        bgc_id, run.id, 3, output_db) == 1:
                                    subpfam_scanned.add(bgc_id)
                                else:
                                    print("Failed to update bgc status " +
                                          "(bgc_id: " + str(bgc_id) + ") " +
                                          "(run_id: " + str(run.id) + ")")


                        # clean up
                        for chunk, chunk_name in get_chunk(
                            to_be_subpfam_scanned,
                            args.num_threads,
                            subpfam_chunk_size):
                            chunk_parent_pfams = chunks_and_pfam_ids[chunk_name]
                            for parent_hmm_id in chunk_parent_pfams:
                                for parent_hmm_id in chunk_parent_pfams:
                                    pass
                                    #remove(fasta_path(
                                    #    chunk_name, parent_hmm_id))

                    else:
                        pbar = tqdm(total=len(hmmscan_queues), mininterval=1)
                        for chunk, chunk_name in get_chunk(
                                to_be_subpfam_scanned,
                                args.num_threads,
                                subpfam_chunk_size):
                            chunk_parent_pfams = chunks_and_pfam_ids[chunk_name]
                            for parent_hmm_id in chunk_parent_pfams:
                                in_fasta_path = fasta_path(
                                    chunk_name, parent_hmm_id)
                                sub_pfam_ids = {
                                    hmm_obj.name: hmm_obj.id
                                    for hmm_obj in
                                    hmm_db.sub_pfams[core_pfam_accs[parent_hmm_id]]
                                }
                                hmm_path = path.join(program_db_folder,
                                                     "sub_pfams",
                                                     "hmm",
                                                     core_pfam_accs[
                                                         parent_hmm_id] +
                                                     ".subpfams.hmm")

                                results = {}
                                sequences = list(easel.SequenceFile(in_fasta_path, digital=True, alphabet=easel.Alphabet.amino()))
                                with plan7.HMMFile(hmm_path) as hmm_file:
                                    for top_hits in hmmer.hmmsearch(
                                        hmm_file, sequences,
                                        cpus=args.num_threads, T=20
                                        ):
                                        for hit in top_hits:
                                            target_name = hit.name.decode()
                                            hmm_name = top_hits.query_name.decode()
                                            alignment = hit.best_domain.alignment

                                            bgc_id, cds_id, parent_hsp_id, locs = target_name.split("|")
                                            bgc_id = int(bgc_id.split("bgc:")[-1])
                                            cds_id = int(cds_id.split("cds:")[-1])
                                            parent_hsp_id = int(parent_hsp_id.split("hsp:")[-1])
                                            locs = tuple(map(int, locs.split("-")))

                                            if target_name not in results:
                                                results[target_name] = []
                                            results[target_name].append({
                                                "cds_id": cds_id,
                                                "hmm_id": sub_pfam_ids[hmm_name],
                                                "parent_hsp_id": parent_hsp_id,
                                                "bitscore": int(hit.best_domain.score)
                                            })

                                for target_name, hsps in results.items():
                                    for i, hsp in enumerate(
                                        sorted(hsps, key=lambda x: x["bitscore"], reverse=True)
                                    ):
                                        if i >= top_k:
                                            break
                                        adjusted_bitscore = 255 - int((255 / top_k) * i)
                                        HSP(hsp).save(output_db)

                                pbar.update(1)

                            for bgc_id in chunk:
                                if update_bgc_status(
                                        bgc_id, run.id, 3, output_db) == 1:
                                    subpfam_scanned.add(bgc_id)
                                else:
                                    print("Failed to update bgc status " +
                                          "(bgc_id: " + str(bgc_id) + ") " +
                                          "(run_id: " + str(run.id) + ")")

                            output_db.commit_inserts()

                            # clean up
                            for parent_hmm_id in chunk_parent_pfams:
                                remove(fasta_path(
                                    chunk_name, parent_hmm_id))

                        pbar.close()

                else:

                    _ = list(tqdm(pool.imap(
                        run_hmmscan, hmmscan_queues), total=len(hmmscan_queues), mininterval=1))

                    print("Parsing subpfam_scans results...")
                    run.log("hmmscan parse " +
                            str(len(to_be_subpfam_scanned)) + " BGCs")
                    pbar = tqdm(total=len(to_be_subpfam_scanned), mininterval=1)
                    for chunk, chunk_name in get_chunk(
                            to_be_subpfam_scanned,
                            args.num_threads,
                            subpfam_chunk_size):
                        chunk_parent_pfams = chunks_and_pfam_ids[chunk_name]
                        for parent_hmm_id in chunk_parent_pfams:
                            out_result_path = hmm_result_path(
                                chunk_name, parent_hmm_id)
                            sub_pfam_ids = {
                                hmm_obj.name: hmm_obj.id
                                for hmm_obj in
                                hmm_db.sub_pfams[core_pfam_accs[parent_hmm_id]]
                            }

                            for hsp_object in HSP.parse_hmmtext(
                                    out_result_path, sub_pfam_ids,
                                    save_alignment=False,
                                    top_k=top_k,
                                    rank_normalize=True):
                                hsp_object.save(output_db)

                        output_db.commit_inserts()

                        for bgc_id in chunk:
                            if update_bgc_status(
                                    bgc_id, run.id, 3, output_db) == 1:
                                subpfam_scanned.add(bgc_id)
                            else:
                                print("Failed to update bgc status " +
                                      "(bgc_id: " + str(bgc_id) + ") " +
                                      "(run_id: " + str(run.id) + ")")

                        # clean up
                        for parent_hmm_id in []:#chunk_parent_pfams:
                            remove(fasta_path(
                                chunk_name, parent_hmm_id))
                            remove(hmm_result_path(
                                chunk_name, parent_hmm_id))

                        pbar.update(len(chunk))
                    pbar.close()

            # update status bin
            for bgc_id in subpfam_scanned:
                assert run.bgcs[bgc_id] == 2
                run.bgcs[bgc_id] = 3
            bgc_status_bin[2] = bgc_status_bin[1] - subpfam_scanned
            bgc_status_bin[3].update(subpfam_scanned)

            print("[{}s] sub_pfam scan".format(get_elapsed()))
            run.log("sub_pfam end")

            if len(to_be_subpfam_scanned) > 0 and use_memory and early_dumping:
                output_db.dump_db_file()

        if len(bgc_status_bin[2]) == 0:
            if run.status < 3:
                if run.update_status(3) > 0:
                    print("run_status is now SUBPFAM_SCANNED")
                else:
                    print("failed to update run_status")
        else:
            print("Encountered errors in hmmscans, " +
                  str(len(bgc_status_bin[2])) +
                  " BGCs are not yet scanned.")
            return 1

        # phase 4: Features extraction
        if 2 < run.status < 4:
            get_elapsed()
            print("Extracting features from {} BGCs...".format(len(run.bgcs)))
            run.log("features_extraction start " +
                    str(len(run.bgcs)) + " BGCs")

            bgc_features = {}

            # check if already features_extracted in previous run
            pickled_file_path = path.join(
                cache_folder, "bgc_features_{}.pkl".format(hmm_db.id))
            if path.exists(pickled_file_path):
                bgc_features = pd.read_pickle(pickled_file_path).to_dict(orient='index')

            # update status for previously-extracted features
            for bgc_id in bgc_features:
                if update_bgc_status(bgc_id, run.id, 4, output_db) == 1:
                    pass
                else:
                    print("Failed to update bgc status " +
                          "(bgc_id: " + str(bgc_id) + ") " +
                          "(run_id: " + str(run.id) + ")")
                    return 1

            print("{} BGCs are already extracted in previous run".format(
                len(bgc_features)))

            features_extracted = set(bgc_features.keys())
            to_be_extracted = list(bgc_status_bin[3] - features_extracted)
            if len(to_be_extracted) > 0:

                print("Extracting features...")
                run.log("features_extraction extract " +
                        str(len(to_be_extracted)) + " BGCs")

                pbar = tqdm(total=len(to_be_extracted), mininterval=1)
                for chunk, chunk_name in get_chunk(
                        to_be_extracted,
                        args.num_threads,
                        extraction_chunk_size):

                    for feature in Features.extract(
                        chunk, hmm_db.id, output_db
                    ):
                        feature.save(output_db)
                        if feature.bgc_id not in bgc_features:
                            bgc_features[feature.bgc_id] = {}
                        bgc_features[feature.bgc_id][feature.hmm_id] = feature.value
                    output_db.commit_inserts()

                    for bgc_id in chunk:
                        if update_bgc_status(
                                bgc_id, run.id, 4, output_db) == 1:
                            features_extracted.add(bgc_id)
                        else:
                            print("Failed to update bgc status " +
                                  "(bgc_id: " + str(bgc_id) + ") " +
                                  "(run_id: " + str(run.id) + ")")

                    pbar.update(len(chunk))
                pbar.close()

                run.log("dumping pickled bgc features...")
                hmm_ids = [row[0] for row in output_db.select(
                    "hmm,run",
                    "WHERE hmm.db_id=run.hmm_db_id" +
                    " AND run.id=?",
                    parameters=(run.id, ),
                    props=["hmm.id"],
                    as_tuples=True
                )]
                bgc_features = pd.DataFrame.from_dict(
                    bgc_features, orient='index'
                ).reindex(columns=hmm_ids).fillna(0).astype(np.uint8)
                store_pickle(bgc_features, pickled_file_path)

            # update status bin
            for bgc_id in features_extracted:
                assert run.bgcs[bgc_id] == 3
                run.bgcs[bgc_id] = 4
            bgc_status_bin[3] = bgc_status_bin[2] - features_extracted
            bgc_status_bin[4].update(features_extracted)

            print("[{}s] features extraction".format(get_elapsed()))
            run.log("features_extraction end")

            if len(to_be_extracted) > 0 and use_memory and early_dumping:
                output_db.dump_db_file()

        if len(bgc_status_bin[3]) == 0:
            if run.status < 4:
                if run.update_status(4) > 0:
                    print("run_status is now FEATURES_EXTRACTED")
                else:
                    print("failed to update run_status")
        else:
            print("Encountered errors in feature_extraction, " +
                  str(len(bgc_status_bin[3])) +
                  " BGCs are not yet extracted.")
            return 1

        # phase 5: Clustering
        if 3 < run.status < 5:
            get_elapsed()
            print("Building GCF models...")
            run.log("clustering start " +
                    str(len(run.bgcs)) + " BGCs")

            # build GCF models
            run.log("BIRCH run start")
            clustering = BirchClustering.run(
                run.id,
                output_db,
                cache_folder,
                complete_only=clustering_complete_only,
                threshold=clustering_threshold,
                threshold_percentile=clustering_threshold_percentile
            )
            clustering.save(output_db, cache_folder=cache_folder)
            output_db.commit_inserts()
            del clustering
            run.log("BIRCH run end")

            print("[{}s] clustering".format(get_elapsed()))
            run.log("clustering end")
            update_bgcs_status = output_db.update("run_bgc_status",
                                                  {"status": 5},
                                                  "WHERE run_id=?",
                                                  (run.id,))
            if update_bgcs_status > 0 and run.update_status(5) > 0:
                print("run_status is now CLUSTERING_FINISHED")
            else:
                print("failed to update run_status")

        # phase 6: Membership assignment
        if 4 < run.status < 6:
            get_elapsed()
            print("Assigning GCF membership...")
            run.log("membership assignment start for " +
                    str(len(run.bgcs)) + " BGCs")

            for membership in Membership.assign(
                run.id,
                output_db,
                top_hits=n_ranks,
                threshold=None,
                cache_folder=cache_folder
            ):
                membership.save(output_db)
            output_db.commit_inserts()

            print("[{}s] membership_assignment".format(get_elapsed()))
            run.log("membership assignment end")
            update_bgcs_status = output_db.update("run_bgc_status",
                                                  {"status": 6},
                                                  "WHERE run_id=?",
                                                  (run.id,))
            if update_bgcs_status > 0 and run.update_status(6) > 0:
                print("run_status is now MEMBERSHIPS_ASSIGNED")
            else:
                print("failed to update run_status")

        # phase 7: finishing
        if 5 < run.status < 7:
            get_elapsed()

            pass  # todo: need to do something here?

            print("[{}s] preparing_output".format(get_elapsed()))
            update_bgcs_status = output_db.update("run_bgc_status",
                                                  {"status": 7},
                                                  "WHERE run_id=?",
                                                  (run.id,))
            if update_bgcs_status > 0 and run.update_status(7) > 0:
                run.log("run finished")
                print("run_status is now RUN_FINISHED")
            else:
                print("failed to update run_status")

    # exit program cleanly
    return 0


if __name__ == "__main__":
    return_code = main()
    if return_code == 0:
        print("BiG-SLiCE run complete!")
    else:
        print("BiG-SLiCE run failed.")
