#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""convert assembly text file to json

eg$ ./sbin/assembly-to-json -p bioutils/_data/assemblies pull/GCF_000001405.29.assembly.txt

"""

# NOTE: keep this import below the docstring. A string literal placed after
# the __future__ import is an ordinary expression statement, which leaves
# __doc__ (used as the --help description) set to None.
from __future__ import absolute_import, division, print_function, unicode_literals


from argparse import ArgumentParser
import csv
import json
import logging
import os
import re
import sys

import six


class AssemblyParser(object):
    """Parse an assembly file from NCBI, like this one:
      ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001405.25.assembly.txt

    Scalar fields are read from "# <Key>: <value>" header lines; the two
    tables (assembly units and sequences) are read as tab-separated text.
    Every property re-scans the stored text on each access; nothing is cached.
    """

    def __init__(self, body):
        """Store the report text, with carriage returns removed.

        :param body: full text of the assembly report
        """
        # Removing CR lets the newline-anchored patterns below match
        # reports that were saved with CRLF line endings.
        self._body = body.replace("\r", "")

    def _header(self, pattern):
        """Return the stripped first capture group of `pattern` in the body.

        `pattern` is matched with re.MULTILINE so that "^" anchors at the
        start of any line.

        :raises AttributeError: when the pattern does not match (re.search
            returns None); optional properties catch this and return None.
        """
        return re.search(pattern, self._body, flags=re.MULTILINE).group(1).strip()

    @property
    def name(self):
        """Assembly name (first whitespace-free token). Required."""
        return self._header(r"^# Assembly name:\s+(\S+)")

    @property
    def description(self):
        """Free-text description (rest of the line), or None if absent."""
        try:
            return self._header(r"^# Description:\s+(.+)")
        except AttributeError:
            return None

    @property
    def taxid(self):
        """Taxonomy id, returned as a string. Required."""
        return self._header(r"^# Taxid:\s+(\S+)")

    @property
    def genbank_accession(self):
        """GenBank assembly accession, or None if absent."""
        try:
            return self._header(r"^# GenBank assembly accession:\s+(\S+)")
        except AttributeError:
            return None

    @property
    def refseq_accession(self):
        """RefSeq assembly accession. Required."""
        return self._header(r"^# RefSeq assembly accession:\s+(\S+)")

    @property
    def submitter(self):
        """Submitter (rest of the line). Required."""
        return self._header(r"^# Submitter:\s+(.+)")

    @property
    def date(self):
        """Report date as a zero-padded "YYYY-MM-DD" string. Required.

        The header value is split on "-" and each part converted to int, so
        unpadded values such as "2013-6-28" come out as "2013-06-28".
        """
        raw = self._header(r"^# Date:\s+(\S+)")
        ymd = [int(part) for part in raw.split("-")]
        return "{:4d}-{:02d}-{:02d}".format(*ymd)

    @property
    def assembly_units(self):
        """Rows of the "## Assembly-Units:" table, one mapping per row.

        :raises AttributeError: when the report has no such table.
        """
        match = re.search(r"## Assembly-Units:\n(.+?)\n#\n", self._body, flags=re.DOTALL)
        # Strip the "## " marker so the first remaining line serves as the
        # column-header row for DictReader.
        block = match.group(1).replace("## ", "")
        # str("\t"): under unicode_literals on Python 2 the csv module
        # needs a native str delimiter.
        return list(csv.DictReader(six.StringIO(block), delimiter=str("\t")))

    @property
    def sequences(self):
        """Rows of the sequence table, one mapping per row.

        The table is taken from the "# Sequence-Name" header line through
        the end of the text; the leading "# " is dropped so that line
        becomes the column-header row.

        :raises AttributeError: when the header line is not found.
        """
        match = re.search(r"# (Sequence-Name.+)", self._body, flags=re.DOTALL)
        block = match.group(1)
        return list(csv.DictReader(six.StringIO(block), delimiter=str("\t")))



def parse_options(argv):
    """Parse command-line arguments and return the argparse namespace.

    :param argv: full argument vector; argv[0] (the program name) is skipped
    :returns: namespace with ``ASSEMBLIES`` (list of one or more strings)
        and ``prefix`` (required output directory)
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        "ASSEMBLIES",
        nargs="+",
        help="accessions (GCF or GCA) or filenames of downloaded assemblies",
    )
    parser.add_argument(
        "--prefix", "-p",
        required=True,
        help="directory prefix for saving files -- must exist",
    )
    return parser.parse_args(argv[1:])


def build_seq_rec(seqrec):
    """Translate one row of the sequence table into the JSON record layout.

    :param seqrec: mapping keyed by the report's column names
        ("Sequence-Name", "GenBank-Accn", "UCSC-style-name", ...)
    :returns: dict with keys aliases, assembly_unit, genbank_ac, length,
        name, refseq_ac, relationship and sequence_role
    :raises KeyError: when a required column is missing from `seqrec`
    :raises ValueError: when "Sequence-Length" is not an integer string

    The literal "na" is treated as "no value" for the UCSC name (empty
    alias list) and the GenBank accession (None). All other columns are
    copied through unchanged; the "Assigned-Molecule" and
    "Assigned-Molecule-Location/Type" columns are not exported.
    """
    ucsc_name = seqrec["UCSC-style-name"]            # e.g. "chr17_ctg5_hap1"
    aliases = [] if ucsc_name == "na" else [ucsc_name]

    genbank = seqrec["GenBank-Accn"]                 # e.g. "GL000258.1"
    genbank_ac = None if genbank == "na" else genbank

    return {
        "aliases": aliases,
        "assembly_unit": seqrec["Assembly-Unit"],    # e.g. "ALT_REF_LOCI_9"
        "genbank_ac": genbank_ac,
        "length": int(seqrec["Sequence-Length"]),    # e.g. "1680828"
        "name": seqrec["Sequence-Name"],             # e.g. "HSCHR17_1_CTG5"
        "refseq_ac": seqrec["RefSeq-Accn"],          # e.g. "NT_167251.1"
        "relationship": seqrec["Relationship"],      # e.g. "="
        "sequence_role": seqrec["Sequence-Role"],    # e.g. "alt-scaffold"
    }


def process1(opts, assy_id_or_name):
    """Convert one assembly report file to JSON and return the output path.

    :param opts: parsed options; only ``opts.prefix`` (output directory) is used
    :param assy_id_or_name: path of the assembly report to read. Despite the
        name, the value is only ever opened as a file here.
    :returns: path of the JSON file written,
        ``<opts.prefix>/<assembly name>.json``
    :raises IOError/OSError: when the input cannot be read or the output
        cannot be written
    :raises AttributeError: when a required header field is missing from
        the report (raised by AssemblyParser)
    """
    # Context manager closes the input handle deterministically instead of
    # leaving it to garbage collection.
    with open(assy_id_or_name, "r") as in_fd:
        content = in_fd.read()

    assy = AssemblyParser(content)
    assy_name = assy.name

    obj = {
        "name": assy_name,
        "description": assy.description,
        "date": assy.date,
        "submitter": assy.submitter,
        "genbank_ac": assy.genbank_accession,
        "refseq_ac": assy.refseq_accession,
        "sequences": [build_seq_rec(sr) for sr in assy.sequences],
        }

    out_fn = os.path.join(opts.prefix, assy_name + ".json")

    # sort_keys keeps the output stable between runs.
    with open(out_fn, "w") as out_fd:
        json.dump(fp=out_fd, sort_keys=True, indent=2, obj=obj)
    return out_fn


if __name__ == "__main__":
    # Script entry point: convert each input in turn. A failure on one
    # input is logged with its traceback and the loop moves on to the next.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()

    opts = parse_options(sys.argv)

    for assy_id_or_name in opts.ASSEMBLIES:
        try:
            out_fn = process1(opts, assy_id_or_name)
            if out_fn is None:
                continue
            logger.info("wrote {}".format(out_fn))
        except Exception as e:
            # Name the offending input first, then log the traceback.
            logger.error("oopsie on " + assy_id_or_name)
            logger.exception(e)
            
