#!/usr/bin/env python
#
# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2014 Brown University. All rights reserved.
# 
# This file is part of BioLite.
# 
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with BioLite.  If not, see <http://www.gnu.org/licenses/>.

import argparse
import os
import shutil
import subprocess
import wget

from collections import defaultdict
from string import Template

from biolite import catalog
from biolite import config
from biolite import diagnostics
from biolite import utils
from biolite.workflows import sra

### SRA output ###

# Templates from EBI instructions here:
# http://www.ebi.ac.uk/ena/about/sra_preparing_metadata

xml = """<?xml version="1.0" encoding="UTF-8"?>
<{0}_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.{1}.xsd">
{2}
</{0}_SET>
"""

names = ('sample', 'experiment', 'run')

templates = {}

templates['study'] = Template("""
	<STUDY alias="${study}" center_name="${center}">
		<DESCRIPTOR>
			<STUDY_TITLE>${study_title}</STUDY_TITLE>
			<STUDY_TYPE existing_study_type="${study_type}"/>
		</DESCRIPTOR>
	</STUDY>
""")

templates['submission'] = Template("""
	<SUBMISSION alias="${study}" center_name="${center}">
		<ACTIONS>
			<ACTION>
				<ADD source="study.xml" schema="study"/>
			</ACTION>
			<ACTION>
				<ADD source="sample.xml" schema="sample"/>
			</ACTION>
			<ACTION>
				<ADD source="experiment.xml" schema="experiment"/>
			</ACTION>
			<ACTION>
				<ADD source="run.xml" schema="run"/>
			</ACTION>
		</ACTIONS>
	</SUBMISSION>
""")

templates['sample'] = Template("""
	<SAMPLE alias="samp_${id}" center_name="${center}">
		<TITLE>samp_${id}</TITLE>
		<SAMPLE_NAME>
			<TAXON_ID>${ncbi_id}</TAXON_ID>
			<SCIENTIFIC_NAME>${species}</SCIENTIFIC_NAME>
		</SAMPLE_NAME>
		<DESCRIPTION>${note}</DESCRIPTION>
		<SAMPLE_LINKS>
			<SAMPLE_LINK>
				<URL_LINK>
					<LABEL>ITIS ID ${itis_id}</LABEL>
					<URL>%s${itis_id}</URL>
				</URL_LINK>
			</SAMPLE_LINK>
		</SAMPLE_LINKS>
		<SAMPLE_ATTRIBUTES>
			<SAMPLE_ATTRIBUTE>
				<TAG>tissue</TAG>
				<VALUE>${tissue}</VALUE>
			</SAMPLE_ATTRIBUTE>
			<SAMPLE_ATTRIBUTE>
				<TAG>itis_id</TAG>
				<VALUE>${itis_id}</VALUE>
			</SAMPLE_ATTRIBUTE>
		</SAMPLE_ATTRIBUTES>
	</SAMPLE>
""" % catalog.itis_url.replace('&', '&amp;'))

templates['experiment'] = Template("""
	<EXPERIMENT alias="exp_${id}" center_name="${center}">
		<TITLE>exp_${id}</TITLE>
		<STUDY_REF refname="${study}" refcenter="${center}"/>
		<DESIGN>
			<DESIGN_DESCRIPTION></DESIGN_DESCRIPTION>
			<SAMPLE_DESCRIPTOR refname="samp_${id}" refcenter="${center}"/>
			<LIBRARY_DESCRIPTOR>
				<LIBRARY_NAME>${library_id}</LIBRARY_NAME>
				<LIBRARY_STRATEGY>${library_strategy}</LIBRARY_STRATEGY>
				<LIBRARY_SOURCE>${library_source}</LIBRARY_SOURCE>
				<LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION>
				<LIBRARY_LAYOUT>
					<PAIRED${insert_size}/>
				</LIBRARY_LAYOUT>
				<LIBRARY_CONSTRUCTION_PROTOCOL>${sample_prep}</LIBRARY_CONSTRUCTION_PROTOCOL>
			</LIBRARY_DESCRIPTOR>
		</DESIGN>
		<PLATFORM>
			<ILLUMINA>
				<INSTRUMENT_MODEL>${sequencer}</INSTRUMENT_MODEL>
			</ILLUMINA>
		</PLATFORM>
	</EXPERIMENT>
""")

templates['run'] = Template("""
	<RUN alias="run_${id}" center_name="${center}">
		<TITLE>run_${id}</TITLE>
		<EXPERIMENT_REF refname="exp_${id}"/>
		<PLATFORM>
			<ILLUMINA>
				<INSTRUMENT_MODEL>${sequencer}</INSTRUMENT_MODEL>
			</ILLUMINA>
		</PLATFORM>
		<DATA_BLOCK>
			<FILES>
				<FILE filename="${path0}" filetype="fastq" checksum_method="MD5" checksum="${checksum0}"/>
				<FILE filename="${path1}" filetype="fastq" checksum_method="MD5" checksum="${checksum1}"/>
			</FILES>
		</DATA_BLOCK>
	</RUN>
""")


def validate_sra(**kwargs):
	"""
	Validate five XML files in the specified `workdir` against the appropriate
	SRA schema:

	* submission.xml
	* study.xml
	* sample.xml
	* experiment.xml
	* run.xml
	"""
	for name in ('submission', 'study', 'sample', 'experiment', 'run'):
		subprocess.call([
			'xmllint', '--noout', '--schema',
			os.path.join(config.datadir, 'SRA.%s.xsd' % name), name+'.xml'],
			cwd=kwargs.get('workdir', None))


def export_sra(**kwargs):
	"""
	Create four XML files (study, sample, experiment, run) with entries for
	each catalog ID, and make local uncompressed copies of the sequence files.
	Create a submission XML file for upload to SRA.
	"""
	content = defaultdict(list)
	outdir = utils.safe_mkdir(kwargs['outdir'])
	os.chdir(outdir)
	open('study.xml', 'w').write(
		xml.format('STUDY', 'study', templates['study'].substitute(kwargs)))
	for id in kwargs['IDS']:
		record = catalog.select(id)._asdict()
		record.update(kwargs)
		paths = catalog.split_paths(record['paths'])
		if len(paths) != 2 or not record['sequencer'].startswith('Illumina'):
			utils.die(id, "is an unsupported data type\n ",
			              "SRA export only supports paired-end Illumina data")
		# Infer a few additional SRA attributes from the library type.
		if record['library_type'] == 'genome':
			record['library_strategy'] = 'WGS'
			record['library_source'] = 'GENOMIC'
		elif record['library_type'] == 'transcriptome':
			record['library_strategy'] = 'RNA-Seq'
			record['library_source'] = 'TRANSCRIPTOMIC'
		else:
			utils.die("unknown library type '%s' in catalog for '%s'" % (
			          record['library_type'], id))
		# Look for insert size estimates in the diagnostics
		insert_size = diagnostics.lookup_by_id(id, diagnostics.INSERT_SIZE)
		if 'mean' in insert_size:
			record['insert_size'] = ' NOMINAL_LENGTH="%.0f"' % float(insert_size['mean'])
			if 'stddev' in insert_size:
				record['insert_size'] += ' NOMINAL_SDEV="%.f"' % float(insert_size['stddev'])
		else:
			record['insert_size'] = ''
		# Make local copy of files and checksum them
		for i in (0,1):
			src = paths[i]
			dst = os.path.abspath(os.path.basename(src).rstrip('.gz'))
			if not os.path.exists(dst):
				if src.endswith('.gz'):
					utils.info("uncompressing '%s'" % src)
					os.system("gunzip -c %s >%s" % (src, dst))
				else:
					utils.info("copying '%s'" % src)
					shutil.copyfile(src, dst)
			else:
				utils.info("found '%s'" % dst)
			if not os.path.exists(dst+'.md5'):
				md5 = utils.md5sum(dst)
				open(dst+'.md5', 'w').write(md5)
			record['path%d' % i] = os.path.basename(dst)
			record['checksum%d' % i] = open(dst+'.md5').readline()
		# Write XML files
		for name in names:
			content[name].append(templates[name].substitute(record))
	for name in names:
		open(name+'.xml', 'w').write(
			xml.format(name.upper(), name, '\n'.join(content[name])))
	open('submission.xml', 'w').write(
		xml.format(
			'SUBMISSION', 'submission',
			templates['submission'].substitute(kwargs)))
	validate_sra(workdir=outdir)


def import_sra(**kwargs):
	"""
	Retrieve and catalog all data sets associated with the SRA accession
	number, which can be for a study, sample, experiment or individual run.
	"""
	if 'email' in kwargs:
		sra.Entrez.email = kwargs['email']
	clean = kwargs.get('clean', False)
	gzip = kwargs.get('gzip', False)
	datadir = kwargs.get('datadir', None)

	ids = sra.all_ids(kwargs['ACCESSION'])

	if datadir:
		os.chdir(datadir)

	for xml in sra.download_xmls(ids):
		record = sra.xml_metadata(xml)
		utils.info('found experiment', record['id'])
		if not kwargs['metadata']:
			new_paths = []
			for run_id in record['paths']:
				ftp_path = sra.ftp_url(run_id)
				path = run_id+'.sra'
				utils.info('downloading', ftp_path)
				dl_path = wget.download(ftp_path)
				shutil.move(dl_path, path)
				utils.info('converting', path, 'to FASTQ')
				fastq_dump = [
					'fastq-dump', '--split-files', '--defline-seq', '@$sn/$ri',
					'--defline-qual', '+', './%s.sra' % run_id]
				if gzip:
					fastq_dump.insert(-1, '--gzip')
					new_paths += [run_id+'_1.fastq.gz', run_id+'_2.fastq.gz']
				else:
					new_paths += [run_id+'_1.fastq', run_id+'_2.fastq']
				subprocess.check_call(fastq_dump)
				print ""
				if clean:
					os.unlink(path)
			record['paths'] = catalog.path_sep.join(map(os.path.abspath, new_paths))
		else:
			record.pop('paths')
		catalog.insert(**record)
		catalog.print_record(record['id'])


if __name__ == '__main__':

	parser = argparse.ArgumentParser( \
		formatter_class=argparse.RawDescriptionHelpFormatter,\
		description="""
Command-line tool for interfacing the BioLite catalog with the Sequence Read
Archive (SRA).
""")

	subparsers = parser.add_subparsers(title='commands')

	export_parser = subparsers.add_parser('export', help=export_sra.__doc__)
	export_parser.add_argument('IDS', nargs='+', help="""
		The list of catalog IDs to export.""")
	export_parser.add_argument('--outdir', '-o', default='.', help="""
		Directory in which to copy sequence files and write XML files,
		which can then be tarred up for submission to SRA. [default='.']""")
	export_parser.add_argument('--study', '-s', required=True,
		default="TODO: ADD STUDY NAME", help="""
		An alias for the SRA study.""")
	export_parser.add_argument('--center', '-c', required=True,
		default="TODO: ADD CENTER NAME", help="""
		The SRA username for your center.""")
	export_parser.add_argument('--study_title',
		default="TODO: ADD STUDY TITLE", help="""
		A short title describing the study.""")
	export_parser.add_argument('--study_type',
		default="TODO: ADD STUDY TYPE", help="""
		The study type from the SRA list of existing study types.""")
	export_parser.set_defaults(func=export_sra)

	validate_parser = subparsers.add_parser('validate', help=validate_sra.__doc__)
	validate_parser.add_argument('--workdir', '-d', metavar='DIR', help="""
		XML files are located in DIR.""")
	validate_parser.set_defaults(func=validate_sra)

	import_parser = subparsers.add_parser('import', help=import_sra.__doc__)
	import_parser.add_argument('ACCESSION')
	import_parser.add_argument('--datadir', '-d', help="""
		Download SRA files to a directory hierarchy in the root directory
		DATADIR (default: '.')""")
	import_parser.add_argument('--clean', '-c', action='store_true', help="""
		Remove .sra files after they have been successfully converted to
		FASTQ.""")
	import_parser.add_argument('--email', '-e', help="""
		Email address to register for Entrez queries (default: 'email' value
		in BioLite configuration file).""")
	import_parser.add_argument('--gzip', '-z', action='store_true', help="""
		gzip the converted FASTQ files to save space.""")
	import_parser.add_argument('--metadata', '-m', action='store_true', help="""
		Only update metadata without downloading data.""")
	import_parser.set_defaults(func=import_sra)

	kwargs = vars(parser.parse_args())
	func = kwargs.pop('func')
	func(**kwargs)

# vim: syntax=python ts=4
