#!/usr/bin/perl
##
##
## plibrary.pl
## Copyright (C) 2009 Ronald C Beavis, all rights reserved
## The Global Proteome Machine 
## This software is a component of the X! proteomics software
## development project
##
## Use of this software governed by the Artistic license,
## as reproduced at http://www.opensource.org/licenses/artistic-license.php
##
## plibrary accepts a GPM data file as an input and outputs an X! Hunter
## compatible MGF annotated spectrum library. 
##
## Version 2009.07.08 - first version
##
##

use strict;

my $file_version = "plibrary.pl, v. 2009.08.18";

my $path = $ARGV[0];

my $label;
my @taxa;
if(not open(INPUT,"<$path"))    {
    if(open(INPUT,"<$path.gz")) {
        close(INPUT);
        system("gzip -d $path.gz");
        open(INPUT,"<$path");
    }
}

while(<INPUT>)  {
    if(/\<bioml/)   {
        if(/label=\"/)  {
            s/.*label=\"(.*?)\".*/$1/i;
            $label = $_;
        }
        close(INPUT);
    }
}
close(INPUT);
$label =~ s/\\/-/g;
$label =~ s/://g;
$label =~ s/\//-/g;
$label =~ s/\-//g;
if(length($label) == 0) {
    $label = "main model listing";
}

my $gpm = $path;
$gpm =~ s/.*\///i;

open(INPUT,"<$path")  or die "$path not found";
my @libraries;
my @expects;
my @indices;
my %index;
while(<INPUT>)	{
	if(/\<group/ and /type\=\"model\"/)	{
		my @group;
		chomp($_);
		push(@group,$_);
		$_ = <INPUT>;
		while($_ and not /\<\/group\>\<\/group\>/)	{
			chomp($_);
			push(@group,$_);
			$_ = <INPUT>;
		}
		my ($l,$e,$i) = ProcessGroup(\@group);
		if($index{$i})	{
			if($e < $index{$i})	{
				$index{$i} = $e;
			}
		}
		else	{
			$index{$i} = $e;
		}
		push(@libraries,$l);
		push(@indices,$i);
		push(@expects,$e);
	}
}
close(INPUT);
my $libsize = 0;
my $x = 0;
while($x < scalar(@expects))	{
	if(1 or $index{@indices[$x]} == @expects[$x])	{
		$libsize++;
	}
	$x++;
}
PrintHead($gpm,$libsize);
$x = 0;
while($x < scalar(@expects))	{
	if(1 or $index{@indices[$x]} == @expects[$x])	{
		print @libraries[$x];
	}
	$x++;
}

sub GetLibrarySize
{
	my ($_p) = @_;
	open(IN,"<$_p");
	my $size = 0;
	while(<IN>)	{
		if(/\<group/ and /type\=\"model\"/)	{
			$size++;
		}
	}
	return $size;
}

sub PrintHead
{
	my ($_h,$_l) = @_;
	print qq(SEARCH=MIS\r\nREPTYPE=Peptide\r\nLIBSIZE=$_l\r\n## created by: $file_version\r\n);
	print qq(## original data set: $gpm\r\n);
	my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
	printf "## creation date: %4d.%02d.%02d %02d:%02d:%02d\r\n",$year+1900,$mon+1,$mday,$hour,$min,$sec;
}

sub ProcessGroup
{
	my ($_g) = @_;
	my $label;
	my @starts;
	my @expects;
	my @sequences;
	my @mods;
	my $g;
	my @labels;
	my $spec = 0;
	my $x = 0;
	my $length = scalar(@$_g);
	my $mz;
	my $z;
	my @xs;
	my @ys;
	while($x < $length)	{
		$g = @$_g[$x];
		$x++;
		if($x == 1)	{
			($z) = $g =~ /z\=\"(.+?)\"/;
		}
		elsif($g =~ /\<protein/)	{
			($label) = $g =~ /label\=\"(.+?)\"/;
		}
		elsif($g =~ /\<domain/)	{
			my ($s) = $g =~ /start\=\"(.+?)\"/;
			my $st = $s;
			push(@starts,$s);
			($s) = $g =~ /expect\=\"(.+?)\"/;
			push(@expects,$s);
			($s) = $g =~ /seq\=\"(.+?)\"/;
			push(@sequences,$s);
			push(@labels,"$label\@$st");
			($mz) = $g =~ /mh\=\"(.+?)\"/;
			$mz = ($mz - 1.007276)/$z + 1.007276;
			$mz = sprintf("%.4f",$mz);
			my @m = ();
			while(not $g =~ /\<\/domain/)	{
				$g = @$_g[$x];
				$x++;
				if($g =~ /\<aa/)	{
					my $t;
					($t) = $g =~ /modified\=\"(.+?)\"/;
					($s) = $g =~ /at\=\"(.+?)\"/;
					$s = $s - $st + 1;
					$t = "$t\@$s";
					if($g =~ /pm\=/)	{
						($s) = $g =~ /pm\=\"(.*?)\"/;
						if($s)	{
							$t .= ",$s";
							($s) = $g =~ /id\=\"(.*?)\"/;
							if($s)	{
								$t .= ",$s";
							}
						}
					}
					push(@m,$t);
				}
			}
			push(@mods,\@m);
		}
		elsif($g =~ /type\=\"support\" label\=\"fragment ion mass spectrum\"/)	{
			while($x < $length)	{
				$g = @$_g[$x];
				$x++;
				if($g =~ /\<GAML\:Xdata/)	{
					$x++;
					$g = @$_g[$x];
					while(not $g =~ /\<\/GAML\:values\>/)	{
						my @v = split / /,$g;
						my $b;
						foreach $b(@v)	{
							push(@xs,$b);
						}
						$x++;
						$g = @$_g[$x];
					}
				}
				if($g =~ /\<GAML\:Ydata/)	{
					$x++;
					$g = @$_g[$x];
					while(not $g =~ /\<\/GAML\:values\>/)	{
						my @v = split / /,$g;
						my $b;
						foreach $b(@v)	{
							push(@ys,$b);
						}
						$x++;
						$g = @$_g[$x];
					}
				}
			}
			last;
		}

	}
	my @rank = sort { $ys[$b] <=> $ys[$a] }  0 .. $#ys;
	my @xt;
	my @yt;
	my $count = 0;
	foreach $x(@rank)	{
		push(@xt,@xs[$x]);
		push(@yt,@ys[$x]);
		$count++;
		if($count == 20)	{
			last;
		}
	}
	@rank = sort { $xt[$a] <=> $xt[$b] }  0 .. $#xt;
	my $out = "BEGIN IONS\r\n";
	my $index = "@sequences[0]$z";
	$out .= "PEPMASS=$mz\r\n";
	$out .= "CHARGE=$z\r\n";
	$out .= "PEPSEQ=" . @sequences[0] . "\r\n";
	$out .= "PEPEXP=" . @expects[0] . "\r\n";
	my $mod = @mods[0];
	foreach $x(@$mod)	{
		$out .= "PEPMOD=$x\r\n";
		$index .= $x;
	}
	foreach $x(@labels)	{
		$out .= "PEPACC=$x\r\n";
	}
	foreach $x(@rank)	{
		$out .= "@xt[$x] @yt[$x]\r\n";
	}
	$out .= "END IONS\r\n\r\n";
	return ($out,@expects[0],$index);
}
