#!/usr/bin/perl -w
#
# Creation Date: 29th May 2002
# Written by:    Andy Sanwell
# Version 1.0:  19th August 2002
# Current 1.1:  17th September    
#
# Reason for Change to 1.1
# ========================
# Sorted insertion codes need and added in case of A:-222--30
my $version = "scop2stamp.pl - version 1.1 ACS (17th September 2002)";

# Description
# ===========
# This program uses the downloadable SCOP files from
#
# http:// scop.mrc-lmb.cam.ac.uk/scop/
#
# dir.cla.scop.txt - SCOP classification of each PDB file
# dir.des.scop.txt - SCOP description file for each SCOP id
# dir.hie.scop.txt - SCOP hierarchy information file
#
# and assumes they are version 1.55 or later since the format changed
# then.
#
# Alogrithm
# =========
# The programme works by reading in the classification (cla) file and 
# proccessing the data into a hash table keyed on sunid in a form
# that can be written directly out.
# The description file (des) is used to build up hash tables of different 
# node types so that direct processing at each tree level can take place.
# The hierarchy file (hie) is used so that given any particular sunid, the
# tree can be scanned until a leaf node is found.
# So for any particular node level - FOLD, SUPFAM, FAMILY, PROTEIN, SPECIES,
# or DOMAIN, the type hash table gets an initial set of sunids and for
# each of them a domain is calculated using the hierarchy hash'  This is
# then written out from the classification hash to the output file.
#



use Getopt::Long;

########## Options variables
my $help = 0;
my $allPdb = 0;
my $singlePdb = 0; 
my $singleSpecProt = 0; 
my $oneProt = 0;
my $oneProtFam = 0; 
my $oneProtSupFam = 0;
my $oneFold = 0;
my $allTypes = 0;      # program does all the files
my $dirpath = "./";    # define the input files to be in the current directory

#@scopNames = qw(dir.cla.scop.txt dir.des.scop.txt dir.hie.scop.txt);
                       # list of the SCOP filenames
my %SCOPtypes;         # storage place of sunid groups of SCOP types
my %SCOPclass;         # processed SCOP class file enteries
my %SCOPhier;          # SCOP hiearchy index by node SUNID gives 
                       # two element array of parent SUNID and 
                       # a string of comma seperated child SUNID

# process command options
GetOptions('help' ,\$help,
	   'version|v', sub { print "scop2stamp version = $version \n" },
	   'directory=s', \$dirpath,
	   'all', \$allTypes,
	   'apdb', \$allPdb,
	   'spdb', \$singlePdb, 
	   'sps', \$singleSpecProt,
	   'op', \$oneProt, 
	   'opf', \$oneProtFam, 
	   'opsf', \$oneProtSupFam,
	   'of', \$oneFold);

if ($help == 1){
    printHelpInfo();
    exit();
}
# output all databases if --all is given
if ($allTypes == 1) {
    $allPdb = 1;
    $singlePdb = 1;
    $singleSpecProt = 1;
    $oneProt = 1;
    $oneProtFam = 1;
    $oneProtSupFam = 1;
    $oneFold = 1;
}

# open input files
$SCOPCLA = $dirpath."/"."dir.cla.scop.txt";
open SCOPCLA or die "Unable to open $SCOPCLA file.\n";
$SCOPDES = $dirpath."/"."dir.des.scop.txt";
open SCOPDES or die "Unable to open $SCOPDES file.\n";
$SCOPHIE = $dirpath."/"."dir.hie.scop.txt";
open SCOPHIE or die "Unable to open $SCOPHIE file.\n";

# read the SCOP data into hash tables
processScopGroups(SCOPDES);
processScopClass(SCOPCLA);
processScopHierarchy(SCOPHIE);
#writeScopHierarchy();

# write out the different .dom files
if ($allPdb == 1){
    $outfile = $dirpath . "/scop.dom";
    writeDomainFile($outfile, DOMAIN, 1);
}
# this is just the scop.dom file but each protein has only 
# one domain representation
if ($singlePdb == 1){
    $outfile = $dirpath . "/scop_domain.dom";
    # check to see if "scop.dom" has been generate and use it.
    if ($allPdb == 1) {
	@perlArgs = ("/bin/cp", $dirpath."/scop.dom", $outfile);
	system(@perlArgs) == 0 or die "system @perlArgs failed: $? \n"; 
    }
    else {
	writeDomainFile($outfile, DOMAIN, 0);
    }
    filterDomainFile($outfile);
}
if ($singleSpecProt == 1){
    $outfile = $dirpath . "/scop_species.dom";
    writeDomainFile($outfile, SPECIES, 1);
}
if ($oneProt == 1){
    $outfile = $dirpath . "/scop_prot.dom";
    writeDomainFile($outfile, PROTEIN, 1);
}
if ($oneProtFam == 1){
    $outfile = $dirpath . "/scop_fam.dom";
    writeDomainFile($outfile, FAMILY, 1);
}
if ($oneProtSupFam == 1){
    $outfile = $dirpath . "/scop_supf.dom";
    writeDomainFile($outfile, SUPFAM, 1);
}
if ($oneFold == 1){
    $outfile = $dirpath . "/scop_fold.dom";
    writeDomainFile($outfile, FOLD, 1);
}

print "Finished program.\n";

###########################################################################
#
# Subroutines
#
###########################################################################

# Encode the SCOP hierarchy file info a hash table
#
# *processScopHierarchy
#
sub processScopHierarchy {
    my ($infile, $outfile) = @_;

    # read in the hierarchy tree to speed up domain search
    # [node sunid] [parent sunid] [child sunids csl]
    HLINE: while (<$infile>) {
	next HLINE if /^#/;
	next HLINE if /^$/;
	chomp();
	($nid, $ptid, @chid) = split(" ");
	$SCOPhier{$nid} = [ ($ptid, @chid) ];
    }
}

#
# Used for debugging purposes.
#
# *writeScopHierarchy
#
sub writeScopHierarchy {
    $scopout = "/homes/andy/tmp/scop.hie.txt";
    open(SCOPOUT,">".$scopout );
    
    foreach $nid (keys %SCOPhier) {
	print SCOPOUT  $nid." ".$SCOPhier{$nid}[0]."  ".$SCOPhier{$nid}[1]."\n" ;
    }
    close SCOPOUT;
}

# This subroutine constructs a hash table for every row in the 
# "dir.cla.scop.txt" file, this may get too much if the SCOP database
# gets much larger? It is currently ~60k
#
# *processScopClass
#
sub processScopClass {
    my ($infile,$outfile) = @_;

    LINE: while (<$infile>) {
	next LINE if /^#/;
	next LINE if /^$/;
	@words = split(" "); #could do a (a,b,c) type here for readability
	$pdbfilestr = "/<pdb_dir>/pdb/pdb" . $words[1] . ".ent ";
	$reduced = $words[0];
	$reduced =~ s/_//g;
	$domainstr = $reduced; #. "." . $words[3] . $words[4];
	$chainstr = " { " . generateDomainString($words[2]) . "}\n";
	#print $outfile "$pdbfilestr $domainstr  $chainstr";
	$SCOPclass{$words[4]} = $pdbfilestr.$domainstr.$chainstr;
    }
}

# Work out which sunid's belong into which group
#
# *processScopGroups
#
sub processScopGroups {
    my ($infile, @stuff) = @_;
    my $type = "";
    my $sunid = 0;

  DLINE: while (<$infile>) {
      next DLINE if /^#/;
      next DLINE if /^$/;
      ($sunid, $type, undef) = split(" ");
      if ($type =~ /cl/) { 
	  push @{$SCOPtypes{CLASS}}, $sunid;
	  #print "$type $sunid \n";
	  next DLINE;
      };
      if ($type =~ /cf/) { 
	  push @{$SCOPtypes{FOLD}}, $sunid;
	  next DLINE;
      };
      if ($type =~ /sf/) { 
	  push  @{$SCOPtypes{SUPFAM}}, $sunid;
	  next DLINE;
      };
      if ($type =~ /fa/) { 
	  push @{$SCOPtypes{FAMILY}}, $sunid;
	  next DLINE;
      };
      if ($type =~/dm/) { 
	  push @{$SCOPtypes{PROTEIN}}, $sunid;
	  next DLINE;
      };
      if ($type =~ /sp/) { 
	  push @{$SCOPtypes{SPECIES}}, $sunid;
	  next DLINE;
      };
      if ($type =~ /px/) { 
	  push @{$SCOPtypes{DOMAIN}}, $sunid;
	  next DLINE;
      };
  }
}   

# construct information on domain encoding
#
# *generateDomainString
#
sub generateDomainString {
    my ($original) = @_;
    my $result = "";
    my @subchain = ();
    my @parts = ();
    my $chainid = "";

    # structure generally <char>:<num>-<num> though just "-" means ALL
    @parts = split(/,/,$original);
    
  PART: foreach $part (@parts) {
      @subchain = split(/:/,$part);
      #print STDERR "$#subchain parts\n";
      if ($#{subchain} > 0) {
	  # \w:\d-\d and \w:-\d-\d
	  $chainid = $subchain[0];
	  if ($subchain[1] =~ /-(\w+)-(\w+)/) {
	      $result .= "$chainid -".insertString($1);
	      $result .="to $chainid ".insertString($2); 
	      next PART; }
	  if ($subchain[1] =~ /(\w+)-(\w+)/) 
	  {
	      $result .= "$chainid ".insertString($1);
	      $result .="to $chainid ".insertString($2); 
	      next PART; 
	  }
	  # it is also possible to be \w:-\w--\w eg C:-20--3 though this
	  # doesn't seem to be the case yet
	  if ($subchain[1] =~ /-(\w+)--(\w+)/ )
	  {
	      $result .= "$chainid -".insertString($1);
	      $result .="to $chainid -".insertString($2);
	      next PART; 
	  }
      }
      else {
	  #print STDERR "single part $subchain[0]\n";
	  if ($subchain[0] =~ /^-$/){
	      $result .= "ALL "; next PART; }
	  if ($subchain[0] =~ /(\w)/) { 
	      $result .= "CHAIN $1 "; next PART; }
	  if ($subchain[0] =~ /-(\w+)-(\w+)/) {
	      $result .= "_ -".insertString($1);
	      $result .="to _ ".insertString($2); next PART;}
	  if ($subchain[0] =~ /(\w+)-(\w+)/) {
	      $result .= "_ ".insertString($1);
	      $result .="to _ ".insertString($2); next PART;}
	  if ($subchain[0] =~ /-(\w+)--(\w+)/) {
	      $result .= "_ -".insertString($1);
	      $result .="to _ -".insertString($2); next PART;}
      }
  }  
    #print " @subchain $result\n";
    return($result);
}



# These uses the hash table %SCOPtype and %SCOPclass to write out
# the required Stamp file.  Only the complete domain file can be 
# written out directly since these are leaves in our hierarchy tree.
# The other types needs to be processed to find a descendent domain.
#
# *generatePdb
#
sub generatePdb {
    my ($class, $outfile) = @_;
    my $entry = "";
    my @typeSet = ();

    @typeSet = @{$SCOPtypes{$class}};
    if ($class =~ /DOMAIN/) {
	foreach $entry (@typeSet) {
	    print $outfile $SCOPclass{$entry};
	}
    }
    else {
	foreach $entry (@typeSet) {
	    $requiredChild = findFirstDomain($entry);
	    #print $SCOPclass{$requiredChild};
	    print $outfile $SCOPclass{$requiredChild};
	}
    }
}
# Find the first domain leaf in the hierarchy
#
# *findFirstDomain
#
sub findFirstDomain {
    my ($pid) = @_;
    my $kid_id = "";
    my @temp = ();
    
    until ($kid_id eq "-") {
	$kid_str = $SCOPhier{$pid}[1];
	@temp = split(/,/ , $kid_str);
	$kid_id = $temp[0];
	if ($kid_id ne "-") {
	    $pid = $kid_id;
	}
    }
    #print "$pid \n";
    return ($pid);
}

#
# Make a copy of the whole domain file and then filter out the 
# duplicate proteins using the first field.
#
# *filterDomainFile
#
sub filterDomainFile {
    my ($stampFile) = @_;
    my @args = {};
    my $tmpfile = $dirpath . "/tempstamp.dom";
    my $outfile = "";
    my $current = "";
    my $last = "";
    
    @args = ("/bin/mv", $stampFile, $tmpfile);
    system(@args) == 0 or die "system @args failed: $? \n"; 
    $outfile = $dirpath . "/scop_domain.dom";
    open(OUTFILE, ">" . $outfile) or die "Unable to open $outfile.\n";
    open(TMPFILE, "<" . $tmpfile) or die "Unable to open $tmpfile.\n";

    # do the filtering
    while ($line = <TMPFILE>) {
	@fields = split(" ", $line);
	@dirs = split(/\//, $fields[0]);
	$current = $dirs[3];
	#print $current," ", $line;
	if ($last ne $current) {    # if different write it out otherwise skip
	    print OUTFILE  $line;
	    $last = $current;
	}
    }
    close(OUTFILE);
    close(TMPFILE);

    # tidy up
    @args = ("/bin/rm", $tmpfile);
    system(@args) == 0 or die "system @args failed: $? \n"; 
    print "Generated> $outfile\n";
    return();
}

# Find a combined string of digits and characters eg 123A return
# a split string "123 A ". Insertion codes are single characters.
#
# *insertString
#
sub insertString {
    my ($pattern) = @_;
    my $digit = "";
    my $insert = "";

    $pattern =~ /(\d+)(\w)?/;
    $digit = $1;
    $insert = $2;
    if (! defined $insert) {$insert = "_";}
    return $digit." ".$insert." ";
}

#
# *printHelpInfo
#
sub printHelpInfo {
    print "
Usage:
    scop2stamp [-v] [-h] [-d <path>] --[all|apdb|spdb|sps|op|opf|ospf|of]
where:
 --help       show this information
 --version    show version number
 --directory  path to directory of SCOP files
              defaults to current directory
 --all    create all file types
 --apdb   create file of all pdb entries
 --spdb   create file of pdb entries ignoring multiple chains
 --sps    create file of single protein for each species
 --op     create file of each protein
 --opf    create file of single protein for each family
 --ospf   create file of single protein for each superfamily
 --of     create file of single fold representative

Note: Running the program without options will just test the format of the
      SCOP files.
      Expected filenames are dir.cla.scop.txt, 
                             dir.hie.scop.txt, 
                             dir.des,scop.txt
";
}

# Inform user of the file created.
#
# *writeDomainFile
#
sub writeDomainFile {
    my ($file, $type, $flag) = @_;

    open(OUTFILE, ">" . $file) or die "Unable to open $outfile.\n";
    generatePdb($type, OUTFILE);
    close OUTFILE;
    print "Generated> $outfile\n" if $flag;
}
########################### redundent code


