/* select3  -	1. read output file of scan6 run
		2. read each entry of database on stdin - if entry is in output
		of scan6 run, then save.
		3. sort saved sequences and output to stdout

Author: Geoff Barton 1990
Laboratory of Molecular Biophysics
South Parks Road
Oxford OX1 3QU

VERBOSE = 1 for interactive messages
          0 for command line arguments
SGI = if 1 then include SGI specific comtop routine (not needed - 15/Feb/93).

Does not work properly on SGI - seems to miss Four character identifiers...
Fixed (Summer 1992).

select4: 15/2/93: Change to read and process floating point scores.
If scores are less than 1, then they are output in e format.  This
allows select to cope with probability/score pairs.

The sort order is decided from the score of the second extracted
sequence: if it is < 1.0, the program assumes the scores are
probabilities and sorts into ascending order.  Otherwise, it assumes
they are raw scores, so sorts into descending order.

11 Feb 1994: Modify to get the database filename from the environment variables
GJNDBDIR and GJNDBROOT if these are set.


*/


#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
#include "array.h"
#include "defaults.h"
#include <gjutil.h>

/* VERBOSE: 1 = prompt interactively for the id file, database file and
   output file; 0 = take the id file from argv[1], read the database
   from stdin and write results to stdout. */
#define VERBOSE 1
/* SGI: not referenced anywhere in the code visible in this file. */
#define SGI 0

/* Run-time copies of the compile-time limits.  The MAX_* macros are not
   defined in this file - presumably they come from "defaults.h"
   (TODO confirm).  Only MAXslen and MAXtlen are used in this file. */
int MAXnseq = MAX_NSEQ;
int MAXslen = MAX_SEQ_LEN;
int MAXilen = MAX_ID_LEN;
int MAXtlen = MAX_TITLE_LEN;

    /* One extracted database entry: the sequence record copied out of
       the database, together with the score that was paired with its
       identifier in the id file. */
    struct result {
	float score;		/* score taken from the matching id-file entry */
	struct seqdat seq;	/* private copy of the database record */
    };

    /* One SCORE ID pair read from the id file.  An array of these is
       sorted by id and searched with bsearch(). */
    struct tops {
	char *id;	/* identifier string (heap allocated) */
	float score;	/* score that accompanied the identifier */
    };

/* Forward declarations (old-style, no parameter lists). */
int comres();	/* compares two struct result by score */
int comres2();	/* compares two struct result by score, ascending */
int comtop();	/* compares two struct tops by id using strcmp() */
void copseq();	/* copies one struct seqdat into another */

main(argc, argv)

int	argc;
char	*argv[];

{
    FILE *fp,*fdb,*fout;
    char *id;
    char *com;
    int nseq, count, i, total,j, brief;
    float score;
    struct seqdat *seqs;
    int lc;
    int end;

    char *idfile,*dbfile,*ofile;

    struct tops *topid, *found, *search;/* id, score pairs required */

    struct result *res;

    char *ddir,*droot;

    com = (char *) malloc(sizeof(char) *10);

    seqs = (struct seqdat *) malloc(sizeof(struct seqdat));

/*    res = (struct result *) malloc(sizeof(struct result));*/

    topid = (struct tops *) 
	    malloc(sizeof(struct tops) * MAX_NSEQ); 
    search = (struct tops *) malloc(sizeof(struct tops));

    nseq = 0;

    ddir = NULL;
    droot = NULL;

    /* get GJNDBDIR and GJNDBROOT - set the dbfile using these values */
    ddir = getenv("GJNDBDIR");
    droot = getenv("GJNDBROOT");
    if(ddir != NULL && droot != NULL){
      dbfile = GJcat(3,ddir,droot,".seq");
    }else{
      if(VERBOSE){
	fprintf(stderr,"Ideally you should define the GJNDBDIR and GJNDBROOT\n");
	fprintf(stderr,"environment variables - I will expect the database on stdin\n");
	fprintf(stderr,"or as defined in the SCANPSDEFAULTS file.\n");
      }
    }

    if(VERBOSE){
	idfile = (char *) malloc(sizeof(char)*MAXtlen);
	if(ddir == NULL || droot == NULL){
	  dbfile = (char *) malloc(sizeof(char)*MAXtlen);
	}
	ofile = (char *) malloc(sizeof(char)*MAXtlen);
	printf("\n\nProgram S E L E C T\n\n");
	printf("Extracts sequences from PIR database\n\n");
	printf("Author: G. J. Barton (1990)\n");
	printf("Maximum Allowed Sequence Length: %d\n",MAXslen);
	printf("Maximum Allowed Number of Sequences: %d\n\n",MAX_NSEQ);
	printf("Enter name of file containing SCORE ID pairs: ");
	scanf("%s",idfile);
	printf("\nOpening File: %s\n\n",idfile);
	fp = fopen(idfile,"r");
	if(fp == NULL)error("Cannot open file",1);
	if(ddir == NULL || droot == NULL){
	  printf("Enter Database Filename: ");
	  scanf("%s",dbfile);
	}
	printf("\nOpening File: %s\n\n",dbfile);
	fdb = fopen(dbfile,"r");
	if(fdb == NULL)error("Cannot open file",2);
	printf("Just Extract Identifiers/titles (no sequences) ?[Y/N]: ");
	scanf("%s",com);
	brief=0;
	if(*com=='y'||*com=='Y'){
	    brief=1;
	    printf("Only identifiers and titles will be Output\n");
	}
	printf("\nEnter Output Filename: ");
	scanf("%s",ofile);
	printf("\nOpening File: %s\n\n",ofile);
	fout = fopen(ofile,"w");
	if(fout == NULL)error("Cannot Open file",2);
    }else{
	    if(argc > 1){
		fp = fopen(argv[1],"r");
		brief = (argc > 2) ? 1 : 0;
	    }else{
		fprintf(stderr,"Must Supply scan output file as 1st arg\n");
		exit(1);
	    }
    }

    count = 0;
    total = 0;
    /* get the id list to find in the database */
    while(count < MAX_NSEQ && (topid[count].id = malloc(MAX_ID_LEN)) && 
	fscanf(fp,"%f %s", &score, topid[count].id) != EOF){
	topid[count].id = realloc(topid[count].id,(strlen(topid[count].id)+1));
	topid[count].score = score;
	++count;	
    }
    topid = (struct tops *) realloc(topid, sizeof(struct tops) * count);

    /* sort the id list in ascending order */
    qsort((char *) topid, count, sizeof(struct tops), comtop);	


    if(VERBOSE){
	printf("Searching for: %d Sequences\n",count);
        for(i=0;i<count;++i){
	    printf("%d %s\n",i+1,topid[i].id);
	}
    }
    
    seqs[0].slen = 0;
    while(total < count){
	if(VERBOSE){
	    end = gseq(fdb,seqs,&nseq,2);	    
	}else{
	    end = gseq(stdin,seqs,&nseq,2);	    
	}
	if(end == 0){
	    fprintf(stderr,"ERROR:  End of Database file\n");
	    exit(0);
	}

	search->id = seqs[nseq].id;
	search->score = 0.0;

/*	fprintf(stderr,"Searching with: %s:%d\n",search->id,strlen(search->id));*/

    	found = (struct tops *) 
		bsearch((char *) search, (char *) topid,
			count,sizeof(struct tops), comtop);

/*        if(seqs[nseq].id[0] == 'H' && seqs[nseq].id[1] == 'Z'){
            if(found == NULL)
	    printf("%s %d\n", seqs[nseq].id,strlen(seqs[nseq].id));
	}
*/

	if(found != NULL){
		if(total < 1){
		    res = (struct result *) malloc(sizeof(struct result));
		}else{
		    res = (struct result *) 
                          realloc(res,sizeof(struct result) * (total+1));
		}

/*		res[total].seq = *seqs;*/
		copseq(&res[total].seq,seqs);
		res[total].score = found->score;
		if(VERBOSE){
		    printf("Found: %s %5d\n",found->id,total+1);
		}

		++total;
	}
    }

    fclose(fp);

    if(res[1].score < 1){
      qsort((char *) res, total, sizeof(struct result),comres2);
    }else{
      qsort((char *) res, total, sizeof(struct result),comres);
    }

    if(VERBOSE){
	printf("Extracted: %d Sequences\n",total);
    }

    if(!brief){
	if(VERBOSE){
	    for(j=0; j<total; ++j){
	        lc = 0;
		fprintf(fout,">%s\n",res[j].seq.id);
		fprintf(fout,"%s\n",res[j].seq.title);
		for(i = 1; i < res[j].seq.slen-1; ++i){
		    ++lc;
		    if(lc == 50){
		      fprintf(fout,"\n");
		      lc = 0;
		    }
		    fputc(res[j].seq.seq[i],fout);
		}
		fprintf(fout,"*\n");
	    }
	}else{
	    for(j=0; j<total; ++j){
		fprintf(stdout,">%s\n",res[j].seq.id);
		fprintf(stdout,"%s\n",res[j].seq.title);
		for(i = 1; i < res[j].seq.slen-1; ++i){
		    ++lc;
		    if(lc == 50){
		      fprintf(stdout,"\n");
		      lc = 0;
		    }
		    fputc(res[j].seq.seq[i],stdout);
		}
		fprintf(stdout,"*\n");
	    }
	}
    }else{
	if(VERBOSE){
	    for(j=0; j<total; ++j){
		fprintf(fout,"%s\t",res[j].seq.id);
		if(res[j].score < 1.0){
		  fprintf(fout,"%.1e\t",res[j].score);
		}else{
		  fprintf(fout,"%f\t",res[j].score);
		}
		fprintf(fout,"%s\n",res[j].seq.title);
	    }
	}else{
	    for(j=0; j<total; ++j){
		fprintf(stdout,"%s\t",res[j].seq.id);
		if(res[j].score < 1.0){
		  fprintf(stdout,"%.1e\t",res[j].score);
		}else{
		  fprintf(stdout,"%f\t",res[j].score);
		}
		fprintf(stdout,"%s\n",res[j].seq.title);
	    }
	}
    }
}

/* comres - qsort() comparison of two results by score, giving
   descending score order (highest score first).

   Returns -1 if left has the higher score, 1 if right has the higher
   score, and 0 otherwise.

   The scores are compared directly rather than subtracted, so scores
   that differ by less than 1 are still ordered, and large scores cannot
   overflow the int return value. */
int comres(left,right)

struct result *left, *right;

{
    if(left->score > right->score){
      return -1;
    }
    if(left->score < right->score){
      return 1;
    }
    return 0;
}

/* comres2 - qsort() comparison of two results by score, giving
   ascending score order (lowest score first).

   Returns 0 when the scores are equal, -1 when left's score is the
   smaller, and 1 in every other case. */
int comres2(left,right)

struct result *left, *right;

{
    float lscore = left->score;
    float rscore = right->score;

    if(lscore == rscore){
      return 0;
    }
    return (lscore < rscore) ? -1 : 1;
}


/* comtop - comparison of two id/score pairs by identifier only.
   The score fields are ignored.  Returns the strcmp() result for
   the two id strings. */
int comtop(left,right)

struct tops  *left, *right;

{
    char *lid = left->id;
    char *rid = right->id;
    int order;

    order = strcmp(lid, rid);
    return order;
}



/* copbuf - return a newly allocated copy of the first len bytes of src.
   The buffer is one byte longer than len and zero-filled before the
   copy, so the result is always NUL terminated even if the source was
   not.  A NULL src or a len <= 0 yields an all-zero buffer.
   Prints a message on stderr and exits with status 1 if memory cannot
   be allocated. */
static char *copbuf(src, len)
char *src;
int len;
{
    char *dst;
    size_t n;

    n = (len > 0) ? (size_t) len : 0;
    dst = (char *) calloc(n + 1, sizeof(char));
    if(dst == NULL){
	fprintf(stderr,"copseq: cannot allocate memory\n");
	exit(1);
    }
    if(n > 0 && src != NULL){
	memcpy(dst, src, n);
    }
    return dst;
}

/* copseq - make an independent copy of a sequence record.

   to   - destination record; its id, title and seq pointers are
          overwritten with newly allocated buffers (any previous values
          are not freed).
   from - source record; ilen, tlen and slen give the number of bytes
          copied from id, title and seq respectively.

   The length fields are copied unchanged.  Each new buffer carries one
   extra terminating NUL byte beyond the recorded length, so the copies
   are safe to print as strings.  Exits with status 1 if memory cannot
   be allocated. */
void copseq(to, from)
struct seqdat *to,*from;
{
    to->ilen = from->ilen;
    to->id = copbuf(from->id, from->ilen);

    to->tlen = from->tlen;
    to->title = copbuf(from->title, from->tlen);

    to->slen = from->slen;
    to->seq = copbuf(from->seq, from->slen);
}





