#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;
use File::Temp qw(tempfile);

$|++;

my $logger_missing;

BEGIN {
  # we need to check if Log4perl is installed
  # if not then we cant use logdie;
  eval {
    require Log::Log4perl;
    Log::Log4perl->import(':easy');
  };

  if ($@) {
    $logger_missing = 1;
  }
};


my $e_max = 10000;
my $score_flex = 0.5;
my $overlap_trim_default = 3;
my $default_cpu = 4;

main();

1;

sub main {
   my $args = processCommandLineArgs();

   verify_programs($args);
   my ($hits, $header) = get_nhmmer_hits($args); #array pointer
   my $cnt_before = 1 + $#{$hits};

   if ($args->{no_overlap} ) {
      printf STDERR ("nhmmer hits (no overlap removal):   %7d\n", $cnt_before);
   } else {
      $hits = filter_covered_hits($hits, $args->{overlap_trim});
      my $cnt_after = 1 + $#{$hits};
      printf STDERR ("nhmmer hits before overlap removal: %7d\n", $cnt_before);
      printf STDERR ("nhmmer hits after overlap removal:  %7d\n", $cnt_after);
   }


   my @sorted = ();
   if ( $args->{sort_method} eq "seq") {
      @sorted = sort by_targetandpos(@{$hits});
   } elsif ($args->{sort_method} eq "model") {
      @sorted = sort by_model(@{$hits});
   } elsif ($args->{sort_method} eq "eval") {
      @sorted = sort by_evalue(@{$hits});
   }

   open DFOUT, ">$args->{dfam_outfile}" or logdie ("Can't open $args->{dfam_outfile}: $!");
   print DFOUT $header;
   print DFOUT @sorted;
   close DFOUT;

   if ($args->{trf_outfile}) {
      run_trf($args);
   }

}

sub logdie {
  my $message = shift;
  if ($logger_missing) {
    die $message;
  }
  else {
    LOGDIE($message);
  }
}

sub processCommandLineArgs {

  my %args = ( cpu => $default_cpu, );

  &GetOptions( \%args,
    "help",
    "dfam_infile=s",
    "fastafile=s",
    "hmmfile=s",
    "dfam_outfile=s",
    "E=f",
    "T=f",
    "masking_thresh|cut_ga",
    "annotation_thresh|cut_tc",
    "species=i",
    "trf_outfile=s",
    "cpu=i",
    "no_overlap",
    "overlap_trim=i",
    "sortby_model",
    "sortby_seq",
    "sortby_eval",
    "log_file=s",
  )
  or logdie ("Unknown option, try running --help for more infortmation.");

  help() if ($args{help});


  if ($args{log_file}) {
    if ($logger_missing) {
      warn "Log::Log4perl failed to load. No logs will be created\n";
    }
    else {
      if ( $args{log_file} =~ m[^(.+)/]) { # some other directory, it better exist
        die("The directory $1 does not exist")  unless (-d $1);
      }
      Log::Log4perl->easy_init( {  file    => ">$args{log_file}" } );
    }
  }

  if ($args{dfam_infile}) { #queries have already been run
    help("Illegal flags used in addition to --dfam_infile" ) if ($args{fastafile} || $args{hmmfile} || $args{trf_outfile} || $args{masking_thresh} || $args{annotation_thresh} || $args{species});
    logdie("Unable to open $args{dfam_infile}: $!")  unless -e $args{dfam_infile};
  }
  elsif ( $args{fastafile} && $args{hmmfile} ) { #need to run nhmmer
    help("Flag --dfam_infile may not be used with --hmmfile") if ($args{dfam_infile});
    logdie("Unable to open $args{fastafile}: $!")  unless -e $args{fastafile};
    logdie("Unable to open $args{hmmfile}: $!")    unless -e $args{hmmfile};
  }
  else {
    help("Use either (--dfam_infile) or (--fastafile and --hmmfile)");
  }

  if ( $args{dfam_outfile} ) {
    # does the containing directory exist?
    if ( $args{dfam_outfile} =~ m[^(.+)/]) { #some other directory, it better exist
      logdie("The directory $1 does not exist")  unless (-d $1);
    }
   }
   else {
     help("Must specify --dfam_outfile");
   }

   if ( $args{trf_outfile} ) {
     # does the containing directory exist?
     if ( $args{trf_outfile} =~ m[^(.+)/]) { #some other directory, it better exist
       logdie("The directory $1 does not exist")  unless (-d $1);
     }
   }


   my $thresh_arg_cnt=0;
   $args{mask_method} = "TC"; #default
   if ($args{E}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "E";
      logdie("Invalid E value: $!")    if ( $args{E} <= 0 || $args{E} > $e_max ) ;
   } elsif ($args{T}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "T";
   } elsif ($args{masking_thresh}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "GA";
   } elsif ($args{annotation_thresh}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "TC";
   } elsif ($args{species}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "SP";
   }
   if ($thresh_arg_cnt > 1) { #only one should be specified
      help("Only one threshold method should be given");
   }


   my $sort_arg_cnt=0;
   $args{sort_method} = "seq"; #default
   if ($args{sortby_eval}) {
      $sort_arg_cnt++;
      $args{sort_method} = "eval";
   } elsif ($args{sortby_model}) {
      $sort_arg_cnt++;
      $args{sort_method} = "model";
   } elsif ($args{sortby_seq}) {
      $sort_arg_cnt++;
      $args{sort_method} = "seq";
   }
   if ($sort_arg_cnt > 1) { #only one should be specified
      help("Only one sort method should be specified");
   }

   if ($args{no_overlap}) {
     if ($args{overlap_trim}) {
        help("Don't specify both --no_overlap and --overlap_trim" );
     }
   } else {
      $args{overlap_trim} = $overlap_trim_default;
   }

   return(\%args);
}


sub help {

print STDERR <<EOF;

Command line options for controlling $0
-------------------------------------------------------------------------------

   --help       : prints this help messeage
    
   Requires either
    --dfam_infile <s>
   or both of these
    -fastafile <s>
    -hmmfile <s>

   Requires
    -dfam_outfile <s>

   Optionally, one of these  (only -E and -T allowed with --dfam_infile)
    -E <f>             (>0, <=$e_max)
    -T <f>
    --masking_thresh/--cut_ga
    --annotation_thresh/--cut_tc  (default)
    --species <i>  (not yet implemented)

   Optionally one of these
    --sortby_eval
    --sortby_model
    --sortby_seq     (default)

   All optional
    --trf_outfile <s>  (runs trf, put results in <s>; only with --fastafile)
    --cpu <i>    (default $default_cpu)
    --no_overlap
    --overlap_trim <i> (default $overlap_trim_default, only if not --no_overlap)
    --log_file <s>
EOF


  logdie ("\n**********\n\n$_[0]\n") if $_[0];


  exit(1);
}


sub verify_programs {
   my ($args) = @_;

   logdie("nhmmer not found in \$PATH")  unless (in_path('nhmmer'));

   if ($args->{trf_outfile}) {
      logdie("trf not found in \$PATH")  unless (in_path('trf'));
   }
}

sub in_path {
   my $prog = $_[0];
   my @dirs = split(/:/,$ENV{'PATH'});
   foreach my $dir (@dirs) {
      #from perl cookbook, 2nd edition
      $dir =~   s{ ^ ~ ( [^/]* ) }
                     { $1
                           ? (getpwnam($1))[7]
                           : ( $ENV{HOME} || $ENV{LOGDIR}
                                || (getpwuid($<))[7]
                             )
                     }ex;

      if (-e "$dir/$prog")  {
         return 1;
      }
   }
   return 0;
}

sub run_trf {
   my ($args) = @_;

   # change dir to the input directory as trf wont let us
   # specify where the output should go
   my $out_dir = $args->{fastafile};
   $out_dir =~ s|[^/]*$||;
   chdir($out_dir);
   # params to search for old simple repeats:  2 3 5 75 20 33 7
   my $cmd = "trf $args->{fastafile} 2 7 7 80 10 50 10 -d -h > /dev/null";
   my $res = system($cmd);
   #   logdie("Error running trf: $!\n") if $res;  #no error checking, 'cause TRF returns non-zero exit code...
   rename "$args->{fastafile}.2.7.7.80.10.50.10.dat", $args->{trf_outfile} or logdie("Error running TRF\n");

   #get counts
   open FH, "<$args->{trf_outfile}";
   my $cnt = 0;
   while (my $line = <FH>) {
      $cnt++ if ($line =~ /^\d/);
   }
   close FH;
   printf STDERR ("TRF repeat count:                   %7d\n", $cnt);
}

sub get_nhmmer_hits {
   my ($args) = @_;

   my @hits;
   my $header;
   my $header_done = 0;
   if ($args->{dfam_infile}) {
      open FH, "<$args->{dfam_infile}";
      while (my $line = <FH>) {
         if ($line =~ /^#/) {
            $header .= $line if  !$header_done;
            next;
         }
         $header_done = 1;
         if ($args->{mask_method} =~ /^(E|T)$/ ) { #filter hits
            my @vals = split(/\s+/, $line);
            if ($args->{mask_method} eq "E") {
               next if ($vals[4] > $args->{E});
            } elsif ($args->{mask_method} eq "T") {
               next if ($vals[3] < $args->{T});
            }
         }
         push @hits, $line;

      }
      close FH;
   } else { # ($args{fastafile} && $args{hmmfile});
      my $cmd = "nhmmer --noali";

      if ($args->{mask_method} eq "SP") {
         #um ...   do I have to split the hmm file in half, part for matching species, and part for not?
         logdie("Don't know how to handle --species yet");
         # two nhmmer jobs?

      } else {

         if ($args->{mask_method} eq "E") {
            $cmd .= " -E $args->{E}";
         } elsif ($args->{mask_method} eq "T") {
            $cmd .= " -T $args->{T}";
         } elsif ($args->{mask_method} eq "GA") {
            $cmd .= " --cut_ga";
         } elsif ($args->{mask_method} eq "TC") {
            $cmd .= " --cut_tc";
         }


         my ($tmpfh, $dfout_filename) = tempfile();

         $cmd .= " --dfamtblout $dfout_filename";
         $cmd .= " --cpu=$args->{cpu}";
         $cmd .= " $args->{hmmfile}";
         $cmd .= " $args->{fastafile}";

         #print ("$cmd\n");
         my $result = system ("$cmd > /dev/null");
         logdie("Error running command:\n$cmd\n") if $result;
         open FH, "<$dfout_filename";
         while (my $line = <FH>) {
            if ($line =~ /^#/) {
               $header .= $line if  !$header_done;
               next;
            }
            $header_done = 1;
            push @hits, $line;
         }
         close FH;
         unlink $dfout_filename;
      }
   }
   return \@hits, $header;
}


sub filter_covered_hits {
   my @sorted = sort by_targetandpos(@{$_[0]});
   my $trim   =  $_[1];

   my $i=0;
   my $j=1;

   while ($j <= $#sorted) {
      my ($target_a, $acc_a, $model_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$sorted[$i]);
      my ($target_b, $acc_b, $model_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$sorted[$j]);

     if ($orient_a eq "-") {
        my $tmp = $start_a;
        $start_a = $end_a;
        $end_a = $tmp;
        $tmp = $start_b;
        $start_b = $end_b;
        $end_b = $tmp;
     }

      my $domination = 0;
      if (    $target_b eq $target_a
           && $orient_b eq $orient_a
           )
      {
      
         #print "$start_a..$end_a v $start_b..$end_b ($score_a v $score_b)\n" ;
      
         if (      $end_b    <= $end_a + $trim
                && $score_b  <= $score_a + $score_flex
            )
         {
            # A dominates B,  bypass B
            #print "$i > $j  - remove B \n";
            $domination = 1;
         } elsif ( $start_a  >= $start_b - $trim
                && $score_a  <= $score_b + $score_flex
            ) {
            # B dominates A, overwrite A
            $sorted[$i] = $sorted[$j];
            #print "$i < $j  - remove A\n";
            $domination = 1;
         }
      }

      unless ( $domination ) { # neither is dominant, so add B on to list, leaving A in place
         $sorted[$i+1] = $sorted[$j];
         $i++;
         #print "$i\n";
      }
      $j++;
   }

   #resize the array, to drop off leftover cruft;
   $#sorted = $i;

   return \@sorted;
}


## sorting helper functions

sub by_targetandpos {
   my ($target_a, $acc_a, $model_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($target_b, $acc_b, $model_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
      $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }

   return (   ($target_a cmp $target_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($end_b   <=> $end_a)  # so a covered entry will follow a covering entry
           || ($score_b <=> $score_a)
           || ($model_a cmp $model_b)
           );
}

sub by_model {
   my ($target_a, $acc_a, $model_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($target_b, $acc_b, $model_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
      $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }


   return (   ($model_a cmp $model_b)
           || ($target_a cmp $target_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($score_a <=> $score_b)
           );
}

sub by_evalue {
   my ($target_a, $acc_a, $model_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($target_b, $acc_b, $model_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
      $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }


   return (   ($eval_b   cmp $eval_a)
           || ($score_a <=> $score_b)
           || ($target_a cmp $target_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($model_a cmp $model_b)
           );
}

sub by_score {
   my ($target_a, $acc_a, $model_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($target_b, $acc_b, $model_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
      $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;      
   }


   return (   ($score_a <=> $score_b)
           || ($eval_b   cmp $eval_a)
           || ($target_a cmp $target_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($model_a cmp $model_b)
           );
}