#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;
use File::Temp qw(tempfile);

$|++;

my $logger_missing;

BEGIN {
  # we need to check if Log4perl is installed
  # if not then we cant use logdie;
  eval {
    require Log::Log4perl;
    Log::Log4perl->import(':easy');
  };

  if ($@) {
    $logger_missing = 1;
  }
};


my $e_max = 10000;
my $score_dominate_default  = 7;
my $rph_trim_default        = 3;
my $min_cov_frac_default    = 0.8;
my $default_cpu             = 8;

main();

1;

sub main {
   my $args = processCommandLineArgs();

   verify_programs($args);
   my ($hits, $header) = get_nhmmscan_hits($args); #array pointer
   my $cnt_before = 1 + $#{$hits};

   if ($args->{no_rph_removal} ) {
      printf STDERR ("nhmmer hits (no overlap removal):   %7d\n", $cnt_before);
   } else {
      $hits = filter_covered_hits($hits, $args);
      my $cnt_after = 1 + $#{$hits};
      printf STDERR ("nhmmer hits before overlap removal: %7d\n", $cnt_before);
      printf STDERR ("nhmmer hits after overlap removal:  %7d\n", $cnt_after);
   }


   my @sorted = ();
   if ( $args->{sort_method} eq "seq") {
      @sorted = sort by_seqandorientandpos(@{$hits});
   } elsif ($args->{sort_method} eq "model") {
      @sorted = sort by_modelandseqandorientandpos(@{$hits});
   } elsif ($args->{sort_method} eq "eval") {
      @sorted = sort by_evalue(@{$hits});
   }

   open DFOUT, ">$args->{dfam_outfile}" or logdie ("Can't open $args->{dfam_outfile}: $!");
   print DFOUT $header;
   print DFOUT @sorted;
   close DFOUT;

   if ($args->{trf_outfile}) {
      run_trf($args);
   }

}

sub logdie {
  my $message = shift;
  if ($logger_missing) {
    die $message;
  }
  else {
    LOGDIE($message);
  }
}

sub processCommandLineArgs {

  my %args = ( cpu => $default_cpu, );

  &GetOptions( \%args,
    "help",
    "dfam_infile=s",
    "fastafile=s",
    "hmmfile=s",
    "dfam_outfile=s",
    "E=f",
    "T=f",
    "masking_thresh|cut_ga",
    "annotation_thresh|cut_tc",
    "species=i",
    "trf_outfile=s",
    "cpu=i",
    "no_rph_removal",
    "rph_trim=i",
    "score_dominate=f",
    "min_cov_frac=f",
    "sortby_model",
    "sortby_seq",
    "sortby_eval",
    "log_file=s",
  )
  or logdie ("Unknown option, try running --help for more infortmation.");

  help() if ($args{help});


  if ($args{log_file}) {
    if ($logger_missing) {
      warn "Log::Log4perl failed to load. No logs will be created\n";
    }
    else {
      if ( $args{log_file} =~ m[^(.+)/]) { # some other directory, it better exist
        die("The directory $1 does not exist")  unless (-d $1);
      }
      Log::Log4perl->easy_init( {  file    => ">$args{log_file}" } );
    }
  }

  if ($args{dfam_infile}) { #queries have already been run
    help("Illegal flags used in addition to --dfam_infile" ) if ($args{fastafile} || $args{hmmfile} || $args{trf_outfile} || $args{masking_thresh} || $args{annotation_thresh} || $args{species});
    logdie("Unable to open $args{dfam_infile}: $!")  unless -e $args{dfam_infile};
  }
  elsif ( $args{fastafile} && $args{hmmfile} ) { #need to run nhmmer
    help("Flag --dfam_infile may not be used with --hmmfile") if ($args{dfam_infile});
    logdie("Unable to open $args{fastafile}: $!")  unless -e $args{fastafile};
    logdie("Unable to open $args{hmmfile}: $!")    unless -e $args{hmmfile};
  }
  else {
    help("Use either (--dfam_infile) or (--fastafile and --hmmfile)");
  }

  if ( $args{dfam_outfile} ) {
    # does the containing directory exist?
    if ( $args{dfam_outfile} =~ m[^(.+)/]) { #some other directory, it better exist
      logdie("The directory $1 does not exist")  unless (-d $1);
    }
   }
   else {
     help("Must specify --dfam_outfile");
   }

   if ( $args{trf_outfile} ) {
     # does the containing directory exist?
     if ( $args{trf_outfile} =~ m[^(.+)/]) { #some other directory, it better exist
       logdie("The directory $1 does not exist")  unless (-d $1);
     }
   }


   my $thresh_arg_cnt=0;
   $args{mask_method} = "TC"; #default
   if ($args{E}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "E";
      logdie("Invalid E value: $!")    if ( $args{E} <= 0 || $args{E} > $e_max ) ;
   } elsif ($args{T}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "T";
   } elsif ($args{masking_thresh}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "GA";
   } elsif ($args{annotation_thresh}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "TC";
   } elsif ($args{species}) {
      $thresh_arg_cnt++;
      $args{mask_method} = "SP";
   }
   if ($thresh_arg_cnt > 1) { #only one should be specified
      help("Only one threshold method should be given");
   }


   my $sort_arg_cnt=0;
   $args{sort_method} = "seq"; #default
   if ($args{sortby_eval}) {
      $sort_arg_cnt++;
      $args{sort_method} = "eval";
   } elsif ($args{sortby_model}) {
      $sort_arg_cnt++;
      $args{sort_method} = "model";
   } elsif ($args{sortby_seq}) {
      $sort_arg_cnt++;
      $args{sort_method} = "seq";
   }
   if ($sort_arg_cnt > 1) { #only one should be specified
      help("Only one sort method should be specified");
   }

   if ($args{no_rph_removal}) {
     if ($args{rph_trim}) {
        help("Don't specify both --no_rph_removal and --rph_trim" );
     }
   } else {
      $args{rph_trim} = $rph_trim_default;
   }


   unless ($args{score_dominate}) {
      $args{score_dominate} = $score_dominate_default;
   }

   unless ($args{min_cov_frac}) {
      $args{min_cov_frac} = $min_cov_frac_default;
   }


   return(\%args);
}


sub help {

print STDERR <<EOF;

Command line options for controlling $0
-------------------------------------------------------------------------------

   --help       : prints this help messeage
    
   Requires either
    --dfam_infile <s>    Use this is you've already run nhmmscan, and 
                         just want to perfom dfamscan filtering/sorting
                         (Note: must be nhmmscan output, not nhmmer output)
   or both of these
    --fastafile <s>      Use these if you want dfamscan to control a
    --hmmfile <s>        run of nhmmscan, then do filtering/sorting

   Requires
    --dfam_outfile <s>   Output file, also in dfamtblout format

   Optionally, one of these  (only -E and -T allowed with --dfam_infile)
    -E <f>               >0, <=$e_max
    -T <f>
    --masking_thresh/--cut_ga
    --annotation_thresh/--cut_tc  Default
    --species <i>        (not yet implemented)

   Optionally one of these
    --sortby_eval
    --sortby_model
    --sortby_seq         Default

   Redundant Profile Hit (RPH) removal (only if not --no_rph_removal)
    --rph_trim <i>       If hit A has higher score than hit B and covers
                         all but at most rph_trim bases on either side, 
                         then A dominates B. Default $rph_trim_default
    --score_dominate <f> If hit A has score this much higher (bits) than 
                         hit B, then A can dominate B even if it's is 
                         shorter, so long as the --min_cov_frac threshold
                         is met. Default $score_dominate_default
    --min_cov_frac <f>   If hit A exceeds hit B by --score_dominate bits,
                         and either A or B is covered at least --min_cov_frac
                         by the other, then A dominates B. Default $min_cov_frac_default
    
   All optional
    --trf_outfile <s>    Runs trf, put results in <s>; only with --fastafile
    --cpu <i>            Default $default_cpu
    --no_rph_removal     Don't remove redundant profile hits
    --log_file <s>
EOF


  logdie ("\n**********\n\n$_[0]\n") if $_[0];


  exit(1);
}


sub verify_programs {
   my ($args) = @_;

   logdie("nhmmscan not found in \$PATH")  unless (in_path('nhmmscan'));

   if ($args->{trf_outfile}) {
      logdie("trf not found in \$PATH")  unless (in_path('trf'));
   }
}

sub in_path {
   my $prog = $_[0];
   my @dirs = split(/:/,$ENV{'PATH'});
   foreach my $dir (@dirs) {
      #from perl cookbook, 2nd edition
      $dir =~   s{ ^ ~ ( [^/]* ) }
                     { $1
                           ? (getpwnam($1))[7]
                           : ( $ENV{HOME} || $ENV{LOGDIR}
                                || (getpwuid($<))[7]
                             )
                     }ex;

      if (-e "$dir/$prog")  {
         return 1;
      }
   }
   return 0;
}

sub run_trf {
   my ($args) = @_;

   # change dir to the input directory as trf wont let us
   # specify where the output should go
   my $out_dir = $args->{fastafile};
   $out_dir =~ s|[^/]*$||;
   chdir($out_dir);
   # params to search for old simple repeats:  2 3 5 75 20 33 7
   my $cmd = "trf $args->{fastafile} 2 7 7 80 10 50 10 -d -h > /dev/null";
   my $res = system($cmd);
   #   logdie("Error running trf: $!\n") if $res;  #no error checking, 'cause TRF returns non-zero exit code...
   rename "$args->{fastafile}.2.7.7.80.10.50.10.dat", $args->{trf_outfile} or logdie("Error running TRF\n");

   #get counts
   open FH, "<$args->{trf_outfile}";
   my $cnt = 0;
   while (my $line = <FH>) {
      $cnt++ if ($line =~ /^\d/);
   }
   close FH;
   printf STDERR ("TRF repeat count:                   %7d\n", $cnt);
}

sub get_nhmmscan_hits {
   my ($args) = @_;

   my @hits;
   my $header;
   my $header_done = 0;
   if ($args->{dfam_infile}) {
      open FH, "<$args->{dfam_infile}";
      while (my $line = <FH>) {
         if ($line =~ /^#/) {
            $header .= $line if  !$header_done;
            
            $header_done = 1 if ($line =~ /-------------------/); # that'll be the final line of the first header, no need to replicate more headers
            next;
         }
         $header_done = 1;
         if ($args->{mask_method} =~ /^(E|T)$/ ) { #filter hits
            my @vals = split(/\s+/, $line);
            if ($args->{mask_method} eq "E") {
               next if ($vals[4] > $args->{E});
            } elsif ($args->{mask_method} eq "T") {
               next if ($vals[3] < $args->{T});
            }
         }
         push @hits, $line;

      }
      close FH;
   } else { # ($args{fastafile} && $args{hmmfile});
      my $cmd = "nhmmscan --noali";

      if ($args->{mask_method} eq "SP") {
         #um ...   do I have to split the hmm file in half, part for matching species, and part for not?
         logdie("Don't know how to handle --species yet");
         # two nhmmer jobs?

      } else {

         if ($args->{mask_method} eq "E") {
            $cmd .= " -E $args->{E}";
         } elsif ($args->{mask_method} eq "T") {
            $cmd .= " -T $args->{T}";
         } elsif ($args->{mask_method} eq "GA") {
            $cmd .= " --cut_ga";
         } elsif ($args->{mask_method} eq "TC") {
            $cmd .= " --cut_tc";
         }


         my ($tmpfh, $dfout_filename) = tempfile();

         $cmd .= " --dfamtblout $dfout_filename";
         $cmd .= " --cpu=$args->{cpu}";
         $cmd .= " $args->{hmmfile}";
         $cmd .= " $args->{fastafile}";

         #print ("$cmd\n");
         my $result = system ("$cmd > /dev/null");
         logdie("Error running command:\n$cmd\n") if $result;
         open FH, "<$dfout_filename";
         while (my $line = <FH>) {
            if ($line =~ /^#/) {
               $header .= $line if  !$header_done;
               next;
            }
            $header_done = 1;
            push @hits, $line;
         }
         close FH;
         unlink $dfout_filename;
      }
   }
   return \@hits, $header;
}


sub filter_covered_hits {
   my @sorted = sort by_seqandpos(@{$_[0]});   
   
   my $args   = $_[1];
   my $trim   = $args->{rph_trim};
   my $i=0;
   my $j;
   my $tmp;
   my ($seq_a, $acc_a, $model_a, $score_a, $eval_a, $orient_a, $start_a, $end_a);
   my ($seq_b, $acc_b, $model_b, $score_b, $eval_b, $orient_b, $start_b, $end_b);

   my @dominated;

   for ($i=0; $i<$#sorted; $i++) {
      next if $dominated [$i];
      
      ($model_a, $acc_a, $seq_a, $score_a, $eval_a, $tmp, $tmp, $tmp, $orient_a, $start_a, $end_a, $tmp, $tmp) = split(/\s+/,$sorted[$i]);
      if ($orient_a eq "-") {
         my $tmp = $start_a;
         $start_a = $end_a;
         $end_a = $tmp;
      }
         
      $j=$i+1;
      ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp, $tmp, $tmp, $orient_b, $start_b, $end_b, $tmp, $tmp) = split(/\s+/,$sorted[$j]);
      if ($orient_b eq "-") {   
         my $tmp = $start_b;
         $start_b = $end_b;
         $end_b = $tmp;            
      }
      
      while ($j<=$#sorted && $seq_b eq $seq_a && $start_b < $end_a) {
         my $domination = 0;
         unless ( $dominated[$j] )
         {
#            print "$start_a..$end_a v $start_b..$end_b ($score_a v $score_b)\n" ;
      
            if ( $score_a > $score_b &&  $end_a >= $end_b - $args->{rph_trim}) { # A is clearly superior both in score and (essentially) in boundary
#               print "(1) $i > $j  - remove B \n";
               $dominated[$j] = 1;
            } elsif ( $score_a > $score_b + $args->{score_dominate} ) { 
               # A has ~100x better E-value, is one of the two hits mostly covered by the other, if so, A dominates B
               my $covered_start = $start_a > $start_b ? $start_a : $start_b;
               my $covered_end   = $end_a < $end_b     ? $end_a   : $end_b;            
               my $covered_len   = $covered_end-$covered_start +1;
               my $a_len = $end_a - $start_a + 1;
               my $b_len = $end_b - $start_b + 1;                                    
               if ( $covered_len/$a_len >= $args->{min_cov_frac} || $covered_len/$b_len >= $args->{min_cov_frac} ) { 
#                  print "(2) $i > $j  - remove B \n";
                  $dominated[$j] = 1;
               }      
            } elsif ( $score_b > $score_a &&  $start_b <= $start_a + $args->{rph_trim} ) { # B is cleanly superior both in score and (essentially) in boundary
#               print "(3) $i < $j  - remove A\n";
               $dominated[$i] = 1;
               last;
            } elsif ( $score_b > $score_a + $args->{score_dominate}) {
               # B has ~100x better E-value, is one of the two hits mostly covered by the other, if so, B dominates A
               my $covered_start = $start_a > $start_b ? $start_a : $start_b;
               my $covered_end   = $end_a < $end_b     ? $end_a   : $end_b;            
               my $covered_len   = $covered_end-$covered_start +1;
               my $a_len = $end_a - $start_a + 1;
               my $b_len = $end_b - $start_b + 1;                                    
               if ( $covered_len/$a_len >= $args->{min_cov_frac} || $covered_len/$b_len >= $args->{min_cov_frac} ) {             
#                  print "(4) $i < $j  - remove A\n";
                  $dominated[$i] = 1;
                  last;
               }      
            }
         }
         
         $j++;
         next if ($j>$#sorted);
         ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp, $tmp, $tmp, $orient_b, $start_b, $end_b, $tmp, $tmp) = split(/\s+/,$sorted[$j]);
         
         if ($orient_b eq "-") {   
            my $tmp = $start_b;
            $start_b = $end_b;
            $end_b = $tmp;            
         }   
      } 
   }
   

   $i = 0;
   $j = 0;
   while ($j <= $#sorted) { 
      unless ($dominated[$j]) {
#         chomp $sorted[$j];
#         print "keep $j in $i ($sorted[$j])\n";
         $sorted[$i] = $sorted[$j];
         $i++;
      }
      $j++;
   }

   #resize the array, to drop off leftover cruft;
   $#sorted = $i-1;

   return \@sorted;
}


## sorting helper functions

sub by_modelandseqandorientandpos {
   my ($model_a, $acc_a, $seq_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
   }
   if ($orient_b eq "-") {   
      my $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }

   return (   ($model_a cmp $model_b)
           || ($seq_a cmp $seq_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($end_b   <=> $end_a)  # so a covered entry will follow a covering entry
           || ($score_b <=> $score_a)
           || ($model_a cmp $model_b)
           );
}


sub by_seqandpos {
   my ($model_a, $acc_a, $seq_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
   }
   if ($orient_b eq "-") {   
      my $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }

   return (   ($seq_a cmp $seq_b)
           || ($start_a <=> $start_b)
           || ($end_b   <=> $end_a)  # so a covered entry will follow a covering entry
           || ($score_b <=> $score_a)
           || ($model_a cmp $model_b)
           );
}

sub by_seqandorientandpos {
   my ($model_a, $acc_a, $seq_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
   }
   if ($orient_b eq "-") {   
      my $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }

   return (   ($seq_a cmp $seq_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($end_b   <=> $end_a)  # so a covered entry will follow a covering entry
           || ($score_b <=> $score_a)
           || ($model_a cmp $model_b)
           );
}


sub by_model {
   my ($model_a, $acc_a, $seq_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
   }
   if ($orient_b eq "-") {   
      my $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }


   return (   ($model_a cmp $model_b)
           || ($seq_a cmp $seq_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($score_a <=> $score_b)
           );
}

sub by_evalue {
   my ($model_a, $acc_a, $seq_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
   }
   if ($orient_b eq "-") {   
      my $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }


   return (   ($eval_b   cmp $eval_a)
           || ($score_a <=> $score_b)
           || ($seq_a cmp $seq_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($model_a cmp $model_b)
           );
}

sub by_score {
   my ($model_a, $acc_a, $seq_a, $score_a, $eval_a, $tmp1, $tmp2, $tmp3, $orient_a, $start_a, $end_a) = split(/\s+/,$a);
   my ($model_b, $acc_b, $seq_b, $score_b, $eval_b, $tmp4, $tmp5, $tmp6, $orient_b, $start_b, $end_b) = split(/\s+/,$b);

   if ($orient_a eq "-") {
      my $tmp = $start_a;
      $start_a = $end_a;
      $end_a = $tmp;
   }
   if ($orient_b eq "-") {   
      my $tmp = $start_b;
      $start_b = $end_b;
      $end_b = $tmp;            
   }

   return (   ($score_a <=> $score_b)
           || ($eval_b   cmp $eval_a)
           || ($seq_a cmp $seq_b)
           || ($orient_a cmp $orient_b)
           || ($start_a <=> $start_b)
           || ($model_a cmp $model_b)
           );
}
