#!/usr/bin/perl
use strict;
use Data::Dumper;

# Simulated Sequence
#my $benchmarkTargetLen = 200000000;
my $benchmarkTargetLen = 2000;
my $simpleFraction = 0.02;

# Data
my $simple = [];
open IN,"<simple.dat" or die;
while ( <IN> ) {
  if ( /^SIMPLE:/ ) 
  {
    s/[\n\r\s]+//g;
    push @{$simple}, $_;
  }
}
close IN;

my $simpleTargetLen = int($benchmarkTargetLen * $simpleFraction);
my $backgroundTargetLen = $benchmarkTargetLen - $simpleTargetLen;

open IN,"/usr/local/hmmer/bin/esl-shuffle -N 1 -G --dna -L $backgroundTargetLen |" or die;
my $seq;
while ( <IN> ){
  next if ( /^>/ );
  s/[\n\r\s]+//g;
  $seq .= $_;
}
close IN;

my $attempts = 0;
my $simpleBPInserted = 0;
while ( $simpleBPInserted < $simpleTargetLen && $attempts < 10)
{
  my $repeat = getSimpleRepeat($simple);

  # Randomly select a position to insert it
  my $pos = int( rand( ( length $seq ) - 100 ) );
  my $leftFlank = "";
  if ( $pos >= 5 ) {
    $leftFlank = substr($seq, $pos-5, 5);
  }
  my $rightFlank = "";
  if ( $pos <= length($seq) - 5 ) {
    $rightFlank = substr($seq, $pos, 5);
  }
  my $flank = $leftFlank.$rightFlank;
  if ( $flank =~ m/[acgt]/ ){
    $attempts++;
    next;
  }

  #print STDERR "Successfully inserted $repeat @ $pos\n";
  substr($seq,$pos,0) = $repeat;
  $simpleBPInserted += length($repeat);
  $attempts = 0;
}

print ">genericBenchmark\n$seq\n";


sub getSimpleRepeat {
  my $simple = shift;
  my $numSimpleEx = $#{$simple} + 1;
  my $simpleInstance = $simple->[ int( rand $numSimpleEx ) ];
  my $orient = rand;
  my ( $lab, $seed, $dir, $exp, $div, $indel ) = split( /:/, $simpleInstance );
  my $seq = $seed x ( int( $exp ) + 1 );
  if ( $orient > 0.5 ) {
    $seq = reverse($seq);
    $seq =~ tr/ACGTNacgtn/TGCANtgcan/;
  }
  my $delPct = int( rand $indel );
  my $insPct = $indel - $delPct;
  
print STDERR "$simpleInstance = Calling with $div, $delPct, $insPct\n";
  # Substitutions
  $seq = addSubs($seq,$div);
  # Deletions
  $seq = addDels($seq,$delPct);
  # Insertions
  $seq = addIns($seq,$insPct);
  $seq = lc($seq);
  
  return $seq;
}


sub addIns
{
  my $seq = shift;
  my $percent = shift;
  my $newSeq = $seq;
  my $numIns = int(( $percent / 100 ) * length($seq));
  my $seqLen = length($seq);
  my %used;
  # Hexadecimal table containing the relative frequencies of
  # 1-15bp insertions.
  # 
  # Derived from:
  # gunzip -c hg38.fa.align.gz | perl -ne '{ if ( /^\s+(\S+)\s+\d+\s(\S+)\s\d+/ ) { $sav = $2; $label = $1; if ( $1 !~ /chr\S+/ ) { $seq .= $sav; } } if ( /^\s*\d+\s+\d+\./ ) {  while ( $seq =~ /(\-+)/ig ) { $gap{length($1)}++; } $seq = ""; } if ( eof ) { for($i=0;$i<16;$i++){ print "$i " . $gap{$i} . "\n"; } } }'
  #
  # Ins_Size	Count
  # 1 3784426
  # 2 1346759
  # 3 739926
  # 4 478935
  # 5 292533
  # 6 195729
  # 7 140717
  # 8 107377
  # 9 83049
  # 10 59827
  # 11 47193
  # 12 41111
  # 13 30760
  # 14 22560
  # 15 19366
  my $insTable = "1"x51   . "2"x18   . "3"x10   . "4"x7 .
                 "5"x4   . "6"x3   . "7"x2   . "8"x2   .
                 "9"x1   . "A"x1   . "B"x1;
  my %insSize = ( "1" => 1,
                  "2" => 2,
                  "3" => 3,
                  "4" => 4,
                  "5" => 5,
                  "6" => 6,
                  "7" => 7,
                  "8" => 8,
                  "9" => 9,
                  "A" => 10,
                  "B" => 11 );

  my %bases = ( "0" => "A", "1" => "C", "2" => "G", "3" => "T" );

  while ( $numIns > 0 )
  {
    my $pos = int(rand(length($newSeq)));
    if ( ! exists $used{$pos} )
    {
      my $insIdx = int(rand(100));
      my $size = $insSize{substr($insTable,$insIdx,1)};
      my $insSeq = "";
      for ( my $i = 0; $i < $size; $i ++ )
      {
        $insSeq .= $bases{int(rand(4))};
      }
      substr($newSeq,$pos,0) = $insSeq;
      #print "Orig: Adding insertion @ $pos : $insSeq \n";
      $used{$pos}++;
      $numIns -= $size;
    }
  }
  return $newSeq;

}

sub addDels
{
  my $seq = shift;
  my $percent = shift;
  my $newSeq = $seq;
  my $numDels = int(( $percent / 100 ) * length($seq));
  my $seqLen = length($seq);
  my %used;
  # Hexadecimal table containing the relative frequencies of
  # 1-15bp deletions.
  # 
  # Derived from:
  # gunzip -c hg38.fa.align.gz | perl -ne '{ if ( /^\s+chr\S+\s+\d+\s(\S+)\s/ ) { $seq .= $1; } if ( /^\s*\d+\s+\d+\./ ) {  while ( $seq =~ /(\-+)/ig ) { $gap{length($1)}++; } $seq = ""; } if ( eof ) { for($i=0;$i<16;$i++){ print "$i " . $gap{$i} . "\n"; } } }'
  #
  # Del Size	Count
  # 1	10665148
  # 2	4073386
  # 3	2534310
  # 4	1842226
  # 5	1067967
  # 6	749984
  # 7	543645
  # 8	432412
  # 9	348161
  # 10	283398
  # 11	219411
  # 12	181928
  # 13	148183
  # 14	118804
  # 15	101787
  my $delTable = "1"x46   . "2"x18   . "3"x11   . "4"x8 .
                 "5"x5   . "6"x3   . "7"x2   . "8"x2   .
                 "9"x2   . "A"x1   . "B"x1   . "C"x1;
  my %delSize = ( "1" => 1,
                  "2" => 2,
                  "3" => 3,
                  "4" => 4,
                  "5" => 5,
                  "6" => 6,
                  "7" => 7,
                  "8" => 8,
                  "9" => 9,
                  "A" => 10,
                  "B" => 11,
                  "C" => 12 );

  while ( $numDels > 0 )
  {
    my $pos = int(rand(length($newSeq)));
    if ( ! exists $used{$pos} )
    {
      my $delIdx = int(rand(100));
      my $size = $delSize{substr($delTable,$delIdx,1)};
      #print "Orig: Adding DELETION size=$size\n";
      substr($newSeq,$pos,$size) = "";
      $used{$pos}++;
      $numDels -= $size;
    }
  }
  return $newSeq;

}

sub addSubs
{
  my $seq = shift;
  my $percent = shift;
  my $newSeq = $seq;
  my $numSubs = int(( $percent / 100 ) * length($seq));
  my $seqLen = length($seq);

  my %used = ();

# Derived from ctools.matrix
#    A   G   C   T
#A   9  -6 -15 -17
#G  -6  10 -15 -15
#C -15 -15  10  -6
#T -17 -15  -6   9
#
# Inverted and scaled for random pick array
#    A   G   C   T
#A   -  57   23  20 
#G   56  -   22  22
#C   22 22    -  56
#T   20 23   57   -
#
  my %subTable = ( "A" => "G"x57 . "C"x23 . "T"x20,
                   "G" => "A"x56 . "C"x22 . "T"x22,
                   "C" => "A"x22 . "G"x22 . "T"x56,
                   "T" => "A"x20 . "G"x23 . "C"x57 );

  while ( $numSubs )
  {
    my $pos = int(rand($seqLen));
    if ( ! exists $used{$pos} )
    {
      my $subPos = int(rand(100));
      substr($newSeq,$pos,1) = 
          substr( $subTable{uc(substr($newSeq,$pos,1))}, $subPos, 1 );
      #print "Adding sub: $pos\n";
      $used{$pos}++;
      $numSubs--;
    }
  }
  return $newSeq;
}

1;
