#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Data::Dumper;

#----------------------------------------------------------------------
# the situation: how clipped alignments sit on a contig (x = aligned base, - = clipped base)
#                      1     6     11    16    21    26  30
# contig:              NNNNN NNNNN NNNNN NNNNN NNNNN NNNNN
#
# 11,20,+   10M                    xxxxx xxxxx
# 11,20,+   6M4S                   xxxxx x----
# 11,20,+   4S6M                   ----x xxxxx
# 11,20,+   3S5M2S                 ---xx xxx--

#----------------------------------------------------------------------
# globals

my $EXE = basename($0);
my $VERSION = "0.4.0";
my $AUTHOR = 'Torsten Seemann (@torstenseemann)';
my $HOMEPAGE = "https://github.com/tseemann/samclip";

# SAM file TSV columns
use constant {
  SAM_RNAME => 2,
  SAM_POS   => 3,
  SAM_CIGAR => 5,
  SAM_TLEN  => 8,
  SAM_SEQ   => 9,
};

#----------------------------------------------------------------------
# command line parameters

my $max      = 5;
my $ref      = '';
my $invert   = 0;
my $debug    = 0;
my $progress = 100_000;

#----------------------------------------------------------------------
# Print the help text and terminate the program.
#
#   $exitcode - process exit status. Undefined, or the string 'help', is
#               treated as 0. The 'help' case arises because this sub is
#               registered as the --help handler in GetOptions, which passes
#               the option name as the first argument.
#
# Help goes to STDOUT on a clean exit (status 0) and to STDERR otherwise.
# Never returns.
sub usage {
  my($exitcode) = @_;
  $exitcode = 0 if !defined($exitcode) or $exitcode eq 'help';
  my $fh = $exitcode ? \*STDERR : \*STDOUT;
  print {$fh}
    "SYNOPSIS\n  Filter SAM file for soft & hard clipped alignments\n",
    "AUTHOR\n  $AUTHOR\n",
    "USAGE\n",
    "  % samclip --ref ref.fa < in.sam > out.sam\n",
    "  % minimap2 ref.fa R1.fq R2.fq | samclip --ref ref.fa | samtools sort > out.bam\n",
    "OPTIONS\n",
    "  --help         This help\n",
    "  --version      Print version and exit\n",
    "  --ref FASTA    Reference genome - needs FASTA.fai index\n",
    "  --max NUM      Maximum clip length to allow (default=$max)\n",
    "  --invert       Output rejected SAM lines and ignore good ones\n",
    "  --debug        Print verbose debug info to stderr\n",
    # placeholder is NUM (not N) so it matches the description and --max
    "  --progress NUM Print progress every NUM records (default=$progress,none=0)\n",
    "HOMEPAGE\n  $HOMEPAGE\n",
    "";
  exit($exitcode);
}

#----------------------------------------------------------------------
# getopts

# Invoked with no arguments at all: show the help and exit with failure.
@ARGV or usage(1);

# --help and --version are handled by callbacks that exit immediately;
# everything else fills the defaults declared above.
GetOptions(
  "help"       => \&usage,
  "version"    => \&version,
  "ref=s"      => \$ref,
  "max=i"      => \$max,
  "invert"     => \$invert,
  "debug"      => \$debug,
  "progress=i" => \$progress,
) or usage(1);

$ref or err("Please supply reference genome with --ref");
$max >= 0 or err("Please supply --max >= 0");

# --ref may name either the FASTA or its .fai index. From here on $ref is
# the index path; the FASTA name is kept separately so the hint below tells
# the user to index the FASTA rather than the (missing) .fai file.
(my $fasta = $ref) =~ s/\.fai$//;
$ref = "$fasta.fai";
-r $ref or err("Can't see '$ref' index. Run 'samtools faidx $fasta' ?");

# No input files named and nothing piped in: there is no SAM to read.
if (!@ARGV and -t STDIN) {
  err("Please provide or pipe a SAM file to $EXE");
}

#----------------------------------------------------------------------
# main
msg("$EXE $VERSION by $AUTHOR");

# get a hash of { seqname => length } from the FASTA index
msg("Loading: $ref");
my $len = fai_to_dict($ref);
msg(Dumper($len)) if $debug;
msg("Found", scalar keys %$len, "sequences in $ref");

# counters reported in the closing summary
my $total=0;    # alignment records read (header lines excluded)
my $removed=0;  # clipped records rejected for exceeding --max
my $kept=0;     # clipped records that were still acceptable
my $header=0;   # header lines seen

# read SAM one line at a time
while (my $line = <ARGV>) {
  # SAM header lines are always echoed, with or without --invert
  if ($line =~ m/^@/) {
    print $line;
    $header++;
    next;
  }
  $total++;
  msg("Processed $total records...") if $progress and $total % $progress == 0;
  my @sam = split m/\t/, $line;
  my $rejected = 0;
  # do a quick 'clipped?' check before heavyweight parsing
  if ($sam[SAM_CIGAR] =~ /\d[SH]/) {
    # leading hard/soft clip lengths and trailing soft/hard clip lengths
    my($HL, $SL, $SR, $HR)
      = ($sam[SAM_CIGAR] =~ m/ ^ (?:(\d+)H)? (?:(\d+)S)? .*? (?:(\d+)S)? (?:(\d+)H)? $/x);
    $HL ||= 0; $SL ||= 0; $SR ||= 0; $HR ||= 0;
    # if either end is clipped more than --max allowed, then remove it
    # unless it is at a contig end
    my $start = $sam[SAM_POS];
    # NOTE(review): the end is derived from the SEQ length, not from the
    # CIGAR's reference-consuming operations - confirm this is intended.
    my $end = $start + length($sam[SAM_SEQ]) - 1;
    my $contiglen = $len->{$sam[SAM_RNAME]} or err("Reference", $sam[SAM_RNAME], "not in '$ref'");
    my $L = $HL + $SL;
    my $R = $HR + $SR;
    # clipping that runs into a contig boundary is forgiven
    $L = 0 if $start <= 1+$L;
    $R = 0 if $end >= $contiglen-$R;
    $rejected = 1 if $L > $max or $R > $max;
    if ($debug) {
      my $info = "CHROM=$sam[SAM_RNAME]:1..$contiglen POS=$start..$end CIGAR=$sam[SAM_CIGAR] L=$L R=$R | HL=$HL SL=$SL SR=$SR HR=$HR max=$max)";
      msg(($rejected ? "BAD!" : "GOOD") . " $info");
    }
    if ($rejected) {
      $removed++;
    }
    else {
      $kept++;
    }
  }
  # Normal mode passes every record that was not rejected; --invert emits
  # only the rejected records (as documented in the usage text).
  print $line if $invert ? $rejected : !$rejected;
}
# stats
msg("Total SAM records $total, removed $removed, allowed $kept, passed", $total-$removed);
msg("Header contained $header lines");
msg("Done.");

#----------------------------------------------------------------------
# Load a FASTA index (.fai) file into a lookup of sequence lengths.
#
#   $fname  - path to the tab-separated index file; the first column is
#             taken as the sequence name and the second as its length.
#
# Returns a hash reference { name => length }.
# Calls err() (which exits) if the file cannot be opened.
sub fai_to_dict {
  my($fname) = @_;
  open my $fai, '<', $fname or err("Can't read FAI '$fname': $!");
  my %length_of;
  while (my $row = <$fai>) {
    # strip the line ending so a two-column line does not leave a newline
    # glued to the length
    $row =~ s/\r?\n\z//;
    next unless length $row;            # ignore blank lines
    my($name, $bp) = split m/\t/, $row;
    # skip rows that carry no usable name/length pair
    next unless defined $name and length $name and defined $bp;
    $length_of{$name} = $bp;
  }
  close $fai;
  return \%length_of;
}


#----------------------------------------------------------------------
# Report the program name and version on STDOUT, then exit successfully.
# Registered as the --version handler; never returns.
sub version {
  my $banner = join ' ', $EXE, $VERSION;
  print $banner, "\n";
  exit 0;
}

#----------------------------------------------------------------------
# Write one log line to STDERR: the arguments joined the way list
# interpolation joins them, prefixed with the program name in brackets.
sub msg {
  my @words = @_;
  print {*STDERR} "[$EXE] @words\n";
}

#----------------------------------------------------------------------
# Log the arguments through msg() behind an "ERROR:" tag, then terminate
# the program with exit status 1. Never returns.
sub err {
  my @reason = @_;
  msg("ERROR:", @reason);
  exit 1;
}

