#!/usr/bin/perl
#
# price2trf.pl
#
# A script to run take assembled contigs from the PRICE assembler and produce a set
# of tandem repeats from them (using trf). Based on trf_wrapper.pl
#
# Author: Keith Bradnam, Genome Center, UC Davis
# This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
#
# Last updated by: $Author: keith $
# Last updated on: $Date: 2011/07/12 20:49:06 $
use strict;
use warnings;
use Keith;
use FAlite;
use Getopt::Long;
###############################
#
# Command-line options
#
################################
my $match;
my $mismatch;
my $indel;
my $pmatch;
my $pindel;
my $min_score;
my $max_period;
my $min_copies; # minimum number of copies of repeat in read
my $min_length; # minimum length of repeat unit
my $stats; # show summary statistics fir repeats that are skipped/ignored
my $help; # print help
GetOptions ("match:i" => \$match,
"mismatch:i" => \$mismatch,
"indel:i" => \$indel,
"pmatch:i" => \$pmatch,
"pindel:i" => \$pindel,
"min_score:i" => \$min_score,
"period:i" => \$max_period,
"copies:f" => \$min_copies,
"length:i" => \$min_length,
"stats" => \$stats,
"help" => \$help
);
###############################
#
# Set defaults
#
################################
# set defaults if not specified on command line
# these are all defaults used (and suggested) by the trf program
$match = 1 if (!$match);
$mismatch = 1 if (!$mismatch);
$indel = 2 if (!$indel);
$pmatch = 80 if (!$pmatch);
$pindel = 5 if (!$pindel);
$min_score = 200 if (!$min_score);
$max_period = 750 if (!$max_period);
# these are extra options that we can only implement through post-processing of raw trf output
$min_copies = 2 if (!$min_copies);
$min_length = 5 if (!$min_length);
###############################
#
# Check command line options
#
################################
# usage
my $usage = "
usage: trf_wrapper.pl [options]
options:
-match [$match]
-mismatch [$mismatch]
-indel [$indel]
-pmatch (probability of match) [$pmatch]
-pindel (probability of indel) [$pindel]
-min_score [$min_score]
-period (maximum period length) [$max_period]
-copies (minimum copies) [$min_copies]
-length (minimum repeat length) [$min_length]
-stats : show details of which tandem repeats were skipped or ignored
-help : print this help
";
die $usage if ($help);
die $usage unless (@ARGV == 1);
my ($file) = @ARGV;
###############################
#
# Variables
#
###############################
my @param = ($match, $mismatch, $indel, $pmatch, $pindel, $min_score, $max_period);
my %seq_to_length; # need to know lengths of all sequences, key to hash is the fasta header
my %errors; # Keep track of various errors
$errors{'repeat_length'} = $errors{'min_copies'} = $errors{'min_repeat_fraction'} = $errors{'length_multiple'} = 0;
my $seq_counter = 0; # how many input sequences do we process
my $repeat_counter = 0; # number of tandem repeats predicted by trf
my $output_counter = 0; # number of tandem repeats that pass all filters
my $total_nt = 0; # keep track of the total size of sequence in the input files
my $repeat_cutoff = 0.25; # what fraction of a contig has to be made up on tandem repeats in order for us to keep it
my $excluded_trf_nt = 0; #
###############################
#
# Loop over input file
#
###############################
# form name of output file from parameters
my $data_file = $file . "." . join(".", @param) . ".dat";
# We may already have run trf on the same sequence file and just want to tweak the parameters for deciding which
# repeats to keep, so can make the script use an existing data file if one exists
if(-e $data_file){
warn "NOTE: A data file with the chosen parameters already exists ($data_file), will use this instead of re-running trf\n\n";
}
else{
system("trf $file $match $mismatch $indel $pmatch $pindel $min_score $max_period -d -h > /dev/null") or die "Can't run trf\n";
}
# now process trf output file
process_trf_output($file, $data_file);
# Statistics on what was rejected/outside of thresholds
if($stats){
my $total_rejections = $errors{'repeat_length'} + $errors{'min_copies'} + $errors{'min_repeat_fraction'} + $errors{'length_multiple'};
warn "\nProcessed $seq_counter sequences that contained $repeat_counter repeats, $output_counter of which matched all criteria\n";
warn "$total_rejections repeats rejected for failing criteria:\n";
warn "\t$errors{'min_repeat_fraction'} repeats rejected for making up too low a fraction of the total sequence (<$repeat_cutoff)\n";
warn "\t$errors{'repeat_length'} repeats rejected for being too short (<$min_length nt)\n";
warn "\t$errors{'min_copies'} repeats rejected for having too few copies (<$min_copies)\n";
warn "\t$errors{'length_multiple'} repeats rejected for being a multiple of a shorter repeat\n\n";
my $excluded_percent = sprintf("%.2f",($excluded_trf_nt / $total_nt)*100);
warn "Total amount of tandem repeats in excluded repeat fraction (<$repeat_cutoff): ", int($excluded_trf_nt), " nt ($excluded_percent%)\n\n";
}
exit(0);
###############################
#
#
# S U B R O U T I N E S
#
#
################################
sub process_trf_output{
my ($file,$data) = @_;
# want to capture the (slightly processed) FASTA headers in the trf output
my $header;
# now need to capture lengths of all sequences in input file (frustratingly, this is not in the TRF output)
calculate_seq_lengths($file);
open(DATA,"<$data") or die "Can't open data file\n";
my $seq_repeat_counter; # to keep track of when a sequence contains more than one repeat
my @seq_repeat_data; # keep track of data on all repeats in a sequence to see if they are just multiples of each other
REPEAT: while(){
# skip blank likes
next if (m/^$/);
if (m/^Sequence: (.*)/) {
$header = $1;
$seq_counter++;
# reset certain counters and data
$seq_repeat_counter = 0;
@seq_repeat_data = ();
# and now move to next line in file
next REPEAT;
}
# the main output that we are interested in will all be on one line which starts with various
# numerical details of the repeat
if(m/^\d+ \d+ \d+ \d+\.\d /){
$repeat_counter++;
# capture repeat data into various variables (most will be unsused)
my ($start,$end,$period,$copies,$consensus,$matches,$indels,$score,$a,$c,$g,$t,$entropy,$repeat_seq) = split(/\s+/);
my $repeat_length = length($repeat_seq);
my $total_repeat_span = $end - $start + 1;
my $repeat_fraction = $total_repeat_span / $seq_to_length{$header};
my $tidied = Keith::tidy_seq($repeat_seq);
###################################################################
# now apply various criteria to see if we want to keep this repeat
###################################################################
# does the repeat occupy enough of the read that it is contained in?
if ($repeat_fraction < $repeat_cutoff){
$errors{"min_repeat_fraction"}++;
$excluded_trf_nt += ($copies * $repeat_length);
next REPEAT;
}
# is the repeat unit long enough? We certainly want to dicount tandem monomers, dimers etc.
if ($repeat_length < $min_length){
$errors{"repeat_length"}++;
next REPEAT;
}
# are there enough copies of the repeat in the read?
if ($copies < $min_copies){
$errors{"min_copies"}++;
next REPEAT;
}
# if we get this far then we will capture some of the repeat data in a hash
# this is to potentially compare to other repeats in the same sequence
$seq_repeat_counter++;
$seq_repeat_data[$seq_repeat_counter]{'coords'} = "$start $end";
$seq_repeat_data[$seq_repeat_counter]{'consensus'} = "$consensus";
$seq_repeat_data[$seq_repeat_counter]{'copies'} = "$copies";
# Is this repeat just a multiple of a previously seen repeat in the same trace sequence?
if ($seq_repeat_counter > 1){
# loop through previous repeats
for(my $i = 1; $i < @seq_repeat_data; $i++){
if("$start $end" eq $seq_repeat_data[$i]{'coords'}){
my $result = $consensus / $seq_repeat_data[$i]{'consensus'};
my $processed_result = $result;
$processed_result =~ s/\d+\.(\d+)/0\.$1/;
# we are not expecting one repeat to be a perfect multiple of another repeat, so we'll allow
# some flexibility.
if(($processed_result < 0.15 || $processed_result > 0.85) && ($result > 1.5)){
# print "$header\tRepeat $seq_repeat_counter: span = $start $end, consensus length = $consensus, copies = $copies\tTHIS IS A MULTIPLE ($result) OF:\n";
# print "$header\tRepeat $i: span = $seq_repeat_data[$i]{'coords'}, consensus length = $seq_repeat_data[$i]{'consensus'}, copies = $seq_repeat_data[$i]{'copies'}\n";
$errors{"length_multiple"}++;
next REPEAT;
}
}
}
}
# if we are here then we are keeping the repeat and printing it out
$output_counter++;
my $formatted = sprintf("%.0f",$repeat_fraction * 100);
my $mass = $repeat_length * $copies;
print ">tandem-$output_counter L=$repeat_length N=$copies M=$mass F=$formatted% P=$header\n";
print "$tidied\n";
}
}
close(DATA);
}
sub calculate_seq_lengths{
my ($file) = @_;
open (FILE,"<$file") || die "Can't open file $file\n";
my $fasta = new FAlite(\*FILE);
while(my $entry = $fasta->nextEntry) {
# extract seq length & header and add to hash and to grand total
my $length = length($entry->seq);
$total_nt += $length;
my $header = $entry->def;
$header =~ s/>//;
$seq_to_length{$header} = $length;
}
close(FILE);
}