#!/usr/bin/perl # # n50_booster.pl # # A script to increase the N50 length of a genome/transcriptome assembly # # Author: Keith Bradnam, Genome Center, UC Davis # This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. # This software is provided AS IS, without warranty of any kind. use strict; use warnings FATAL => 'all'; use FAlite; use List::Util qw(sum max min); die "Usage: $0 " unless @ARGV == 1; my ($file) = @ARGV; ############################################################################### # # All documentation and comments have been removed from the code after # this point in accordance with traditional bioinformatics coding practices # ############################################################################### my @l; open(my $in, "<", "$file") or die "Can't open $file\n"; my $fasta = new FAlite(\*$in); while(my $entry = $fasta->nextEntry){ push(@l, length($entry->seq)); } close($in); @l = reverse sort{$a <=> $b} @l; my $tl = sum(@l); my $ls = max(@l); my $p = int(@l*0.75 -1); my $np = $p; for(my $i = $p-1; $i > 0; $i--){ if ($l[$i] == $l[$p]){ $np = $i; next; } else{ last; } } my $ssl = $l[$p]; my $stp = 0; my $min = $np; my $max = @l -1; $stp = sum(@l[$min..$max]); my $rt = 0; my $pre_n50_length = 0; foreach my $l (@l){ $rt += $l; if ($rt > $tl * 0.5){ $pre_n50_length = $l; last; } } warn "\n\nBefore:\n"; warn "==============\n"; warn "Total assembly size = $tl bp\n"; warn "N50 length = $pre_n50_length bp\n\n\n"; warn "Boosting N50...please wait\n\n\n"; my $output_file = "$file" . ".n50"; open(my $out, ">", "$output_file") or die "Can't open $output_file\n"; open($in, "<", "$file") or die "Can't open $file\n"; $fasta = new FAlite(\*$in); while(my $entry = $fasta->nextEntry){ my $seq = uc($entry->seq); my $def = $entry->def; my $l = length($seq); next if ($l <= $ssl); if($l == $ls){ my $padded = "N" x $stp; $seq = $seq . $padded; } my $newseq = tidy_seq($seq); print $out "$def\n$newseq\n"; } close($in); close($out); @l = (); open($in, "<", "$output_file") or die "Can't open $output_file\n"; $fasta = new FAlite(\*$in); while(my $entry = $fasta->nextEntry){ push(@l, length($entry->seq)); } close($in); $rt = 0; my $pt_n50_length = 0; @l = reverse sort{$a <=> $b} @l; foreach my $l (@l){ $rt += $l; my $percent = sprintf("%.2f", $rt / $tl * 100); if ($rt > $tl * 0.5){ $pt_n50_length = $l; last; } } my $improvement = $pt_n50_length - $pre_n50_length; warn "After:\n"; warn "==============\n"; warn "Total assembly size = $tl bp\n"; warn "N50 length = $pt_n50_length bp\n\n\n"; warn "Improvement in N50 length = $improvement bp\n"; warn "See file $output_file for your new (and improved) assembly\n\n"; sub tidy_seq{ my ($seq) = @_; $seq =~ s/[\s\n]//g; $seq =~ tr/a-z/A-Z/; my ($output_seq) = ""; my (@seq2) = ""; my ($start,$end); @seq2 = split(//,$seq); my $l = @seq2; my ($to_add) = int($l/60); $end = $start= 0; foreach (1..$to_add){ $output_seq .= substr($seq,$start,60); $output_seq .= "\n"; $start += 60; } $output_seq .= substr($seq,$start); return ($output_seq); }