package Keith; # a module of frequently used bioinformatics subroutines used by Keith Bradnam # Last updated by: $Author: keith $ # Last updated on: $Date: 2011/07/28 22:18:25 $ use strict; use FAlite; use List::Util qw(sum); # simple subroutine to just count nucleotide frequencies # also counts N's separately, and finally works out if # there is anything other than A,T,C,G or N sub get_mononucleotides{ # receive sequence, make sure it is all uppercase my $seq = uc(shift); my $length = length($seq); my ($a,$c,$g,$t,$n,$o) = (0,0,0,0,0,0); # count mononucleotide frequencies $a = ($seq =~ tr/A/A/); $c = ($seq =~ tr/C/C/); $g = ($seq =~ tr/G/G/); $t = ($seq =~ tr/T/T/); $n = ($seq =~ tr/N/N/); # calculate remainder (in case there are other characters present) $o = $length - $a - $c - $g -$t - $n; return($a,$c,$g,$t,$n,$o); } # simple subroutine to just count dinucleotide frequencies # ignores any non-A,C,G,T character sub get_dinucleotides{ # receive sequence, make sure it is all uppercase my $seq = uc(shift); my $length = length($seq); # initialize hash my %di; ($di{'AA'},$di{'AC'},$di{'AG'},$di{'AT'}) = (0,0,0,0,0); ($di{'CA'},$di{'CC'},$di{'CG'},$di{'CT'}) = (0,0,0,0,0); ($di{'GA'},$di{'GC'},$di{'GG'},$di{'GT'}) = (0,0,0,0,0); ($di{'TA'},$di{'TC'},$di{'TG'},$di{'TT'}) = (0,0,0,0,0); # want to keep track of any dinucleotides that have something other than A,C,G,or T. my $other=0; # count dinucleotide frequencies foreach my $i (0..length($seq)){ my $tmp = substr($seq,$i,2); $di{$tmp}++; # we might see things like 'NA' or 'CN' so we will keep a count of these as well $other++ if ($tmp !~ m/^[ACGT]*$/); } return($di{'AA'},$di{'AC'},$di{'AG'},$di{'AT'},$di{'CA'},$di{'CC'},$di{'CG'},$di{'CT'},$di{'GA'},$di{'GC'},$di{'GG'},$di{'GT'},$di{'TA'},$di{'TC'},$di{'TG'},$di{'TT'},$other); } # calculate a z test statistic from two sets of numbers. # will also calculate (and return), the means, standard deviations, # and standard errors of the means for both datasets sub z_test{ my $a = shift; my $b = shift; # 3rd value can be an optional level of precision # defaults to 4 dp my $precision = shift; $precision = 4 if (!$precision); # and now all quite simple calculations.... my $count_a = @$a; my $count_b = @$b; my $sum_a = sum(@$a); my $sum_b = sum(@$b); my $mean_a = sprintf("%.${precision}f",$sum_a/$count_a); my $mean_b = sprintf("%.${precision}f",$sum_b/$count_b); my $std_dev_a = sprintf("%.${precision}f",sqrt(sum(map {($_ - $mean_a) ** 2} @$a) / ($count_a -1))); my $std_dev_b = sprintf("%.${precision}f",sqrt(sum(map {($_ - $mean_b) ** 2} @$b) / ($count_b -1))); my $std_err_a = sprintf("%.${precision}f",$std_dev_a/sqrt($count_a)); my $std_err_b = sprintf("%.${precision}f",$std_dev_b/sqrt($count_b)); # ... leading up to the final z statistic my $z = sprintf("%.${precision}f",($mean_a - $mean_b)/sqrt(($std_dev_a**2/$count_a)+($std_dev_b**2/$count_b))); return($z,$mean_a,$mean_b,$std_dev_a,$std_dev_b,$std_err_a,$std_err_b); } #adds a new line character every 60 bases sub tidy_seq{ my ($seq) = @_; $seq =~ s/[\s\n]//g; $seq =~ tr/a-z/A-Z/; my ($output_seq) = ""; my (@seq2) = ""; my ($start,$end); @seq2 = split(//,$seq); my $length = @seq2; my ($to_add) = int($length/60); $end = $start= 0; foreach (1..$to_add){ $output_seq .= substr($seq,$start,60); $output_seq .= "\n"; $start += 60; } $output_seq .= substr($seq,$start); return ($output_seq); } # reverse complement a sequence sub revcomp{ my ($seq) = @_; $seq = uc($seq); my $revcomp = reverse $seq; $revcomp =~ tr/ACGTRSKBD/TGCAYWMVH/; return ($revcomp); } # produce a random integer between x and y sub get_random_integer{ my($min, $max) = @_; # Assumes that the two arguments are integers themselves! return $min if $min == $max; ($min, $max) = ($max, $min) if $min > $max; return $min + int rand(1 + $max - $min); } # take a FASTA file and just double up all the sequences # can be useful for helping find the 'edges' of a tandem repeat sub tandemify_sequence{ my ($input_file, $output_file) = @_; # choose new output name if necessary if (not defined $output_file){ $output_file = $input_file; # check for .fa, .fasta, .seq, or .dna ending if ($output_file =~ m/(fa|fasta|seq|dna)$/){ my $suffix = $1; $output_file =~ s/(fa|fasta|seq|dna)$//; $output_file .= "x2.$suffix"; } else { $output_file = $input_file . ".x2"; } } # parse info from FASTA file my ($in, $out); open($in, "<", "$input_file") or die "Can't read from $input_file\n"; open($out, ">", "$output_file") or die "Can't write to $output_file\n"; my $fasta = new FAlite(\*$in); # loop through each sequence in target file while(my $entry = $fasta->nextEntry){ my $seq = Keith::tidy_seq($entry->seq); my $header = $entry->def; print $out "$header\n$seq\n\n$seq\n\n"; } close($in); close($out); } 1; __END__ =head1 NAME Keith; =head1 DESCRIPTION Keith is a package that has some basic bioinformatic functions which are mostly applicable to working with FASTA files =head1 FUNCTIONS =head2 get_mononucleotides Returns a list of counts of A,C,G,T,N, and 'Other' bases in a specified string (presumed to be DNA). 'Other' is a count of everything which isn't a A,C,G,T or N, and while it potentially may include other nucleotide ambiguity characters (R,Y,S,W etc.) it can be used as a crude error checking method (if there are more 'Other' characters than everything else, it's probably a protein sequence; if there are zero Ts but plenty of 'Other', you may have an RNA sequence) Example: # a,c,g, and t are the first 4 characters returned my ($a,$c,$g,$t) = get_mononucleotides("aaccacacaggaggattttgaaag"); # but six values are always returned for any string my ($a,$c,$g,$t,$n,$other) = get_mononucleotides($sequence); =head2 get_dinucleotides Returns a list of counts of all 16 dinucleotides, and 'Other' for any specified string (presumed to be DNA). 'Other' is a count of any dinucleotide which includes something other than A,C,T, or G. The percentage calculations exclude these 'Other' dinucleotides. Example: my (@dinucs) = get_dinucleotides("aaccacacaggaggattttgaaag"); =head2 z_test If you have two lists of numbers (in separate arrays), send them to the z_test function and it will calculate a z test statistic which will let you know how different the means of the two samples are (and whether the difference is significant). It will also (if you want) return the information used to calculate the z statistic (means, standard deviations, and standard errors) The function defaults to a level of 4 decimal places for precision, but you change this with the third argument to the function Example (increasing level of precision to 5 dp): my ($z,$mean_a,$mean_b,$std_dev_a,$std_dev_b,$std_err_a,$std_err_b) = Keith::z_test(\@a,\@b,5); =head2 tidy_seq Simply pass subroutine a DNA/RNA/protein sequence (just the sequence, not the header) as a string and subroutine will tidy up sequence to be all upper case and just 60 characters per line. =head2 revcomp Returns the reverse complement of a DNA sequence =head2 get_random_integer Specify a minimum and maximum value, and this routine just calculates a random integer between those values =head2 tandemify_sequence Take a FASTA file and just doubles every sequence and writes to a new file (which can be specified as a second argument) =head1 AUTHOR Keith Bradnam (krbradnam@ucdavis.edu) =head1 ACKNOWLEDGEMENTS This software was developed at the Genome Center, UC Davis Davis, CA, USA =head1 COPYRIGHT Copyright (C) 2008. Keith Brandam. All Rights Reserved. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =cut