#!/usr/bin/perl # HTSanalysis - program that analyse sequences generated by # Hight Thoughput Sequinator such as: 454, Solexa # for Bind-N-Seq use strict; use warnings; use Az; use DataBrowser; use Getopt::Long; my $SEQ_FILE = ""; # input file my $bcCondFile = ""; # file with barCodes-condition to use, one per line my $hts = "s"; # type of Hight Throughput Sequinator: 454 - 454; # s - solexa my $stat = ""; # perform statistics my $partition = ""; # perform partitioning my $partOpt = "clean"; # get part of the sequences: clean -clean my $partBC = "AAA"; # for partitioning what to print: all, CGT (somebarCodes) my $partRegion = "all"; # for partitioning what regioin to pritn: all; random die "usage: HTSanalysis.pl [options] options: -input input file -bcCondFile file with barCodes-condition to use, one per line -hts type of Hight Troughput Sequinator: $hts -stat perform and print statistics -partition get some part of input sequence and print -partOpt partition option: $partOpt clean - filter to get clean data set; extraClean - filter to get extraClean data set; entropy - filtered by entropy score; barCode - filter for specific barCode; -partBC for partitioning what barCode to print: $partBC -partRegion for partitioning what region to print: $partRegion all - all sequence; random - random region of sequence; " unless @ARGV > 1; GetOptions("input=s" => \$SEQ_FILE, "bcCondFile=s" => \$bcCondFile, "hts=s" => \$hts, "stat" => \$stat, "partition" => \$partition, "partOpt=s" => \$partOpt, "partBC=s" => \$partBC, "partRegion=s" => \$partRegion); die "input SEQ_FILE\n" if $SEQ_FILE eq ""; die "input bcCondFile\n" if $bcCondFile eq ""; ################################################################################ ####### Parameters and default values ######################################## ################################################################################ my @nucleotides = qw(A C G T); my %barcode = (); ## barCodes from file - file must contain all barCodes and has priority over $bcNum if ($bcCondFile) { open (BCIN, "$bcCondFile") or die; while () { my @f = split(/\s/, $_); $barcode{$f[0]} = $f[1]; } } close BCIN; ## hts conditions my %seq = (); # input sequences my %bc_rand_seq = (); # random sequences devided by BarCode my %uniqueBC_R = (); # hash with keys equal unique barCode+Random sequences my $bcS; # position within sequence where barCode Starts my $jel_true; # real left junk edge my $je_lS; # real left junk edge Starts my $je_lL; # real left junk edge Length my $jer_true; # real right junk edge my $je_rS; # real right junk edge Starts my $je_rL; # real right junk edge Length my $bc_rS; # barCode_random Starts my $bc_rL; # barCode_random Length my $randS; # random region Starts my $randL; # random region Legth if ($hts eq "454") { ## 454 parameters $bcS = 20; $jel_true = "GCCTCCCTCGCGCCATCAG"; $je_lS = 1; $je_lL = 19; $bc_rS = 20; $bc_rL = 24; $randS = 23; $randL = 21; } elsif ($hts eq "s") { ## solexa parameters $bcS = 3; $jel_true = "AA"; $je_lS = 1; $je_lL = 2; $jer_true = "AGATCGGAA"; $je_rS = 27; $je_rL = 9; $bc_rS = 3; $bc_rL = 24; $randS = 6; $randL = 21; } else {die "ERROR: hts $hts incorrect!!!\n"} ################################################################################ ### Reading input-sequence file ### ################################################################################ open (SEQ, "$SEQ_FILE") or die; while () { my $def = $_; chomp $def; my $s = ; chomp $s; # if ($seq{$def}) {die "$def already exist!!!\n"} # if ($s =~ /\W/) {die "$foo: sequence has non word letter\n"} $seq{$def} = $s; } close SEQ; print "finished reading input_file\n"; ################################################################################ ### Partitioning ### ################################################################################ if (!$partition) {goto STEP1} print "started partitioning\n"; ## print clean sequences ------------------------------------------------ if ($partOpt eq "clean") { if (-e "clean_seq.fa") {die "clean_seq.fa already exist\n"} open (PART, ">clean_seq.fa") or die; my %uniqueBC_R = (); foreach my $def (keys %seq) { my $bc = substr($seq{$def}, $bcS - 1, 3); if (!defined $barcode{$bc}) {next} # barCode must be defined if ($seq{$def} =~ /[^ACGT]/) {next} # sequence must contain only [ACGT] my $jel = substr($seq{$def}, $je_lS - 1, $je_lL); my $jer = substr($seq{$def}, $je_rS - 1, $je_rL); if ($jel ne $jel_true) {next} # junk edge left must == $jel_true if ($jer ne $jer_true) {next} # junk edge right must == $jer_true my $bc_r = substr($seq{$def}, $bc_rS - 1, $bc_rL); if ($uniqueBC_R{$bc_r}) {next} # check uniqness of barCode_Random else {$uniqueBC_R{$bc_r} = 2} print PART "$def\n$seq{$def}\n"; } close PART; } #------------------------------------------------------------------------------- ## print extraClean sequences ------------------------------------------------ # disclude all sequences that equal or different in only one nucleotide # within random region if ($partOpt eq "extraClean") { if (-e "extraClean_seq.fa") {die "extraClean_seq.fa already exist\n"} open (PART, ">extraClean_seq.fa") or die; my %ten_mer = (); my $i = 1; my %unique = (); my %ununique = (); foreach my $def (keys %seq) { my $bc = substr($seq{$def}, $bcS - 1, 3); if (!defined $barcode{$bc}) {next} # barCode must be defined if ($seq{$def} =~ /[^ACGT]/) {next} # sequence must contain only [ACGT] my $jel = substr($seq{$def}, $je_lS - 1, $je_lL); my $jer = substr($seq{$def}, $je_rS - 1, $je_rL); if ($jel ne $jel_true) {next} # junk edge left must == $jel_true if ($jer ne $jer_true) {next} # junk edge right must == $jer_true my $rand = substr($seq{$def}, $randS - 1, $randL); ## save to %ten_mer two chomped 10_mer sequences push @{$ten_mer{substr($rand, 0, 10)}}, $i; push @{$ten_mer{substr($rand, 10, 10)}}, $i; $unique{$i} = $def; # assign to each $def a number $i++; } #### filter out all sequences that similar or differ in one nucleotide foreach my $part (keys %ten_mer) { if (@{$ten_mer{$part}} == 1) {delete($ten_mer{$part}); next;} for (my $j = 0; $j < @{$ten_mer{$part}} - 1; $j++) { my $id1 = $ten_mer{$part}[$j]; my $randj = $seq{$unique{$id1}}; my @first = split(//, substr($randj, $randS - 1, $randL)); for (my $k = $j + 1; $k < @{$ten_mer{$part}}; $k++) { if ($ten_mer{$part}[$j] == $ten_mer{$part}[$k]) {next} ## mark un_unique sequences my $id2 = $ten_mer{$part}[$k]; if ($ununique{$id1} and $ununique{$id2}) {next} my $randk = $seq{$unique{$id2}}; my @second = split(//, substr($randk, $randS - 1, $randL)); my $diff = 0; for (my $r = 0; $r < @first; $r++) { if ($diff > 1) {last} if ($first[$r] ne $second[$r]) {$diff++} } if ($diff <= 1) { $ununique{$id1} = 1; $ununique{$id2} = 1; } } } } %ten_mer = (); foreach my $id (keys %ununique) { delete($unique{$id}); } foreach my $id (keys %unique) { my $def = $unique{$id}; print PART "$def\n$seq{$def}\n"; } close PART; } #------------------------------------------------------------------------------- ## print sequences with particular barCode -------------------------------- if ($partOpt eq "barCode") { unless (-d "partBC") {system("mkdir partBC") == 0 or die} if ($partRegion eq "all") { if (-e "partBC\/$partBC-$SEQ_FILE") { die "partBC\/$partBC-$SEQ_FILE already exist\n" } open (PART, ">partBC\/$partBC-$SEQ_FILE") or die; } else { if (-e "partBC\/$partBC-$partRegion-$SEQ_FILE") { die "partBC\/$partBC-$partRegion-$SEQ_FILE already exist\n" } open (PART, ">partBC\/$partBC-$partRegion-$SEQ_FILE") or die; } foreach my $def (keys %seq) { my $bc = substr($seq{$def}, $bcS - 1, 3); if ($partBC ne $bc) {next} if ($partRegion eq "all") { print PART "$def\n$seq{$def}\n"; } elsif ($partRegion eq "random") { my $rand = substr($seq{$def}, $randS - 1, $randL); print PART "$def\n$rand\n"; } else {die "problem with partRegion\n"} } close PART; } #------------------------------------------------------------------------------- #my @TS = qw(0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2); #my %entropy_distr = (); ## calculating entropy in random region # my $rand = substr($seq{$s}, $randS - 1, $randL); # my @letters = split(//, $rand); # my %tempHash = (); # foreach my $nuc (@letters) { # $tempHash{$nuc}++; # } # my $foo_e = 0; # foreach my $nuc (keys %tempHash) { # my $p = $tempHash{$nuc}/$randL; # $foo_e += ($p)*(log($p)/log(2)); # } # foreach my $threshold (@TS) { # if (-$foo_e <= $threshold) { # push @{$entropy_distr{$threshold}}, $rand; # last; # } # } #open (OUT, ">entropyDistr_rand_$SEQ_FILE") or die; #foreach my $th (sort keys %entropy_distr) { # my $i = 0; # foreach my $seq (@{$entropy_distr{$th}}) { # if ($i > 4) {last} # print OUT "$seq\t$th\n"; # $i++; # } #} #close OUT; ################################################################################ ### STATISTICS: sequences analysis ### ################################################################################ STEP1: print "STEP1\n"; if (!$stat) {goto STEP2} print "started statistics\n"; my @seqFileParts = split(/\./, $SEQ_FILE); my $seqFileName = $seqFileParts[0]; open (STAT, ">stat-$seqFileName.ha") or die; ## distribution of sequences length ---------------------------------------- my %seq_length = (); foreach my $s (keys %seq) { my $l = length($seq{$s}); $seq_length{$l}++; } print STAT " distribution of sequences length:\n"; print STAT ${Az::printHashColumn(\%seq_length, "key")}; #------------------------------------------------------------------------------- ## distribution of N in sequences ---------------------------------------- my $totalN = 0; my $seqN = 0; # number of sequences with at list one N my $jelN = 0; # total number of N in jel my $bcN = 0; # total number of N in bc my $randomN = 0; # total number of N in random my $jerN = 0; # total number of N in jer my @foo = (); foreach my $s (keys %seq) { my $jel = substr($seq{$s}, $je_lS - 1, $je_lL); my $bc = substr($seq{$s}, $bcS - 1, 3); my $random = substr($seq{$s}, $randS - 1, $randL); my $jer = substr($seq{$s}, $je_rS - 1, $je_rL); if ($seq{$s} =~ /N/) {$seqN++} $jelN += @foo = ($jel =~ /N/g); $bcN += @foo = ($bc =~ /N/g); $randomN += @foo = ($random =~ /N/g); $jerN += @foo = ($jer =~ /N/g); } $totalN = $jelN + $bcN + $randomN + $jerN; print STAT "\nchecking distribution of N in sequences\n"; print STAT "number of sequences with at list one N: ", $seqN, "\n"; print STAT "total number of N: ", $totalN, "\n"; print STAT "total number of N in jel: ", $jelN, "\n"; print STAT "total number of N in bc: ", $bcN, "\n"; print STAT "total number of N in random: ", $randomN, "\n"; print STAT "total number of N in jer: ", $jerN, "\n"; #------------------------------------------------------------------------------- ## distribution of junk edges -------------------------------------------- my %junk_edge_l = (); # left junk edge my %junk_edge_r = (); # right junk edge for solexa only foreach my $s (keys %seq) { my $jel = substr($seq{$s}, $je_lS - 1, $je_lL); my $jer = substr($seq{$s}, $je_rS - 1, $je_rL); $junk_edge_l{$jel}++; $junk_edge_r{$jer}++; } print STAT "\njunk edge left sequences distribution, only 1000 most friquent:\n"; print STAT ${Az::printHashColumn(\%junk_edge_l, "value", 1000)}; print STAT "\njunk edge right sequences distribution, only 1000 most friquent:\n"; print STAT ${Az::printHashColumn(\%junk_edge_r, "value", 1000)}; print "fininshed junk edge\n"; %junk_edge_l = (); %junk_edge_r = (); #------------------------------------------------------------------------------- ## number of changes AT->CG and CG->AT within $jer compare to real $jer_true --- my $ATtoCG = 0; my $CGtoAT = 0; foreach my $s (keys %seq) { my $jer = substr($seq{$s}, $je_rS - 1, $je_rL); my @r_real = split(//, $jer_true); my @r = split(//, $jer); for (my $i = 0; $i < 9; $i++) { if ($r[$i] ne $r_real[$i]) { if (($r[$i] =~ /[AT]/) and ($r_real[$i] =~ /[CG]/)) {$CGtoAT++} elsif (($r[$i] =~ /[CG]/) and ($r_real[$i] =~ /[AT]/)) {$ATtoCG++} } } } print STAT "\nchecking changes AT->CG and CG->AT within jer compare to jer_True\n"; print STAT "ATtoCG: $ATtoCG\n"; print STAT "CGtoAT: $CGtoAT\n\n"; #------------------------------------------------------------------------------- ## distribution of BarCode region ---------------------------------------- my %bc_seq = (); my %bc_dontexist = (); foreach my $s (keys %seq) { my $bc = substr($seq{$s}, $bcS - 1, 3); if (defined $barcode{$bc}) {$bc_seq{$bc}++} else {$bc_dontexist{$bc}++} } print STAT "BarCode sequences distribution:\n"; foreach my $bc (sort keys %bc_seq) { print STAT "$barcode{$bc}\t$bc\t$bc_seq{$bc}\n" } print STAT "mutant BarCode sequences distribution:\n"; foreach my $bc (sort keys %bc_dontexist) { print STAT "$bc\t$bc_dontexist{$bc}\n" } print "fininshed BarCode edge\n"; %bc_seq = (); %bc_dontexist = (); #------------------------------------------------------------------------------- ## distribution for BarCode region, considering total match in junk edges -- %bc_seq = (); %bc_dontexist = (); foreach my $s (keys %seq) { my $bc = substr($seq{$s}, $bcS - 1, 3); my $jel = substr($seq{$s}, $je_lS - 1, $je_lL); my $jer = substr($seq{$s}, $je_rS - 1, $je_rL); # for solexa only if (($jel eq $jel_true) and ($jer eq $jer_true)) { if (defined $barcode{$bc}) {$bc_seq{$bc}++} else {$bc_dontexist{$bc}++} } } print STAT "\nChecking BarCode edge considering total match in junk edge\n"; print STAT "BarCode sequences distribution:\n"; foreach my $bc (sort keys %bc_seq) { print STAT "$barcode{$bc}\t$bc\t$bc_seq{$bc}\n" } print STAT "mutant BarCode sequences distribution:\n"; foreach my $bc (sort keys %bc_dontexist) { print STAT "$bc\t$bc_dontexist{$bc}\n" } %bc_seq = (); %bc_dontexist = (); #------------------------------------------------------------------------------- ## distribution of random region for each BarCode: SEQUENCES ------------ my %random_distr = (); %bc_seq = (); my %foo_distr = (); foreach my $s (keys %seq) { my $bc = substr($seq{$s}, $bcS - 1, 3); my $rand = substr($seq{$s}, $randS - 1, $randL); if (defined $barcode{$bc}) { $random_distr{$bc}{$rand}++; $bc_seq{$bc}++; } } foreach my $bc (sort keys %random_distr) { my $bc_num = $bc_seq{$bc}; foreach my $key (keys %{$random_distr{$bc}}) { my $foo = $random_distr{$bc}{$key}; $random_distr{$bc}{$key} = $random_distr{$bc}{$key} * 100 / $bc_num; $random_distr{$bc}{$key} = sprintf("%.4f", $random_distr{$bc}{$key}); if ($foo <= 10) {$foo_distr{$bc}{$foo}++} # save groups with 10 or less seqs per group } foreach my $key (keys %{$foo_distr{$bc}}) { $foo_distr{$bc}{$key} = $foo_distr{$bc}{$key} * $key * 100 / $bc_num; $foo_distr{$bc}{$key} = sprintf("%.4f", $foo_distr{$bc}{$key}); } } print STAT "\nsequence distribution of random region for BarCodes\n"; print STAT "printed: 10 most frequent seq for each BarCode; in per cent\n"; foreach my $bc (sort keys %random_distr) { my $bc_num = $bc_seq{$bc}; print STAT "BarCode: $bc\ttotal: $bc_num\n"; print STAT ${Az::printHashColumn($random_distr{$bc}, "value", 10)}; } print STAT "\nfor barCode and similar sequences: % of that sequences\n"; print STAT "\t\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\ttotal\n"; foreach my $bc (sort keys %foo_distr) { my $bc_num = $bc_seq{$bc}; print STAT "$barcode{$bc}\t$bc\t"; for (my $i = 1; $i <= 10; $i++) { print STAT $foo_distr{$bc}{$i}, "\t"; } print STAT "$bc_num\n"; } print "fininshed random distr for each BarCodes\n"; %foo_distr = (); %random_distr = (); %bc_seq = (); #------------------------------------------------------------------------------- ## distribution of random region for all sequence simultaneously -------- %random_distr = (); my %bc_for_rand = (); foreach my $s (keys %seq) { my $bc = substr($seq{$s}, $bcS - 1, 3); my $rand = substr($seq{$s}, $randS - 1, $randL); if (defined $barcode{$bc}) { $random_distr{$rand}++ } } my $i = 0; foreach my $key (sort {$random_distr{$b} <=> $random_distr{$a}} keys %random_distr) { if ($i >= 100) {last} foreach my $s (keys %seq) { my $rand = substr($seq{$s}, $randS - 1, $randL); if ($key eq $rand) { my $bc = substr($seq{$s}, $bcS - 1, 3); $bc_for_rand{$key}{$bc} = 2; } } $i++; } print STAT "\nsequence distribution of random region for all sequences simultaneously\n"; $i = 0; foreach my $key (sort {$random_distr{$b} <=> $random_distr{$a}} keys %random_distr) { if ($i >= 100) {last} print STAT "$key\t", $random_distr{$key}, "\t"; foreach my $bc (sort keys %{$bc_for_rand{$key}}) { print STAT "$bc\t"; } $i++; print STAT "\n"; } print "fininshed random distr for all sequences\n"; %random_distr = (); %bc_for_rand = (); #------------------------------------------------------------------------------- ## distribution of nucleotides in random region for each position for BarCodes my %random_nuc_distr = (); %bc_seq = (); foreach my $s (keys %seq) { my $bc = substr($seq{$s}, $bcS - 1, 3); if (!defined $barcode{$bc}) {next} $bc_seq{$bc}++; my $rand = substr($seq{$s}, $randS - 1, $randL); my @nuc = split(//, $rand); for (my $i = 0; $i < @nuc; $i++) { $random_nuc_distr{$bc}[$i]{$nuc[$i]}++; } } print STAT "\ndistribution of nucleotides in random region for each position for BarCodes\n"; foreach my $bc (sort keys %random_nuc_distr) { my $bc_num = $bc_seq{$bc}; print STAT "BarCode: $bc\t$barcode{$bc}\ttotal $bc_num\n"; foreach my $nuc (@nucleotides) { print STAT "$nuc\t"; for (my $i = 0; $i < @{$random_nuc_distr{$bc}}; $i++) { $random_nuc_distr{$bc}[$i]{$nuc} = $random_nuc_distr{$bc}[$i]{$nuc} * 100 / $bc_num; $random_nuc_distr{$bc}[$i]{$nuc} = sprintf("%.1f", $random_nuc_distr{$bc}[$i]{$nuc}); print STAT $random_nuc_distr{$bc}[$i]{$nuc}, "\t"; } print STAT "\n"; } } ## checking total distribution of nucleotides in random region for BarCodes print STAT "\ntotal distribution of nucleotides in random region for BarCodes\n"; print STAT "\t\tA\tC\tG\tT\ttotal\n"; my $total_seq = 0; my %total_distr = (); foreach my $bc (sort keys %random_nuc_distr) { my %temp_nuc_distr = (); my $posN = @{$random_nuc_distr{$bc}}; foreach my $nuc (@nucleotides) { for (my $i = 0; $i < @{$random_nuc_distr{$bc}}; $i++) { $temp_nuc_distr{$nuc} += $random_nuc_distr{$bc}[$i]{$nuc}; } $temp_nuc_distr{$nuc} /= $posN; $temp_nuc_distr{$nuc} = sprintf("%.1f", $temp_nuc_distr{$nuc}); $total_distr{$nuc} += $temp_nuc_distr{$nuc} * $bc_seq{$bc} / 100; } $total_seq += $bc_seq{$bc}; print STAT "$bc\t$barcode{$bc}\t"; foreach my $nuc (@nucleotides) { print STAT $temp_nuc_distr{$nuc}, "\t"; } print STAT $bc_seq{$bc}, "\n"; } $total_seq = int($total_seq); print STAT "total\t\t"; foreach my $nuc (sort keys %total_distr) { my $freq = $total_distr{$nuc} * 100 / $total_seq; $freq = sprintf("%.1f", $freq); print STAT "$freq\t"; } print STAT "$total_seq\n"; print "fininshed total distribution of nucleotides\n"; %random_nuc_distr = (); %bc_seq = (); #------------------------------------------------------------------------------- # calculating frequency of dinucleotide and nucl x nucl in random region --- my %nucDistr = (); my $nucDistrTotal = {}; my %nucTimesNuc = (); my %nucTimesNucTotal = (); my %diNuc = (); my $diNucTotal = {}; %bc_seq = (); foreach my $s (keys %seq) { my $bc = substr($seq{$s}, $bcS - 1, 3); if (!defined $barcode{$bc}) {next} $bc_seq{$bc}++; my $rand = substr($seq{$s}, $randS - 1, $randL); my @letters = split(//, $rand); foreach my $bp (@letters) { $nucDistr{$bc}{$bp}++; $nucDistrTotal->{$bp}++; } for (my $i = 0; $i < length($rand) - 1; $i++) { ## dinuc my $dinuc = substr($rand, $i, 2); $diNuc{$bc}{$dinuc}++; $diNucTotal->{$dinuc}++; } } foreach my $bc (keys %bc_seq) { ## for BarCodes $diNuc{$bc} = Az::percent($diNuc{$bc}, 0); ## dinuc foreach my $key (keys %{$diNuc{$bc}}) { $diNuc{$bc}{$key} = sprintf("%.3f", $diNuc{$bc}{$key}); } $nucDistr{$bc} = Az::percent($nucDistr{$bc}, 0); ## nuc*nuc foreach my $key (sort keys %{$nucDistr{$bc}}) { foreach my $key2 (sort keys %{$nucDistr{$bc}}) { my $dinuc = "$key"."$key2"; $nucTimesNuc{$bc}{$dinuc} = $nucDistr{$bc}{$key}*$nucDistr{$bc}{$key2}; $nucTimesNuc{$bc}{$dinuc} = sprintf("%.3f", $nucTimesNuc{$bc}{$dinuc}); } } } $diNucTotal = Az::percent($diNucTotal, 0); ## dinuc Total foreach my $key (keys %$diNucTotal) { $diNucTotal->{$key} = sprintf("%.3f", $diNucTotal->{$key}); } $nucDistrTotal = Az::percent($nucDistrTotal, 0); ## nuc Total foreach my $key (sort keys %$nucDistrTotal) { foreach my $key2 (sort keys %$nucDistrTotal) { my $dinuc = "$key"."$key2"; $nucTimesNucTotal{$dinuc} = $nucDistrTotal->{$key}*$nucDistrTotal->{$key2}; $nucTimesNucTotal{$dinuc} = sprintf("%.3f", $nucTimesNucTotal{$dinuc}); } } print STAT "\nfrequency distribution of dinuc and nucTIMESnuc in random region\n"; foreach my $dinuc (sort keys %nucTimesNucTotal) { print STAT "\t\t$dinuc"; } foreach my $bc (sort keys %bc_seq) { ## printing by BarCodes print STAT "\n$barcode{$bc}", "_", "dinuc\t","$bc", "_", "dinuc\t"; foreach my $dinuc (sort keys %nucTimesNucTotal) { print STAT $diNuc{$bc}{$dinuc}, "\t"; } print STAT "\n$barcode{$bc}", "_", "nucXnuc\t","$bc", "_", "nucXnuc\t"; foreach my $dinuc (sort keys %nucTimesNucTotal) { print STAT $nucTimesNuc{$bc}{$dinuc}, "\t"; } } print STAT "\ntotal", "_", "dinuc\t"; ## printing total foreach my $dinuc (sort keys %nucTimesNucTotal) { print STAT $diNucTotal->{$dinuc}, "\t"; } print STAT "\ntotal", "_", "nucXnuc\t"; foreach my $dinuc (sort keys %nucTimesNucTotal) { print STAT $nucTimesNucTotal{$dinuc}, "\t"; } print STAT "\n"; print "fininshed frequency distribution of dinucleotides\n"; %nucDistr = (); $nucDistrTotal = {}; %nucTimesNuc = (); %nucTimesNucTotal = (); %diNuc = (); $diNucTotal = {}; %bc_seq = (); #------------------------------------------------------------------------------- ## calculating entropy in random region -------------------------------- my %entropy = (); my %range = qw(0 0 0.125 0 0.25 0 0.375 0 0.5 0 0.625 0 0.75 0 0.875 0 1 0 1.125 0 1.25 0 1.375 0 1.5 0 1.625 0 1.75 0 1.875 0 2 0); my %entropy_distr = (); %bc_seq = (); foreach my $bc (keys %barcode) { my %foo = %range; $entropy_distr{$bc} = \%foo; } $entropy_distr{"_total"} = \%range; foreach my $s (keys %seq) { my $bc = substr($seq{$s}, $bcS - 1, 3); if (!defined $barcode{$bc}) {next} $bc_seq{$bc}++; $bc_seq{"_total"}++; my $rand = substr($seq{$s}, $randS - 1, $randL); my @letters = split(//, $rand); my %tempHash = (); foreach my $nuc (@letters) { $tempHash{$nuc}++; } my $foo_e = 0; foreach my $nuc (keys %tempHash) { my $p = $tempHash{$nuc}/$randL; $foo_e += ($p)*(log($p)/log(2)); } push @{$entropy{$bc}}, - $foo_e; } foreach my $bc (keys %entropy) { foreach my $e (@{$entropy{$bc}}) { ### binary search ## my $mid = 1; my $min = 0; while ($mid - $min >= 0.125) { if ($e < $mid) { $mid = $mid - ($mid-$min)/2 } elsif ($e > $mid) { my $old_min = $min; $min = $mid; $mid = $mid + ($mid-$old_min)/2; } else { $min = $mid; last; } } $entropy_distr{$bc}{$min}++; $entropy_distr{"_total"}{$min}++; ##################### } } print STAT "\nentropy distribution for random region\n"; foreach my $bc (sort keys %barcode) {print STAT "\t$barcode{$bc}"} print STAT "\n"; print STAT ${Az::printDoubleHashLine(\%entropy_distr)}; ## convert to percent and print## foreach my $bc (keys %entropy_distr) { foreach my $level (keys %{$entropy_distr{$bc}}) { $entropy_distr{$bc}{$level} *= 100 / $bc_seq{$bc}; $entropy_distr{$bc}{$level} = sprintf("%.3f", $entropy_distr{$bc}{$level}); } } print STAT "\nentropy distribution for random region PERCENT\n"; foreach my $bc (sort keys %barcode) {print STAT "\t$barcode{$bc}"} print STAT "\n"; print STAT ${Az::printDoubleHashLine(\%entropy_distr)}; print "fininshed entropy\n"; %entropy = (); %entropy_distr = (); #------------------------------------------------------------------------------- close STAT; STEP2: print "STEP2\n"; ################################################################################ ############### Subroutins ############### ################################################################################