#!/usr/bin/perl =head1 NAME siteFinder.pl =head1 DESCRIPTION motifMapper - program locates TFBS within sequence using motif of that TFBS; output: 1) statistical file - contain percent distribution of the motif within data set for each threshold; 2) partitial file - contain part of data set that suit for the threshold and the motif ------------------------------ =head1 AUTHOR Artem Zykovich (azykovich@ucdavis.edu) =head1 COPYRIGHT Copyright (C) 2008 Artem Zykovich. All Rights Reserved. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =cut use strict; use warnings; use Az; use DataBrowser; use FAlite; use ZFdnaBS; use Getopt::Long; my $MATRIX_FILE = ""; # input matrix file my $SEQ_FILE = ""; # input sequences file my $bcCond = ""; # file with barCodes-condition to use, one per line my $hts = "s"; # type of Hight Throughput Sequinator: 454 - 454; # s - solexa my $background = "c"; # could be e - equal; c - compositional, # that are dependent on nucleotide composition my $searchRegion = "random";# where to search (all - all region (without first two AA); # random - random region) my $stat = ""; # perform statistics my $statPv = 0.001; # threshold uses to print summary table my $partition = ""; # perform partitioning my $partOpt = "select"; # partition options (select; subtract) my $partPv = 0.0001; # partition threshold my $partBC = ""; # must define a barCode if use -partition option my $partRegion = "motif";# what part of sequence to write in OUT file: # all; random; motif; my $position = ""; # find and print position of motif within sequence die "usage: motifMapper.pl [options] options: -inputM input matrix file -inputS input sequence file -bcCond file with barCodes-condition to use, one per line -hts type of Hight Troughput Sequinator: $hts -background background model (equal/nuc composition = e/c): $background -searchRegion where to search (all; random): $searchRegion -stat perform and print statistics -statPv Pvalue used to print summary table: $statPv -partition print part of sequenses to output file -partOpt partition options (select; subtract): $partOpt -partPv partition Pvalue: $partPv -partBC must define a barCode if use -partition option -partRegion part of sequence to print (all, random, motif): $partRegion if all or random - will print the string where motif have been located (forward or reverse) -position find and print position of motif within sequence " unless @ARGV > 1; GetOptions("inputM=s" => \$MATRIX_FILE, "inputS=s" => \$SEQ_FILE, "bcCond=s" => \$bcCond, "hts=s" => \$hts, "background=s" => \$background, "searchRegion=s" => \$searchRegion, "stat" => \$stat, "statPv=i" => \$statPv, "partition" => \$partition, "partOpt=s" => \$partOpt, "partPv=f" => \$partPv, "partBC=s" => \$partBC, "partRegion=s" => \$partRegion, "position" => \$position); die "input MATRIX_FILE\n" if $MATRIX_FILE eq ""; die "input SEQ_FILE\n" if $SEQ_FILE eq ""; die "input bcCond\n" if $bcCond eq ""; ################################################################################ ####### Parameters and default values ######################################## ################################################################################ my @nucleotides = qw(A C G T); my %barcode = (); ## barCodes from file - file must contain all barCodes and has priority over $bcNum if ($bcCond) { open (BCIN, "$bcCond") or die; while () { my @f = split(/\s/, $_); $barcode{$f[0]} = $f[1]; } } close BCIN; ## determine background (control) BarCodes, that used for abundancy calculation my @backgrBC = (); foreach my $bc (keys %barcode) { if ($barcode{$bc} eq "control") { push @backgrBC, $bc } } ## hts conditions my $bcS; # position within sequence where barCode Starts my $je_l; # real left junk edge my $je_lS; # real left junk edge Starts my $je_lL; # real left junk edge Length my $je_r; # real right junk edge my $je_rS; # real right junk edge Starts my $je_rL; # real right junk edge Length my $primer_l; # sequence of left primer, that were during BIND step my $primer_r; # sequence of right primer, that were during BIND step my $bc_rS; # barCode_random Starts my $bc_rL; # barCode_random Length my $randS; # random region Starts my $randL; # random region Legth if ($hts eq "454") { ## 454 parameters $bcS = 20; $je_l = "GCCTCCCTCGCGCCATCAG"; $je_lS = 1; $je_lL = 19; $bc_rS = 20; $bc_rL = 24; $randS = 23; $randL = 21; } elsif ($hts eq "s") { ## solexa parameters $bcS = 3; $je_l = "AA"; $je_lS = 1; $je_lL = 2; $je_r = "AGATCGGAA"; $primer_l = "ACACTCTTTCCCTACACGACGCTCTTCCGATCTAA"; $primer_r = "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"; $je_rS = 27; $je_rL = 9; $bc_rS = 3; $bc_rL = 24; $randS = 6; $randL = 21; } else {die "ERROR: hts $hts incorrect!!!\n"} my @seqFileParts = split(/\./, $SEQ_FILE); my $seqFileName = $seqFileParts[0]; my @determSeq = (); # seq-site within sequence for which determined # score > Threshold: $determSeq[$m]{$bc}{$th}{$s}; where - # $m - matrix number; $bc - barCode; # $th - threshold value; $s - sequence number my @determSeqTwice = (); # number of sequence for which was determined # seq-site that corespond to matrix at list # twice within same sequence # $determSeqTwice[$m]{$bc}{$th} print "program started at:\n"; Az::dateNtime; ######################################## #### Determine background Model #### ######################################## my %backgroundMatrix = ("A" => 0.25, "C" => 0.25, "G" => 0.25, "T" => 0.25); my %nuc_distr = (); if ($background eq "c") {%nuc_distr = BarCoded_nuc_distr($SEQ_FILE)} ################################################################################ ### Proceeding ZFdnaBS file (input motif file) ### ################################################################################ my $BindingSites = new ZFdnaBS($MATRIX_FILE); my $BarCodedBindingSites = new ZFdnaBS::BarCodedBS; my @entry = $BindingSites->entry; for (my $i = 0; $i < @entry; $i++) { $entry[$i] = substr($entry[$i], 1); my @foo = split(/\s/, $entry[$i]); $entry[$i] = $foo[0]; } foreach my $bc (sort keys %barcode) { for (my $i = 0; $i < $BindingSites->motifsNumber; $i++) { my $motif = $BindingSites->motif($i); # convert to frequences with sum equal 1 $motif->percent(0.001); # convert to log odd if ($background eq "e") { $motif->logOdds(\%backgroundMatrix); } elsif ($background eq "c") { if (!defined $nuc_distr{$bc}) {next} $motif->logOdds($nuc_distr{$bc}); } else {die "background = $background= could equal e or c\n"} # add motif to new class $BarCodedBindingSites->addMotif($bc, $i, $motif->getMatrix); # checking if motif length more then searchRegion length if (21 < $motif->mLength) {die "searchRegion = $searchRegion is shorter than one of the motif length\n"} } } ################################################################################ ### Calculating thresholds for Pvalues ### ################################################################################ my @Pvalue = (); # contain Pvalues for motifs, that used as thresholds print "start Threshold calculating\n"; Az::dateNtime; unless (-d "pValue") {system("mkdir pValue") == 0 or die} if (-e "pValue\/$MATRIX_FILE-pValue.mm") { open (PV, "pValue\/$MATRIX_FILE-pValue.mm") or die; my $i = 0; ; while (my $line = ) { chomp $line; my @foo = split(/\t/, $line); $Pvalue[$i]{0.0001} = $foo[1]; $Pvalue[$i]{0.001} = $foo[2]; $Pvalue[$i]{0.01} = $foo[3]; $i++; } close PV; } else { open (PV, ">pValue\/$MATRIX_FILE-pValue.mm") or die; print PV "\t0.0001\t0.001\t0.01"; for (my $i = 0; $i < $BindingSites->motifsNumber; $i++) { my $motif = $BindingSites->motif($i); $motif->percent(0.001); # convert to frequences with sum equal 1 $motif->logOdds(\%backgroundMatrix); # convert to log odd $Pvalue[$i] = getPvalue($motif->getMatrix); ## printing pValue to the file print PV "\n$entry[$i]"; foreach my $pv (sort keys %{$Pvalue[$i]}) { print PV "\t", $Pvalue[$i]{$pv}; } } close PV; } print "finished Threshold calculating\n"; Az::dateNtime; ################################################################################ ########### Statistics: site finding ########### ################################################################################ if (!$stat) {goto STEP1} print "started statistics\n"; my $s = 0; # sequence number my %bc_seqnum = (); open (SEQ, "$SEQ_FILE") or die; while () { my $def = $_; my $seq = ; chomp $seq; my $bc = substr($seq, $bcS - 1, 3); $bc_seqnum{$bc}++; my $random = substr($seq, $randS - 1, $randL); ## define searchSeq and searchSeq_rc my $searchSeq = ""; if ($searchRegion eq "all") { $searchSeq = "$primer_l"."$bc"."$random"."$primer_r" } elsif ($searchRegion eq "random") { $searchSeq = $random } else {die "searchRegion $searchRegion not correct\n"} my $searchSeq_rc = reverse($searchSeq); $searchSeq_rc =~ tr[ACGT] [TGCA]; my @twoStrands = (); @{$twoStrands[0]} = split(//, $searchSeq); @{$twoStrands[1]} = split(//, $searchSeq_rc); ## search matrices through sequences ## my $m = 0; # motif's number foreach my $motif (@{$BarCodedBindingSites->motifs($bc)}) { my $matrix = $motif->getMatrix; my $mLength = $motif->mLength; my %score = (); my $strand = 1; # strand: 1 - forward; -1 - reverse ## determing score for matrix $m within sequence $s foreach my $seqStrand (@twoStrands) { my $p = $strand; for (my $i = 0; $i < @$seqStrand - $mLength + 1; $i++) { for (my $j = 0; $j < $mLength; $j++) { $score{$p} += $matrix->[$j]{$seqStrand->[$i+$j]}; } $p += $strand; } $strand = -1; } my (@a) = sort {$score{$b} <=> $score{$a}} keys %score; ## checking score for threshoulds foreach my $Pv (keys %{$Pvalue[$m]}) { my $th = $Pvalue[$m]{$Pv}; if ($score{$a[0]} >= $th) { # check point $determSeq[$m]{$bc}{$Pv}++; } if ($score{$a[1]} >= $th) { $determSeqTwice[$m]{$bc}{$Pv}++; } } $m++; } $s++; } close SEQ; ######################## ### Printing results ### ######################## unless (-d "stat") {system("mkdir stat") == 0 or die} my $STAT_FILE = "stat\/stat-$background-$searchRegion--$MATRIX_FILE--$seqFileName.mm"; open (STAT, ">$STAT_FILE") or die; print STAT "seq_file: $SEQ_FILE\t\tsequences: $s\n"; print STAT "matrix_file: $MATRIX_FILE\n"; print STAT "background: $background\t\tsearchRegion: $searchRegion\tdata in per-cent\n"; # print abundance tables (one table for one Pvalue) my @backgrAbundance = (); for (my $m = 0; $m < @determSeq; $m++) { foreach my $pv (sort keys %{$Pvalue[$m]}) { foreach my $bc (@backgrBC) { my $seqNum = $determSeq[$m]{$bc}{$pv}; my $percent = $seqNum*100/$bc_seqnum{$bc}; $backgrAbundance[$m]{$pv} += $percent; } $backgrAbundance[$m]{$pv} /= @backgrBC; } } print STAT "\nAbundance ratio case/control table\n"; print STAT "\ncontrol barCodes: @backgrBC\n"; foreach my $pv (sort keys %{$Pvalue[0]}) { print STAT "\nPvalue $pv\n"; print STAT "bc\tcondition"; for (my $m = 0; $m < @determSeq; $m++) { print STAT "\t$entry[$m]"; } print STAT "\n"; foreach my $bc (sort keys %{$determSeq[0]}) { print STAT "$bc\t$barcode{$bc}"; for (my $m = 0; $m < @determSeq; $m++) { my $seqNum = $determSeq[$m]{$bc}{$pv}; my $percent = $seqNum*100/$bc_seqnum{$bc}; my $ratio = $percent / $backgrAbundance[$m]{$pv}; $ratio = sprintf("%.3f", $ratio); print STAT "\t$ratio"; } print STAT "\n"; } } # print summary table for particular threshold for all matrices print STAT "\n\nSummary table (percent)\n"; print STAT "Pvalue: $statPv\n"; print STAT "total\tbc\tcondition"; for (my $m = 0; $m < @determSeq; $m++) { print STAT "\t$entry[$m]"; } print STAT "\n"; foreach my $bc (sort keys %{$determSeq[0]}) { print STAT $bc_seqnum{$bc},"\t$bc\t$barcode{$bc}"; for (my $m = 0; $m < @determSeq; $m++) { my $seqNum = $determSeq[$m]{$bc}{$statPv}; my $percent = $seqNum*100/$bc_seqnum{$bc}; $percent = sprintf("%.3f", $percent); print STAT "\t$percent"; } print STAT "\n"; } # print exhaustive statistics for (my $m = 0; $m < @determSeq; $m++) { print STAT "\n\nmatrix: $entry[$m]\n"; print STAT "number of seq in which Site determined AT LIST once\n"; print STAT "bc\tcondition\ttotal\tPv0.0001\tPv0.001\tPv0.01\n"; foreach my $bc (sort keys %{$determSeq[$m]}) { print STAT "$bc\t$barcode{$bc}\t", $bc_seqnum{$bc},"\t"; foreach my $pv (sort keys %{$determSeq[$m]{$bc}}) { my $seqNum = $determSeq[$m]{$bc}{$pv}; my $percent = $seqNum*100/$bc_seqnum{$bc}; $percent = sprintf("%.3f", $percent); print STAT "$percent\t"; } print STAT "\n"; } print STAT "number of seq in which Site determined MORE THAN once\n"; print STAT "bc\tcondition\ttotal\tPv0.0001\tPv0.001\tPv0.01\n"; foreach my $bc (sort keys %{$determSeq[$m]}) { if ($determSeqTwice[$m]{$bc}) { print STAT "$bc\t$barcode{$bc}\t", $bc_seqnum{$bc},"\t"; foreach my $pv (sort keys %{$determSeq[$m]{$bc}}) { my $seqNum = $determSeqTwice[$m]{$bc}{$pv}; my $percent = $seqNum*100/$bc_seqnum{$bc}; $percent = sprintf("%.3f", $percent); print STAT "$percent\t"; } print STAT "\n"; } } } close STAT; ################################################################################ ######## Partitioning ########### ################################################################################ STEP1: print "STEP1\n"; if (!$partition) {goto STEP2} print "started partition\n"; if ($partBC eq "") {die "you must define partBC if want to use -partition option\n"} unless (-d "partition") {system("mkdir partition") == 0 or die} my $partFile = "partition\/part-$background-$searchRegion--$partOpt-$partPv-$partBC-$partRegion--$MATRIX_FILE-$SEQ_FILE.mm"; #if (-e "$partFile") {die "$partFile already exist\n"} open (PART, ">$partFile") or die; #my @bestMatch = (); # distribuition of best matches [matrix][seqN] = best match (logOdds) #my @bestMatchHist = (); # histogram of distribuition of best matches my %hashToPrint = (); open (SEQ, "$SEQ_FILE") or die; while () { my $def = $_; chomp $def; my $seq = ; chomp $seq; my $bc = substr($seq, $bcS - 1, 3); if ($partBC ne $bc) {next} $bc_seqnum{$bc}++; my $random = substr($seq, $randS - 1, $randL); ## define searchSeq and searchSeq_rc my $searchSeq = ""; if ($searchRegion eq "all") { $searchSeq = "$primer_l"."$bc"."$random"."$primer_r" } elsif ($searchRegion eq "random") { $searchSeq = $random } else {die "searchRegion $searchRegion not correct\n"} my $searchSeq_rc = reverse($searchSeq); $searchSeq_rc =~ tr[ACGT] [TGCA]; my @twoStrands = (); @{$twoStrands[0]} = split(//, $searchSeq); @{$twoStrands[1]} = split(//, $searchSeq_rc); ## search matrices through sequences ## my $m = 0; # motif's number foreach my $motif (@{$BarCodedBindingSites->motifs($bc)}) { my $matrix = $motif->getMatrix; my $mLength = $motif->mLength; my %score = (); my $strand = 1; # strand: 1 - forward; -1 - reverse ## determing score for matrix $m within sequence $s foreach my $seqStrand (@twoStrands) { my $p = $strand; for (my $i = 0; $i < @$seqStrand - $mLength + 1; $i++) { for (my $j = 0; $j < $mLength; $j++) { $score{$p} += $matrix->[$j]{$seqStrand->[$i+$j]}; } $p += $strand; } $strand = -1; } my (@a) = sort {$score{$b} <=> $score{$a}} keys %score; # push @{$bestMatch[$m]}, $score{$a[0]}; ## checking score for threshoulds my $th = $Pvalue[$m]{$partPv}; my $present = "no"; if ($score{$a[0]} >= $th) { # check point $present = "yes"; } # determine what sequence to print my $seqToPrint = ""; if ($partRegion eq "all") { if ($a[0] > 0) {$seqToPrint = $seq} else {$seqToPrint = reverse($seq); $seqToPrint =~ tr[ACGT] [TGCA]; } } elsif ($partRegion eq "random") { if ($a[0] > 0) {$seqToPrint = $random} else {$seqToPrint = reverse($random); $seqToPrint =~ tr[ACGT] [TGCA]; } } elsif ($partRegion eq "motif") { my $start = abs($a[0]) - 1; if ($a[0] > 0) {$seqToPrint = substr($searchSeq, $start, $mLength)} else {$seqToPrint = substr($searchSeq_rc, $start, $mLength)} } else {die "wrong partRegion\n"} ## Print partition - selec ## if (($partOpt eq "select") and ($present eq "yes")) { $hashToPrint{$def} = $seqToPrint } ## Print partition - substract ## if (($partOpt eq "subtract") and ($present eq "no")) { $hashToPrint{$def} = $seqToPrint } $m++; } } close SEQ; ## Printing partition foreach my $key (keys %hashToPrint) { print PART "$key\n",$hashToPrint{$key},"\n" } close PART; # open (BEST, ">bestMatchHisto--$background-$searchRegion--$partOpt-$partPv-$partBC-$partRegion--$MATRIX_FILE-$SEQ_FILE.mm") or die; # for (my $i = 0; $i < @bestMatch; $i++) { # $bestMatchHist[$i] = Az::histogram($bestMatch[$i]); # print BEST $entry[$i], "\n"; # foreach my $key (sort {$a <=> $b} keys %{$bestMatchHist[$i]}) { # my $keyS = sprintf("%.1f", $key); # print BEST "$keyS\t", $bestMatchHist[$i]{$key}, "\n"; # } # } # close BEST; ################################################################################ ######## Position ########### ################################################################################ STEP2: print "STEP2\n"; if (!$position) {goto STEP3} print "started position\n"; my %motifPosition = (); # structure: {$bc}[motif]{pValue}{position} = quantity open (SEQ, "$SEQ_FILE") or die; while () { my $def = $_; my $seq = ; chomp $seq; my $bc = substr($seq, $bcS - 1, 3); $bc_seqnum{$bc}++; my $random = substr($seq, $randS - 1, $randL); ## define searchSeq and searchSeq_rc my $searchSeq = ""; if ($searchRegion eq "all") { $searchSeq = "$primer_l"."$bc"."$random"."$primer_r" } elsif ($searchRegion eq "random") { $searchSeq = $random } else {die "searchRegion $searchRegion not correct\n"} my $searchSeq_rc = reverse($searchSeq); $searchSeq_rc =~ tr[ACGT] [TGCA]; my @twoStrands = (); @{$twoStrands[0]} = split(//, $searchSeq); @{$twoStrands[1]} = split(//, $searchSeq_rc); ## search matrices through sequences ## my $m = 0; # motif's number foreach my $motif (@{$BarCodedBindingSites->motifs($bc)}) { my $matrix = $motif->getMatrix; my $mLength = $motif->mLength; my %score = (); my $strand = 1; # strand: 1 - forward; -1 - reverse ## determing score for matrix $m within sequence $s foreach my $seqStrand (@twoStrands) { my $p = $strand; for (my $i = 0; $i < @$seqStrand - $mLength + 1; $i++) { for (my $j = 0; $j < $mLength; $j++) { $score{$p} += $matrix->[$j]{$seqStrand->[$i+$j]}; } $p += $strand; } $strand = -1; } my (@a) = sort {$score{$b} <=> $score{$a}} keys %score; ## checking score for threshoulds foreach my $Pv (keys %{$Pvalue[$m]}) { my $th = $Pvalue[$m]{$Pv}; if ($score{$a[0]} >= $th) { # check point $motifPosition{$bc}[$m]{$Pv}{$a[0]}++; } } $m++; } $s++; } close SEQ; ########################## ## print motifPosition ## ########################## unless (-d "position") {system("mkdir position") == 0 or die} my $motPosFile = "position\/motifPos-$background-$searchRegion--$MATRIX_FILE-$SEQ_FILE.mm"; open (MOTPOS, ">$motPosFile") or die; #------ Printing Summary table-------# print MOTPOS "Summary table: ratio (number of motif on forward strand)/(number of motif on reverse strand)\n\n"; foreach my $pv (sort keys %{$Pvalue[0]}) { my @ratio = (); # [matrix]{$bc} = ratio print MOTPOS "\npValue\t$pv\n"; foreach my $bc (sort keys %motifPosition) { print MOTPOS "\t$barcode{$bc}"; for (my $m = 0; $m < @{$motifPosition{$bc}}; $m++) { my $forward = 0; my $reverse = 0; foreach my $pos (keys %{$motifPosition{$bc}[$m]{$pv}}) { if ($pos > 0) {$forward += $motifPosition{$bc}[$m]{$pv}{$pos}} else {$reverse += $motifPosition{$bc}[$m]{$pv}{$pos}} } if (($reverse != 0) and ($forward != 0)) { $ratio[$m]{$bc} = $forward/$reverse } elsif ($reverse == 0) {$ratio[$m]{$bc} = $forward} elsif ($forward == 0) {$ratio[$m]{$bc} = 1/$reverse} else {$ratio[$m]{$bc} = 0} $ratio[$m]{$bc} = sprintf("%.2f", $ratio[$m]{$bc}); } } for (my $m = 0; $m < @ratio; $m++) { print MOTPOS "\n", $entry[$m]; foreach my $bc (sort keys %{$ratio[$m]}) { print MOTPOS "\t", $ratio[$m]{$bc}; } } print MOTPOS "\n"; } #------ Printing exhaustive table-------# ## determine possible positions my %possiblePos = (); # {bc}{$pos} = nucleotide foreach my $bc (keys %motifPosition) { if ($searchRegion eq "all") { my $searchSeq = "$primer_l"."$bc"."NNNNNNNNNNNNNNNNNNNNN"."$primer_r"; my $searchSeq_rc = reverse($searchSeq); $searchSeq_rc =~ tr[ACGT] [TGCA]; my @twoStrands = (); @{$twoStrands[0]} = split(//, $searchSeq); @{$twoStrands[1]} = split(//, $searchSeq_rc); my $strand = 1; # strand: 1 - forward; -1 - reverse foreach my $seqStrand (@twoStrands) { my $p = $strand; for (my $i = 0; $i < @$seqStrand - 5; $i++) { $possiblePos{$bc}{$p} = $seqStrand->[$i]; $p += $strand; } $strand = -1; } } elsif ($searchRegion eq "random") { for (my $i = -16; $i < 17; $i++) { if ($i == 0) {next} $possiblePos{$bc}{$i} = "N"; } } } ## print distribution of motif within seq for each barCode print MOTPOS "\n\nDistribution of motif within seq for each barCode\n"; foreach my $pv (sort keys %{$Pvalue[0]}) { foreach my $bc (sort keys %motifPosition) { print MOTPOS "\nbarCode\t$bc\t$barcode{$bc}\t\tpValue\t$pv\n"; ## print header: possible position foreach my $pos (sort {$a <=> $b} keys %{$possiblePos{$bc}}) { print MOTPOS "\t",$possiblePos{$bc}{$pos}; } print MOTPOS "\n"; foreach my $pos (sort {$a <=> $b} keys %{$possiblePos{$bc}}) { print MOTPOS "\t$pos"; } print MOTPOS "\n"; ## print number of sequence for each position for (my $m = 0; $m < @{$motifPosition{$bc}}; $m++) { print MOTPOS $entry[$m]; foreach my $pos (sort {$a <=> $b} keys %{$possiblePos{$bc}}) { if ($motifPosition{$bc}[$m]{$pv}{$pos}) { print MOTPOS "\t",$motifPosition{$bc}[$m]{$pv}{$pos}; } else {print MOTPOS "\t"} } print MOTPOS "\n"; } } } close MOTPOS; #------------------------------------------------------------------------------- STEP3: print "STEP3\n"; print "program finished at:\n"; Az::dateNtime; ################################################################################ ######## Subroutines ########### ################################################################################ sub BarCoded_nuc_distr { my ($file) = @_; open (SEQ, "$file") or die; my %nuc_distr = (); # nucleotide distribution for each barcode while () { my $def = $_; my $seq = ; chomp $seq; my $bc = substr($seq, $bcS - 1, 3); my $random = substr($seq, $randS - 1, $randL); my $searchSeq = ""; if ($searchRegion eq "all") { $searchSeq = "$primer_l"."$bc"."$random"."$primer_r" } elsif ($searchRegion eq "random") { $searchSeq = $random } else {die "searchRegion $searchRegion not correct\n"} my @foo; $nuc_distr{$bc}{A} += @foo = ($searchSeq =~ /A/g); $nuc_distr{$bc}{C} += @foo = ($searchSeq =~ /C/g); $nuc_distr{$bc}{G} += @foo = ($searchSeq =~ /G/g); $nuc_distr{$bc}{T} += @foo = ($searchSeq =~ /T/g); } foreach my $key (keys %nuc_distr) { $nuc_distr{$key} = Az::percent($nuc_distr{$key}, 0); my $mid = ($nuc_distr{$key}{A} + $nuc_distr{$key}{T}) / 2; $nuc_distr{$key}{A} = $mid; $nuc_distr{$key}{T} = $mid; $mid = ($nuc_distr{$key}{C} + $nuc_distr{$key}{G}) / 2; $nuc_distr{$key}{C} = $mid; $nuc_distr{$key}{G} = $mid; } close SEQ; return %nuc_distr; } sub getPvalue { my ($matrix) = @_; my $mLength = @$matrix; my @scoreList = (); open (RSEQ, "random_seq--sL-21--sN-1000000.fa") or die; print "Achtung!!! sequence length is 21 bp\n"; while () { my $def = $_; my $seq = ; chomp $seq; my $rc_seq = reverse($seq); $rc_seq =~ tr[ACGT] [TGCA]; my @twoStrands = (); @{$twoStrands[0]} = split(//, $seq); @{$twoStrands[1]} = split(//, $rc_seq); ## search matrix through sequence ## my %score = (); my $strand = 1; # strand: 1 - forward; -1 - reverse ## determing score for matrix $m within sequence $s foreach my $seqStrand (@twoStrands) { my $p = $strand; for (my $i = 0; $i < 21 - $mLength + 1; $i++) { for (my $j = 0; $j < $mLength; $j++) { $score{$p} += $matrix->[$j]{$seqStrand->[$i+$j]}; } $p += $strand; } $strand = -1; } my (@a) = sort {$score{$b} <=> $score{$a}} keys %score; push @scoreList, $score{$a[0]}; } my (@sorted_scoreList) = sort {$b <=> $a} @scoreList; my %threshold = (); $threshold{0.0001} = $sorted_scoreList[99]; $threshold{0.001} = $sorted_scoreList[999]; $threshold{0.01} = $sorted_scoreList[9999]; return \%threshold; }