#!/usr/bin/perl =head1 NAME seqMapper.pl =head1 DESCRIPTION seqMapper - program locates one-dimentional TFBS (sequences) within sequence set using sequence(s) of that TFBS; ------------------------------ =head1 AUTHOR Artem Zykovich (azykovich@ucdavis.edu) =head1 COPYRIGHT Copyright (C) 2008 Artem Zykovich. All Rights Reserved. =head1 DISCLAIMER This software is provided "as is" without warranty of any kind. =cut use strict; use warnings; use Az; use DataBrowser; use FAlite; use ZFdnaBS; use Getopt::Long; my $SEQ_FILE = ""; # sequences set file my $bcCond = ""; # file with barCodes-condition to use, one per line my $hts = "s"; # type of Hight Throughput Sequinator: 454 - 454; # s - solexa my $background = "e"; # could be e - equal; c - compositional, # that are dependent on nucleotide composition my $searchRegion = "random";# where to search (all - all region (without first two AA); # random - random region) my $pseudo = 1; # add pseudo to number of found sequences my $denovo = ""; # perform count of all possible k-mer my $kmerLength = 9; # length of sequence to count my $enrichTh = 10; # enrichment threshold my $zScoreTh = 3; # z-score threshold; z-score = int((x - mean) / sd) my $relative = "e"; # print true sequences relative by: e - enrichment; 0 - oddsRation my $affinity = ""; # perform affinity my $affSeq_FILE = ""; # file with known one-dimentional TFBS - sequence(s) # one sequence per line die "usage: seqMapper.pl [options] options: -inputS input sequence seq file -bcCond file with barCodes-condition to use, one per line -hts type of Hight Troughput Sequinator: $hts -background background model (equal/nuc composition = e/c): $background -searchRegion where to search (all; random): $searchRegion -pseudo add pseudo to number of found sequences: $pseudo -denovo perform count of all possible k-mer -kmerLength length of sequence to count: $kmerLength -enrichTh enrichment threshold: $enrichTh -zScoreTh z-score threshold: $zScoreTh -relative print true sequences relative by (e - enrichment; o - oddsRation): $relative -affinity perform affinity -affSeq file with known one-dimentional TFBS - sequence(s), one sequence per line " unless @ARGV > 1; GetOptions( "inputS=s" => \$SEQ_FILE, "bcCond=s" => \$bcCond, "hts=s" => \$hts, "background=s" => \$background, "searchRegion=s" => \$searchRegion, "pseudo=i" => \$pseudo, "denovo" => \$denovo, "kmerLength=i" => \$kmerLength, "enrichTh=i" => \$enrichTh, "zScoreTh=i" => \$zScoreTh, "relative=s" => \$relative, "affinity" => \$affinity, "affSeq=s" => \$affSeq_FILE); die "input SEQ_FILE\n" if $SEQ_FILE eq ""; die "input bcCond\n" if $bcCond eq ""; ################################################################################ ####### Parameters and default values ######################################## ################################################################################ my @nucleotides = qw(A C G T); my %barcode = (); ## barCodes from file - file must contain all barCodes and has priority over $bcNum if ($bcCond) { open (BCIN, "$bcCond") or die; while () { my @f = split(/\s/, $_); $barcode{$f[0]} = $f[1]; } } close BCIN; ## determine background (control) BarCodes, that used for abundancy calculation my @backgrBC = (); foreach my $bc (keys %barcode) { if ($barcode{$bc} eq "control") { push @backgrBC, $bc } } ## hts conditions my $bcS; # position within sequence where barCode Starts my $je_l; # real left junk edge my $je_lS; # real left junk edge Starts my $je_lL; # real left junk edge Length my $je_r; # real right junk edge my $je_rS; # real right junk edge Starts my $je_rL; # real right junk edge Length my $primer_l; # sequence of left primer, that were during BIND step my $primer_r; # sequence of right primer, that were during BIND step my $bc_rS; # barCode_random Starts my $bc_rL; # barCode_random Length my $randS; # random region Starts my $randL; # random region Legth if ($hts eq "454") { ## 454 parameters $bcS = 20; $je_l = "GCCTCCCTCGCGCCATCAG"; $je_lS = 1; $je_lL = 19; $bc_rS = 20; $bc_rL = 24; $randS = 23; $randL = 21; } elsif ($hts eq "s") { ## solexa parameters $bcS = 3; $je_l = "AA"; $je_lS = 1; $je_lL = 2; $je_r = "AGATCGGAA"; $primer_l = "ACACTCTTTCCCTACACGACGCTCTTCCGATCTAA"; $primer_r = "AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG"; $je_rS = 27; $je_rL = 9; $bc_rS = 3; $bc_rL = 24; $randS = 6; $randL = 21; } else {die "ERROR: hts $hts incorrect!!!\n"} my @seqFileParts = split(/\./, $SEQ_FILE); my $seqFileName = $seqFileParts[0]; my @bcCondParts = split(/\./, $bcCond); my $bcCondName = $bcCondParts[0]; my @affSeqFileParts = split(/\./, $affSeq_FILE); my $affSeqFilName = $affSeqFileParts[0]; print "program started at:\n"; Az::dateNtime; ######################################## #### Determine background Model #### ######################################## my %backgroundMatrix = ("A" => 0.25, "C" => 0.25, "G" => 0.25, "T" => 0.25); my %nuc_distr = (); if ($background eq "c") {%nuc_distr = BarCoded_nuc_distr($SEQ_FILE)} elsif ($background eq "e") { foreach my $bc (keys %barcode) { $nuc_distr{$bc} = \%backgroundMatrix } } else {die "background model don't equal e or c\n"} print "background model determined at:\n"; Az::dateNtime; ################################################################################ ########### De novo: sequence finding ########### ################################################################################ if (!$denovo) {goto STEP1} print "started denovo\n"; my $s = 0; # sequence number my %bc_seqnum = (); my %kmerStatHalf = (); my %kmerStat = (); open (SEQ, "$SEQ_FILE") or die; while () { my $def = $_; my $seq = ; chomp $seq; my $bc = substr($seq, $bcS - 1, 3); if (!$barcode{$bc}) {next} $bc_seqnum{$bc}++; my $random = substr($seq, $randS - 1, $randL); ## define searchSeq and my $searchSeq = ""; if ($searchRegion eq "all") { $searchSeq = "$primer_l"."$bc"."$random"."$primer_r" } elsif ($searchRegion eq "random") { $searchSeq = $random } else {die "searchRegion $searchRegion not correct\n"} ## count k-mer through the sequence ## for (my $i = 0; $i < length($searchSeq) - $kmerLength + 1; $i++) { my $partSeq = substr($searchSeq, $i, $kmerLength); $kmerStatHalf{$bc}{$partSeq} ++; } $s++; } close SEQ; print "scan through inputS file done at:\n"; Az::dateNtime; ## adding up reverse compliment sequences foreach my $bc (keys %kmerStatHalf) { foreach my $kmer (keys %{$kmerStatHalf{$bc}}) { $kmerStat{$bc}{$kmer} += $kmerStatHalf{$bc}{$kmer}; my $rc = reverse($kmer); $rc =~ tr[ACGT] [TGCA]; $kmerStat{$bc}{$rc} += $kmerStatHalf{$bc}{$kmer}; } } %kmerStatHalf = (); ## add k-mer patterns to barcodes that don't have one compare to other foreach my $bc (keys %kmerStat) { foreach my $restBC (keys %kmerStat) { if ($bc eq $restBC) {next} foreach my $kmer (keys %{$kmerStat{$bc}}) { if (!$kmerStat{$restBC}{$kmer}) {$kmerStat{$restBC}{$kmer} = 0} } } } ## normalisation foreach my $bc (keys %kmerStat) { foreach my $kmer (keys %{$kmerStat{$bc}}) { $kmerStat{$bc}{$kmer} += $pseudo; # adding pseudo # by number of reads $kmerStat{$bc}{$kmer} /= ($bc_seqnum{$bc} * ($randL - $kmerLength + 1) * 2); # by nucleotide composition: Odds ratio my @letters = split(//, $kmer); my $expected = 1; foreach my $nuc (@letters) {$expected *= $nuc_distr{$bc}{$nuc}} $kmerStat{$bc}{$kmer} /= $expected; } } ## calculate kmerStat parameters: mean, sd my %mean = (); my %sd = (); foreach my $bc (keys %kmerStat) { my $n = keys %{$kmerStat{$bc}}; foreach my $kmer (keys %{$kmerStat{$bc}}) { $mean{$bc} += $kmerStat{$bc}{$kmer} } $mean{$bc} /= $n; foreach my $kmer (keys %{$kmerStat{$bc}}) { $sd{$bc} += ($kmerStat{$bc}{$kmer} - $mean{$bc}) ** 2 } $sd{$bc} = ($sd{$bc}{sd} / $n) ** 0.5; } ## calculate avarage presence of kmer for control barCodes my %avgControl = (); foreach my $bc (@backgrBC) { foreach my $kmer (keys %{$kmerStat{$bc}}) { $avgControl{$kmer} += $kmerStat{$bc}{$kmer} } } foreach my $kmer (keys %avgControl) { $avgControl{$kmer} /= @backgrBC } ## enrichment calculation ## my %kmerEnrich = (); foreach my $bc (keys %kmerStat) { foreach my $kmer (keys %{$kmerStat{$bc}}) { $kmerEnrich{$bc}{$kmer} = $kmerStat{$bc}{$kmer} / $avgControl{$kmer} } } ## delete kmers that don't pass thresholds my %true = (); # contain sequences that pass thresholds foreach my $bc (sort keys %kmerEnrich) { foreach my $kmer (keys %{$kmerEnrich{$bc}}) { my $zScore = int(($kmerStat{$bc}{$kmer} - $mean{$bc}) / $sd{$bc}); if (($kmerEnrich{$bc}{$kmer} < $enrichTh) or ($zScore < $zScoreTh)) { delete $kmerEnrich{$bc}{$kmer}; delete $kmerStat{$bc}{$kmer}; next; } push @{$true{$bc}}, $kmer; } } #--------------------------------------------------------------------- # kmer clustering (k-means clustering) #--------------------------------------------------------------------- my @origin = (); # contain sequences that will be used as origin in k-mean clustering # 1) find best match for every pair of sequences foreach my $bc (keys %true) { my $match = []; $match = bestMatch($true{$bc}); # 2) find sequences that not similar to more than 10% of all sequences my $foo = []; my $nonMatchTh = int(length($true{$bc}[0]) / 2); for (my $i = 0; $i < @$match; $i++) { my $nonMatch = 0; foreach my $j (keys %{$match->[$i]}) { if ($match->[$i]{$j} < $nonMatchTh) {$nonMatch++} } if ($nonMatch >= @$match * 0.1) {push @foo, $i} } # 3) find final sequences (origin) by deleting similar $origin[0] = $foo[0]; for (my $i = 1; $i < @foo; $i++) { my $similar = "no"; for (my $j = 0; $j < @origin; $j++) { if ($match->[$origin[$j]]{$foo[$i]} >= $nonMatchTh) {$similar = "yes"; last;} } if ($similar eq "yes") {push @origin, $foo[$i]} } } # renormalizing oddsRation score, so the min will equal 10 if ($relative eq "o") { foreach my $bc (keys %true) { my $min = 10000000000; foreach my $kmer (keys %{$true{$bc}}) { if ($true{$bc}{$kmer} < $min) {$min = $true{$bc}{$kmer}} } foreach my $kmer (keys %{$true{$bc}}) { $true{$bc}{$kmer} = int((10 / $min) * $true{$bc}{$kmer}) } } } # print list of sequences for the following alignment my $OUT = "kmer\/trueSeq-$relative-$kmerLength-mer-$searchRegion-$background--$seqFileName-$bcCondName.sm"; open (OUT, ">$OUT") or die; foreach my $bc (keys %true) { print OUT ">$barcode{$bc}\t$bc\n"; foreach my $kmer (keys %{$true{$bc}}) { for (my $i = 0; $i < $true{$bc}{$kmer}; $i++) { print OUT $true{$bc}{$kmer}, "\n" } } } close OUT; #----------------------- # Printing enrichment, Odds ratio, zScore #----------------------- unless (-d "kmer") {system("mkdir kmer") == 0 or die} my $KMER_OUT = "kmer\/$kmerLength-mer-$searchRegion-$background--$seqFileName-$bcCondName.sm"; open (KMEROUT, ">$KMER_OUT") or die; #print KMEROUT "control barCodes: @backgrBC\n\n"; print KMEROUT "kmer\tEnrichment\tOdds ratio\tzScore\n"; foreach my $bc (sort keys %kmerEnrich) { print KMEROUT ">$barcode{$bc}\t$bc\n"; foreach my $kmer (sort {$kmerEnrich{$bc}{$b} <=> $kmerEnrich{$bc}{$a}} keys %{$kmerEnrich{$bc}}) { my $zScore = int(($kmerStat{$bc}{$kmer} - $mean{$bc}) / $sd{$bc}); my $oddsRatio = sprintf("%.2f", $kmerStat{$bc}{$kmer}); my $enrichment = sprintf("%.2f", $kmerEnrich{$bc}{$kmer}); print KMEROUT "$kmer\t$enrichment\t$oddsRatio\t$zScore\n"; } } close KMEROUT; ################################################################################ ########### Affinity: sequence finding ########### ################################################################################ STEP1: print "STEP1\n"; if (!$affinity) {goto STEP2} print "started affinity\n"; ## open and read affSeq_FILE ## open (AFF, "$affSeq_FILE") or die; my @affSeq = (); my %knownAff = (); while () { chomp $_; my @foo = split(/\t/, $_); push @affSeq, $foo[0]; $knownAff{$foo[0]} = $foo[1]; } close AFF; $s = 0; # sequence number %bc_seqnum = (); my %affStat = (); open (SEQ, "$SEQ_FILE") or die; while () { my $def = $_; my $seq = ; chomp $seq; my $bc = substr($seq, $bcS - 1, 3); if (!$barcode{$bc}) {next} $bc_seqnum{$bc}++; my $random = substr($seq, $randS - 1, $randL); ## define searchSeq and searchSeq_rc my $searchSeq = ""; if ($searchRegion eq "all") { $searchSeq = "$primer_l"."$bc"."$random"."$primer_r" } elsif ($searchRegion eq "random") { $searchSeq = $random } else {die "searchRegion $searchRegion not correct\n"} my $searchSeq_rc = reverse($searchSeq); $searchSeq_rc =~ tr[ACGT] [TGCA]; ## search known sequences within sequence reads ## foreach my $kmer (@affSeq) { if ($searchSeq =~ /$kmer/) {$affStat{$bc}{$kmer}++} elsif ($searchSeq_rc =~ /$kmer/) {$affStat{$bc}{$kmer}++} } $s++; } close SEQ; print "scan through inputS file done at:\n"; Az::dateNtime; ## normalisation of kmer occurance by total number of sequences for a barCode foreach my $bc (keys %affStat) { foreach my $kmer (keys %{$affStat{$bc}}) { $affStat{$bc}{$kmer} += $pseudo; # adding pseudo $affStat{$bc}{$kmer} /= $bc_seqnum{$bc}; } } ## normalisation of kmer occurance by nucleotide composition foreach my $bc (keys %affStat) { foreach my $kmer (keys %{$affStat{$bc}}) { my @letters = split(//, $kmer); my $expected = 1; foreach my $nuc (@letters) { $expected *= $nuc_distr{$bc}{$nuc}; } my $pl = length($kmer); $affStat{$bc}{$kmer} /= ($expected * ($randL - $pl + 1) * 2); } } # calculate avarage presence of kmer for control barCodes %avgControl = (); foreach my $bc (@backgrBC) { foreach my $kmer (keys %{$affStat{$bc}}) { $avgControl{$kmer} += $affStat{$bc}{$kmer} } } foreach my $kmer (keys %avgControl) { $avgControl{$kmer} /= @backgrBC } ## enrichment calculation ## my %affEnrichment = (); foreach my $bc (keys %affStat) { foreach my $kmer (keys %{$affStat{$bc}}) { if (!$avgControl{$kmer}) {$affEnrichment{$bc}{$kmer} = 0; next;} $affEnrichment{$bc}{$kmer} = $affStat{$bc}{$kmer} / $avgControl{$kmer}; } } ## Pearson product-moment correlation coefficient my %coeffCorr = (); if (%knownAff) { foreach my $bc (keys %affEnrichment) { $coeffCorr{$bc} = coefficientCorrelation($affEnrichment{$bc}, \%knownAff) } } ######################## ### Printing results ### ######################## unless (-d "affinity") {system("mkdir affinity") == 0 or die} my $AFF_OUT = "affinity\/affinity-$searchRegion-$background--$seqFileName-$bcCondName-$affSeqFilName.sm"; open (AFFOUT, ">$AFF_OUT") or die; print AFFOUT "control barCodes: @backgrBC\n\n"; ## print enrichment tables ## print AFFOUT "\nEnrichment: kmer in bcCond / kmer in avrCond\n"; # print header # foreach my $bc (sort keys %affEnrichment) { print AFFOUT "\t$bc"; } print AFFOUT "\n"; foreach my $bc (sort keys %affEnrichment) { print AFFOUT "\t$barcode{$bc}"; } print AFFOUT "\n"; # print enrichment table # foreach my $kmer (@affSeq) { print AFFOUT "$kmer"; foreach my $bc (sort keys %affEnrichment) { if (!$affEnrichment{$bc}{$kmer}) {print AFFOUT "\t0"; next;} my $enrichment = sprintf("%.2f", $affEnrichment{$bc}{$kmer}); print AFFOUT "\t$enrichment"; } if ($knownAff{$kmer}) {print AFFOUT "\t", $knownAff{$kmer}} print AFFOUT "\n"; } ## print coefficient correlation print AFFOUT "\nCoefficient Correlation: enrichment vs known Kd\n"; # print header # foreach my $bc (sort keys %coeffCorr) { print AFFOUT "\t$barcode{$bc}"; } print AFFOUT "\n"; # print coefficient correlation table # foreach my $bc (sort keys %coeffCorr) { $coeffCorr{$bc} = sprintf("%.3f", $coeffCorr{$bc}); print AFFOUT "\t", $coeffCorr{$bc}; } print AFFOUT "\n"; ## print number of sequences per barCode print AFFOUT "\n\n\nTotal reads number per barCode\n"; foreach my $bc (sort keys %bc_seqnum) { print AFFOUT "\t$bc"; } print AFFOUT "\n"; foreach my $bc (sort keys %bc_seqnum) { print AFFOUT "\t$barcode{$bc}"; } print AFFOUT "\nreadNum"; foreach my $bc (sort keys %bc_seqnum) { print AFFOUT "\t$bc_seqnum{$bc}"; } print AFFOUT "\n\n"; ## print Odds ration table of kmer presence within barCode reads ## print AFFOUT "\nOdds table of kmer presence within barCode reads\n"; # print header # foreach my $bc (sort keys %affStat) { print AFFOUT "\t$bc"; } print AFFOUT "\n"; foreach my $bc (sort keys %affStat) { print AFFOUT "\t$barcode{$bc}"; } print AFFOUT "\n"; # print Odds table # foreach my $kmer (@affSeq) { print AFFOUT "$kmer"; foreach my $bc (sort keys %affStat) { if (!$affStat{$bc}{$kmer}) {print AFFOUT "\t0"; next;} # my $occurance = log($affStat{$bc}{$kmer})/log(10); my $occurance = $affStat{$bc}{$kmer}; $occurance = sprintf("%.2f", $occurance); print AFFOUT "\t$occurance"; } print AFFOUT "\n"; } close AFFOUT; STEP2: print "STEP2\n"; print "program ended at:\n"; Az::dateNtime; ################################################################################ ######## Subroutines ########### ################################################################################ sub BarCoded_nuc_distr { my ($file) = @_; open (SEQ, "$file") or die; my %nuc_distr = (); # nucleotide distribution for each barcode while () { my $def = $_; my $seq = ; chomp $seq; my $bc = substr($seq, $bcS - 1, 3); if (!$barcode{$bc}) {next} my $random = substr($seq, $randS - 1, $randL); my $searchSeq = ""; if ($searchRegion eq "all") { $searchSeq = "$primer_l"."$bc"."$random"."$primer_r" } elsif ($searchRegion eq "random") { $searchSeq = $random } else {die "searchRegion $searchRegion not correct\n"} my @foo; $nuc_distr{$bc}{A} += @foo = ($searchSeq =~ /A/g); $nuc_distr{$bc}{C} += @foo = ($searchSeq =~ /C/g); $nuc_distr{$bc}{G} += @foo = ($searchSeq =~ /G/g); $nuc_distr{$bc}{T} += @foo = ($searchSeq =~ /T/g); } foreach my $key (keys %nuc_distr) { $nuc_distr{$key} = Az::percent($nuc_distr{$key}, 0); my $mid = ($nuc_distr{$key}{A} + $nuc_distr{$key}{T}) / 2; $nuc_distr{$key}{A} = $mid; $nuc_distr{$key}{T} = $mid; $mid = ($nuc_distr{$key}{C} + $nuc_distr{$key}{G}) / 2; $nuc_distr{$key}{C} = $mid; $nuc_distr{$key}{G} = $mid; } close SEQ; return %nuc_distr; } sub coefficientCorrelation { my ($x, $y) = @_; my $n = 0; my $sx = 0; my $sy = 0; my $sxy = 0; my $sx2 = 0; my $sy2 = 0; foreach my $kmer (keys %$y) { if (!$x->{$kmer}) {next} $n++; $sx += $x->{$kmer}; $sy += $y->{$kmer}; $sxy += $x->{$kmer} * $y->{$kmer}; $sx2 += $x->{$kmer} * $x->{$kmer}; $sy2 += $y->{$kmer} * $y->{$kmer}; } my $r = ($n * $sxy - $sx * $sy) / ((($n * $sx2 - $sx ** 2) ** 0.5) * (($n * $sy2 - $sy ** 2) ** 0.5)); return $r; } sub bestMatch { my ($array) = @_; # each element has a sequence as a value my @match = (); my $length = length($array->[0]); my $shift = $length - int($length / 2); my @letters = (); for (my $i = 0; $i < @array; $i++) { @{$letters[$i]} = split(//, $array->[$i]) } for (my $i = 0; $i < @letters; $i++) { for (my $j = 0; $j < @letters; $j++) { next if ($i == $j); my $max = 0; ## count number of matches for a pair for (my $s = 0; $s <= 2 * $shift; $s++) { my $matchN = 0; for (my $l = 0; $l < $length; $l++) { if (!$letters[$i][$l + $l - $shift]) {next} if ($letters[$i][$l + $l - $shift] eq $letters[$l]) {$matchN++} } if ($matchN > $max) {$max = $matchN} } $match[$i]{$j} = $max; } } return \@match; }