# FAlite - minimal FASTA reader.
#
# FAlite wraps an already-open filehandle and hands back one FAlite::Entry
# per FASTA record.  The reader keeps the most recently seen definition
# line in $this->{LASTLINE}; an undefined LASTLINE means "nothing left".
package FAlite;
use strict;
use warnings;
use overload ();                        # only for overload::Method()
use Scalar::Util qw(blessed reftype);

# _is_filehandle($thing) -> 1 or 0
# True for anything readline can reasonably be applied to: a glob reference
# (blessed or not, e.g. \*STDIN, a lexical handle, IO::File), a raw
# typeglob (*STDIN), an IO reference, or an object overloading '<>'.
sub _is_filehandle {
    my ($fh) = @_;
    return 0 if not defined $fh;
    return 1 if ref(\$fh) eq 'GLOB';    # raw typeglob passed by value

    my $type = reftype($fh);
    return 0 if not defined $type;      # plain string or number
    return 1 if $type eq 'GLOB' or $type eq 'IO';
    return 1 if blessed($fh) and overload::Method($fh, '<>');
    return 0;
}

# _strip_eol($line) -> $line without its trailing "\n" or "\r\n".
sub _strip_eol {
    my ($line) = @_;
    $line =~ s/\r?\n\z//;
    return $line;
}

# new($class, $fh)
# Builds a reader on top of $fh and consumes input up to and including the
# first non-blank line.  Dies if $fh is not a filehandle.  If the input is
# empty or its first non-blank line does not start with '>', a warning is
# issued and the returned object yields no entries.
sub new {
    my ($class, $fh) = @_;

    # The original test, "ref $fh !~ /GLOB/", parsed as ref($fh !~ /GLOB/)
    # and therefore could never be true; check the handle explicitly.
    if (not _is_filehandle($fh)) {
        die ref($fh), "\n", "FAlite ERROR: expect a GLOB reference\n";
    }

    # Two-argument bless so that subclasses get objects of their own class.
    my $this = bless {}, (ref($class) || $class || __PACKAGE__);
    $this->{FH} = $fh;

    # Skip leading blank lines (not supposed to be there, but...).  A
    # lexical is used so the caller's $_ is left untouched.
    my $firstline;
    while (defined($firstline = <$fh>)) {
        last if $firstline =~ /\S/;
    }

    if (not defined $firstline) {
        warn "FAlite: Empty\n";
        return $this;
    }
    if ($firstline !~ /^>/) {
        warn "FAlite: Not FASTA formatted\n";
        return $this;
    }

    $this->{LASTLINE} = _strip_eol($firstline);
    return $this;
}

# nextEntry($this)
# Returns the next record as a FAlite::Entry, or 0 once the input is
# exhausted (or was never valid FASTA).  Reads lines up to the next
# definition line, which is remembered for the following call.
sub nextEntry {
    my ($this) = @_;
    return 0 if not defined $this->{LASTLINE};

    my $fh  = $this->{FH};
    my $def = $this->{LASTLINE};
    my @seq;

    # Assume end of input until another definition line turns up.  This
    # also lets a final record with no sequence lines be returned instead
    # of being silently dropped.
    $this->{LASTLINE} = undef;
    while (defined(my $line = <$fh>)) {
        $line = _strip_eol($line);
        if ($line =~ /^>/) {
            $this->{LASTLINE} = $line;
            last;
        }
        push @seq, $line;
    }

    return FAlite::Entry::new($def, \@seq);
}

# FAlite::Entry - one FASTA record: a definition line plus its sequence.
# Stringifying an entry gives the record back as two lines of FASTA text.
package FAlite::Entry;
use overload '""' => 'all';

# new($def, \@seq_lines)
# Historically a plain function; FAlite::Entry->new($def, \@seq_lines) is
# accepted as well.  The sequence lines are concatenated and all
# whitespace is removed.
sub new {
    my $class = @_ > 2 ? shift : __PACKAGE__;
    my ($def, $seqarry) = @_;

    my $this = bless {}, (ref($class) || $class);
    $this->{DEF} = $def;
    $this->{SEQ} = join("", @$seqarry);
    $this->{SEQ} =~ s/\s//g;            # just in case more spaces
    return $this;
}

# Accessors: definition line (with its leading '>') and bare sequence.
sub def { return $_[0]->{DEF} }
sub seq { return $_[0]->{SEQ} }

# all($entry) -> "DEF\nSEQ\n"; also used for string overloading.
sub all {
    my $e = shift;
    return $e->{DEF} . "\n" . $e->{SEQ} . "\n";
}

1;

=head1 NAME

FAlite - a package for parsing FASTA files and databases

=head1 SYNOPSIS

 use FAlite;
 my $fasta = FAlite->new(\*STDIN);
 while (my $entry = $fasta->nextEntry) {
     $entry->def;
     $entry->seq;
 }

=head1 DESCRIPTION

FAlite is a package for parsing FASTA files and databases. The FASTA format is
widely used in bioinformatics. It consists of a definition line followed by
sequence with an arbitrary number of lines and line lengths.

A FASTA file looks like this:

 >identifier descriptive text
 GAATTC

A FASTA database looks like this:

 >identifier1 some text describing this entry
 GAATTC
 ACTAGT
 >identifier2 some text describing this entry
 AAACCT
 GCTAAT

=head2 Object

FAlite has two kinds of objects, the file and the entry.

 my $fasta_file = FAlite->new(\*STDIN); # or any other filehandle
 $entry = $fasta_file->nextEntry;       # single fasta file
 while (my $entry = $fasta_file->nextEntry) {
     # canonical form of use for fasta database
 }

nextEntry returns 0 when there are no more entries.

The entry has two attributes (def and seq).

 $entry->def; # access the def line
 $entry->seq; # access the sequence
 "$entry";    # overload to fasta file ($entry->def . "\n" . $entry->seq . "\n")

=head1 AUTHOR

Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf)

=head1 ACKNOWLEDGEMENTS

This software was developed at the Genome Sequencing Center at Washington
University, St. Louis, MO.

=head1 COPYRIGHT

Copyright (C) 1999 Ian Korf. All Rights Reserved.

=head1 DISCLAIMER

This software is provided "as is" without warranty of any kind.

=cut