Important Data Objects of Biostrings
XString
for single sequence
DNAString
: for DNARNAString
: for RNAAAString
: for amino acidBString
: for any string
XStringSet
for many sequences
DNAStringSet
`: for DNARNAStringSet
: for RNAAAStringSet
: for amino acidBStringSet
: for any string
QualityScaleXStringSet
for sequences with quality data
QualityScaledDNAStringSet
: for DNAQualityScaledRNAStringSet
: for RNAQualityScaledAAStringSet
: for amino acidQualityScaledBStringSet
: for any string
Sequence Import and Export
Download the following sequences to your current working directory and then import them into R: ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Bacteria/Halobacterium_sp_uid217/AE004437.ffn
dir.create("data", showWarnings = FALSE)
# system("wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Bacteria/Halobacterium_sp_uid217/AE004437.ffn")
download.file("ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_genbank/Bacteria/Halobacterium_sp_uid217/AE004437.ffn", "data/AE004437.ffn")
Import FASTA file with readDNAStringSet
myseq <- readDNAStringSet("data/AE004437.ffn")
myseq[1:3]
## A DNAStringSet instance of length 3
## width seq names
## [1] 1206 ATGACTCGGCGGTCTCGTGTCGGTGCCGGCCTC...GTCGTCGTTGTTCGACGCTGGCGGAACCCATGA gi|12057215|gb|AE...
## [2] 666 ATGAGCATCATCGAACTCGAAGGCGTGGTCAAA...GTCAACCTCGTCGATGGGGTGTTACACACGTGA gi|12057215|gb|AE...
## [3] 1110 ATGGCGTGGCGGAACCTCGGGCGGAACCGCGTG...AACGATCCGCCCGTCGAGGCGCTCGGCGAATGA gi|12057215|gb|AE...
Subset sequences with regular expression on sequence name field
sub <- myseq[grep("99.*", names(myseq))]
length(sub)
## [1] 170
Export subsetted sequences to FASTA file
writeXStringSet(sub, file="./data/AE004437sub.ffn", width=80)
Now inspect exported sequence file AE004437sub.ffn
in a text editor
Working with XString
Containers
The XString
stores the different types of biosequences in dedicated containers
library(Biostrings)
d <- DNAString("GCATAT-TAC")
d
## 10-letter "DNAString" instance
## seq: GCATAT-TAC
d[1:4]
## 4-letter "DNAString" instance
## seq: GCAT
RNA sequences
r <- RNAString("GCAUAU-UAC")
r <- RNAString(d) # Converts d to RNAString object
r
## 10-letter "RNAString" instance
## seq: GCAUAU-UAC
Protein sequences
p <- AAString("HCWYHH")
p
## 6-letter "AAString" instance
## seq: HCWYHH
Any type of character strings
b <- BString("I store any set of characters. Other XString objects store only the IUPAC characters.")
b
## 85-letter "BString" instance
## seq: I store any set of characters. Other XString objects store only the IUPAC characters.
Working with XStringSet
Containers
XStringSet
containers allow to store many biosequences in one object
dset <- DNAStringSet(c("GCATATTAC", "AATCGATCC", "GCATATTAC"))
names(dset) <- c("seq1", "seq2", "seq3") # Assigns names
dset[1:2]
## A DNAStringSet instance of length 2
## width seq names
## [1] 9 GCATATTAC seq1
## [2] 9 AATCGATCC seq2
Important utilities for XStringSet
containers
width(dset) # Returns the length of each sequences
## [1] 9 9 9
d <- dset[[1]] # The [[ subsetting operator returns a single entry as XString object
dset2 <- c(dset, dset) # Appends/concatenates two XStringSet objects
dsetchar <- as.character(dset) # Converts XStringSet to named vector
dsetone <- unlist(dset) # Collapses many sequences to a single one stored in a DNAString container
Sequence subsetting by positions:
DNAStringSet(dset, start=c(1,2,3), end=c(4,8,5))
## A DNAStringSet instance of length 3
## width seq names
## [1] 4 GCAT seq1
## [2] 7 ATCGATC seq2
## [3] 3 ATA seq3
Multiple Alignment Class
The XMultipleAlignment
class stores the different types of multiple sequence alignments:
origMAlign <- readDNAMultipleAlignment(filepath = system.file("extdata",
"msx2_mRNA.aln", package = "Biostrings"), format = "clustal")
origMAlign
## DNAMultipleAlignment with 8 rows and 2343 columns
## aln names
## [1] -----TCCCGTCTCCGCAGCAAAAAAGTTTGAGTCG...TTGTCCAAACTCACAATTAAAAAAAAAAAAAAAAA gi|84452153|ref|N...
## [2] ------------------------------------...----------------------------------- gi|208431713|ref|...
## [3] ------------------------------------...----------------------------------- gi|118601823|ref|...
## [4] ----------------------AAAAGTTGGAGTCT...----------------------------------- gi|114326503|ref|...
## [5] ------------------------------------...----------------------------------- gi|119220589|ref|...
## [6] ------------------------------------...----------------------------------- gi|148540149|ref|...
## [7] --------------CGGCTCCGCAGCGCCTCACTCG...----------------------------------- gi|45383056|ref|N...
## [8] GGGGGAGACTTCAGAAGTTGTTGTCCTCTCCGCTGA...----------------------------------- gi|213515133|ref|...
Basic Sequence Manipulations
Reverse and Complement
randset <- DNAStringSet(rand)
complement(randset[1:2])
## A DNAStringSet instance of length 2
## width seq
## [1] 17 AACCACGGGCGTGTGCA
## [2] 10 ATGAGTGCTG
reverse(randset[1:2])
## A DNAStringSet instance of length 2
## width seq
## [1] 17 TGCACACGCCCGTGGTT
## [2] 10 CAGCACTCAT
reverseComplement(randset[1:2])
## A DNAStringSet instance of length 2
## width seq
## [1] 17 ACGTGTGCGGGCACCAA
## [2] 10 GTCGTGAGTA
Translate DNA into Protein
translate(randset[1:2])
## Warning in .Call2("DNAStringSet_translate", x, skip_code, dna_codes[codon_alphabet], : in 'x[[1]]':
## last 2 bases were ignored
## Warning in .Call2("DNAStringSet_translate", x, skip_code, dna_codes[codon_alphabet], : in 'x[[2]]':
## last base was ignored
## A AAStringSet instance of length 2
## width seq
## [1] 5 LVPAH
## [2] 3 YSR
Pattern Matching
Pattern matching with mismatches
Find pattern matches in reference
myseq1 <- readDNAStringSet("./data/AE004437.ffn")
mypos <- matchPattern("ATGGTG", myseq1[[1]], max.mismatch=1)
Count only the corresponding matches
countPattern("ATGGCT", myseq1[[1]], max.mismatch=1)
## [1] 3
Count matches in many sequences
vcountPattern("ATGGCT", myseq1, max.mismatch=1)[1:20]
## [1] 3 0 5 4 1 2 2 1 4 3 0 0 1 2 0 1 4 0 0 1
Results shown in DNAStringSet object
tmp <- c(DNAStringSet("ATGGTG"), DNAStringSet(mypos))
Return a consensus matrix for query and hits
consensusMatrix(tmp)[1:4,]
## [,1] [,2] [,3] [,4] [,5] [,6]
## A 3 0 0 0 0 0
## C 1 1 0 0 0 0
## G 0 0 4 4 1 4
## T 0 3 0 0 3 0
Find all pattern matches in reference
myvpos <- vmatchPattern("ATGGCT", myseq1, max.mismatch=1)
myvpos # The results are stored as MIndex object.
## MIndex object of length 2058
## $`gi|12057215|gb|AE004437.1|:248-1453 Halobacterium sp. NRC-1, complete genome`
## IRanges of length 3
## start end width
## [1] 1 6 6
## [2] 383 388 6
## [3] 928 933 6
##
## $`gi|12057215|gb|AE004437.1|:1450-2115 Halobacterium sp. NRC-1, complete genome`
## IRanges of length 0
##
## $`gi|12057215|gb|AE004437.1|:2145-3254 Halobacterium sp. NRC-1, complete genome`
## IRanges of length 5
## start end width
## [1] 1 6 6
## [2] 94 99 6
## [3] 221 226 6
## [4] 535 540 6
## [5] 601 606 6
##
## ...
## <2055 more elements>
Views(myseq1[[1]], start(myvpos[[1]]), end(myvpos[[1]])) # Retrieves the result for single entry
## Views on a 1206-letter DNAString subject
## subject: ATGACTCGGCGGTCTCGTGTCGGTGCCGGCCTCGCAGCCATTGT...TTGCGATCGTCGTCGTCGTTGTTCGACGCTGGCGGAACCCATGA
## views:
## start end width
## [1] 1 6 6 [ATGACT]
## [2] 383 388 6 [ATGGCA]
## [3] 928 933 6 [ATGACT]
Return all matches
sapply(seq(along=myseq1), function(x)
as.character(Views(myseq1[[x]], start(myvpos[[x]]), end(myvpos[[x]]))))[1:4]
Pattern matching with regular expression support
myseq <- DNAStringSet(c("ATGCAGACATAGTG", "ATGAACATAGATCC", "GTACAGATCAC"))
myseq[grep("^ATG", myseq, perl=TRUE)] # String searching with regular expression support
## A DNAStringSet instance of length 2
## width seq
## [1] 14 ATGCAGACATAGTG
## [2] 14 ATGAACATAGATCC
pos1 <- regexpr("AT", myseq) # Searches 'myseq' for first match of pattern "AT"
as.numeric(pos1); attributes(pos1)$match.length # Returns position information of matches
## [1] 1 1 7
## [1] 2 2 2
pos2 <- gregexpr("AT", myseq) # Searches 'myseq' for all matches of pattern "AT"
as.numeric(pos2[[1]]); attributes(pos2[[1]])$match.length # Match positions in first sequence
## [1] 1 9
## [1] 2 2
DNAStringSet(gsub("^ATG", "NNN", myseq)) # String substitution with regular expression support
## A DNAStringSet instance of length 3
## width seq
## [1] 14 NNNCAGACATAGTG
## [2] 14 NNNAACATAGATCC
## [3] 11 GTACAGATCAC
PWM Viewing and Searching
library(seqLogo)
pwm <- PWM(DNAStringSet(c("GCT", "GGT", "GCA")))
pwm
## [,1] [,2] [,3]
## A 0.0000000 0.0000000 0.2312611
## C 0.0000000 0.3157205 0.0000000
## G 0.3685591 0.2312611 0.0000000
## T 0.0000000 0.0000000 0.3157205
seqLogo(t(t(pwm) * 1/colSums(pwm)))
Search sequence for PWM matches with score better than min.score
chr <- DNAString("AAAGCTAAAGGTAAAGCAAAA")
matchPWM(pwm, chr, min.score=0.9)
## Views on a 21-letter DNAString subject
## subject: AAAGCTAAAGGTAAAGCAAAA
## views:
## start end width
## [1] 4 6 3 [GCT]
## [2] 10 12 3 [GGT]
## [3] 16 18 3 [GCA]