Ali Oghabian
2019-04-25
- R and IntEREst version
- Files in the zip
- Annotating u12 type introns of HG38
- ncRNAs with U12-type introns
- U12 annotation comparison
- Reference
Files in the zip
annoFil.rda
R object file. A data frame that includes various annotations and information for the latest ENSEMBL genes (GRCh38.p12).AllU12Introns.tsv
Tab separated text file. Includes the detecetd U12-type introns and their annotations.ens38UncolMod.rda
ENSEMBL reference that includes gene and transcript IDs together with coordinates of the introns and exons.ensHg38UncolFilpaste60AnnoMat_40_don07_bp14.rda
R object file. Data frame with U12/U2 type annotation of the reference (hg38) introns, the U12 GT-AG or AT-AC subtype intfo and the PWM match scores.allIntronsU12annotationGeneType.tsv
Tab separated text file. Includes all annotated U12 type introns.nonProtCodingU12Introns.tsv
Tab separated text file. Includes the detecetd U12 type introns that were annotated as NOT protein-coding.report.Rmd
an RMarkdown file that was used to generate this report.report.html
the HTML file that was generated fromreport.Rmd
.U12LncRNA.tsv
Tab separated text file. Includes the detecetd U12-type introns that were annotated to be lncRNAs.
R and IntEREst version
R.Version()
## $platform
## [1] "x86_64-pc-linux-gnu"
##
## $arch
## [1] "x86_64"
##
## $os
## [1] "linux-gnu"
##
## $system
## [1] "x86_64, linux-gnu"
##
## $status
## [1] "Under development (unstable)"
##
## $major
## [1] "3"
##
## $minor
## [1] "6.0"
##
## $year
## [1] "2018"
##
## $month
## [1] "05"
##
## $day
## [1] "02"
##
## $`svn rev`
## [1] "74682"
##
## $language
## [1] "R"
##
## $version.string
## [1] "R Under development (unstable) (2018-05-02 r74682)"
##
## $nickname
## [1] "Unsuffered Consequences"
packageVersion("IntEREst")
## [1] '1.7.5'
Annotating u12 type introns of HG38
# Time demanding
ens38Uncol<- referencePrepare (sourceBuild="biomaRt",
biomart="ENSEMBL_MART_ENSEMBL",
biomartDataset="hsapiens_gene_ensembl",
biomartHost="www.ensembl.org",
ignore.strand=FALSE,
annotateGeneIds=TRUE,
collapseExons=FALSE)
Next the U12-type introns were annotated using donor socre threoshold 0.07 and branch point score threshold 0.14. If an intron was not annotated as a U12-type or a U2-type we considered it as U2-type (i.e. the default parameter setting
setNaAs= "U2"
was used). # Time demanding
library(BSgenome.Hsapiens.UCSC.hg38)
# Building PWM
pwmPaste<-buildSsTypePwms(u12DonorBegin=13, u2DonorBegin=11,
u2AcceptorBegin=38, u12DonorEnd=17, u2DonorEnd=17,
u2AcceptorEnd=40, pasteSites=TRUE)
save(pwmPaste, file="./pwmPaste.rda")
# Ignore 3' end nucleotide
# Annotations will be based on Donor and BP scores
accPasteScore=as.matrix(pwmPaste[[3]])
for(i in 1:nrow(accPasteScore))accPasteScore[i,1]=0
# Modify the chr names of the reference data.frame
if(length(grep("^chr",ens38Uncol$chr))==0)
ens38Uncol$chr<- paste("chr", ens38Uncol$chr, sep="")
accPasteScore=as.matrix(pwmPaste[[3]])
for(i in 1:nrow(accPasteScore))accPasteScore[i,1]=0
ens38UncolMod<- ens38Uncol[which(ens38Uncol$chr %in%
paste("chr", c(1:22,"X","Y", "MT"), sep="")),]
ens38UncolMod[which(ens38UncolMod$chr == "chrMT"),]<- "chrM"
save(ens38UncolMod, file="./ens38UncolMod.rda")
# Annotate U12-type introns using donor and BP of U12- and Donor and acceptor
# sites of U2-type introns.
ensHg38UncolFilpaste60AnnoMat_40_don07_bp14<- annotateU12(
pwmU12U2= list(
pwmPaste[[1]],
pwmPaste[[2]],
accPasteScore,
pwmPaste[[4]],
pwmPaste[[5]]),
pwmSsIndex= list(
indexDonU12=1,
indexBpU12=1,
indexAccU12=1,
indexDonU2=1,
indexAccU2=1),
referenceChr= ens38UncolMod[,"chr"],
referenceBegin= as.numeric(ens38UncolMod[,"begin"]),
referenceEnd= as.numeric(ens38UncolMod[,"end"]),
referenceIntronExon= ens38UncolMod[,"int_ex"],
intronExon= "intron",
matchWindowRelativeUpstreamPos= c(2,-40,NA,NA,NA),
matchWindowRelativeDownstreamPos= c(6,-3,NA,NA,NA),
minMatchScore= c( .07, .14, 0, .07, 0),
refGenome= BSgenome.Hsapiens.UCSC.hg38,
setNaAs= "U2",
annotateU12Subtype= TRUE,
includeMatchScores= TRUE)
save(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14,
file="./ensHg38UncolFilpaste60AnnoMat_40_don07_bp14.rda")
The number of the detected U12-type introns are as follows:
load(file="./ensHg38UncolFilpaste60AnnoMat_40_don07_bp14.rda")
# Number of U12-type intron
table(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type)
##
## U12 U2
## 3528 1051845
# Number of unique U12-type intron
indU12<- which(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type=="U12")
length(
unique(paste(ens38UncolMod[indU12,"chr"],
ens38UncolMod[indU12,"begin"], ens38UncolMod[indU12,"end"], sep="_")))
## [1] 826
# Number of U12-type intron subtypes
table(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$u12_subtype)
##
## AT-AC GT-AG
## 903 2625
# Number of unique U12-type intron subtypes
uniCoord<- unique(paste(ens38UncolMod[indU12,"chr"],
ens38UncolMod[indU12,"begin"], ens38UncolMod[indU12,"end"], sep="_"))
coord<- paste(ens38UncolMod[indU12,"chr"],
ens38UncolMod[indU12,"begin"], ens38UncolMod[indU12,"end"], sep="_")
indUni<-match(uniCoord, coord)
table(ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$u12_subtype[indU12[indUni]])
##
## AT-AC GT-AG
## 195 631
ncRNAs with U12-type introns
ensembl<- biomaRt::useMart("ENSEMBL_MART_ENSEMBL",
host="www.ensembl.org", dataset="hsapiens_gene_ensembl")
annoFil<- biomaRt::getBM( attributes=c("ensembl_gene_id",
"ensembl_transcript_id", "external_gene_name", "gene_biotype",
"transcript_biotype"),
filters = "ensembl_transcript_id",
values = unique(ens38UncolMod$transcript_id),
mart=ensembl, uniqueRows = TRUE)
save(annoFil, file="./annoFil.rda")
indPick<- which((transcript_biotype=="lincRNA" |
transcript_biotype=="bidirectional_promoter_lncRNA") &
ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type=="U12")
U12Linc<-ens38UncolMod[indPick,]
donPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=U12Linc[U12Linc[,4]=="+",1],
start=
as.numeric(U12Linc[U12Linc[,4]=="+",2]),
end=
as.numeric(U12Linc[U12Linc[,4]=="+",2])+9,
as.character=TRUE)
accNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=U12Linc[U12Linc[,4]=="-",1],
start=
as.numeric(U12Linc[U12Linc[,4]=="-",2]),
end=
as.numeric(U12Linc[U12Linc[,4]=="-",2])+39,
as.character=TRUE)
accPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=U12Linc[U12Linc[,4]=="+",1],
start=as.numeric(U12Linc[U12Linc[,4]=="+",3])-39,
end=as.numeric(U12Linc[U12Linc[,4]=="+",3])
,
as.character=TRUE)
donNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=U12Linc[U12Linc[,4]=="-",1],
start=as.numeric(U12Linc[U12Linc[,4]=="-",3])-9,
end=as.numeric(U12Linc[U12Linc[,4]=="-",3]),
as.character=TRUE)
donNeg<- as.vector(
as.character(Biostrings::reverseComplement(DNAStringSet(donNeg))))
accNeg<- as.vector(
as.character(Biostrings::reverseComplement(DNAStringSet(accNeg))))
accPos<- as.vector(accPos)
donPos<- as.vector(donPos)
donorSeq<- rep("",nrow(U12Linc))
acceptorSeq<- rep("",nrow(U12Linc))
donorSeq[U12Linc[,4]=="+"]<-donPos
donorSeq[U12Linc[,4]=="-"]<-donNeg
acceptorSeq[U12Linc[,4]=="+"]<-accPos
acceptorSeq[U12Linc[,4]=="-"]<-accNeg
U12LincMat<- cbind(U12Linc, donorSeq, acceptorSeq)
# Write lincRNA U12-type introns
write.table(U12LincMat, file="./U12LncRNA.tsv",
col.names=T, row.names=F, sep='\t', quote=F)
# write data frame with info for all introns
write.table(
cbind(ens38UncolMod, external_gene_name,
ensHg38UncolFilpaste60AnnoMat_40_don07_bp14,
transcript_biotype),
file="./allIntronsU12annotationGeneType.tsv",
col.names=T, row.names=F, sep='\t', quote=F)
# Build datat frame with U12 type introns info
u12Out<-
cbind(ens38UncolMod, ensHg38UncolFilpaste60AnnoMat_40_don07_bp14,
external_gene_name,
transcript_biotype)[which(
ensHg38UncolFilpaste60AnnoMat_40_don07_bp14$int_type=="U12"),]
donPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=u12Out[u12Out[,4]=="+",1],
start=
as.numeric(u12Out[u12Out[,4]=="+",2]),
end=
as.numeric(u12Out[u12Out[,4]=="+",2])+9,
as.character=TRUE)
accNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=u12Out[u12Out[,4]=="-",1],
start=
as.numeric(u12Out[u12Out[,4]=="-",2]),
end=
as.numeric(u12Out[u12Out[,4]=="-",2])+39,
as.character=TRUE)
accPos=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=u12Out[u12Out[,4]=="+",1],
start=as.numeric(u12Out[u12Out[,4]=="+",3])-39,
end=as.numeric(u12Out[u12Out[,4]=="+",3])
,
as.character=TRUE)
donNeg=Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38,
names=u12Out[u12Out[,4]=="-",1],
start=as.numeric(u12Out[u12Out[,4]=="-",3])-9,
end=as.numeric(u12Out[u12Out[,4]=="-",3]),
as.character=TRUE)
donNeg<-
as.vector(as.character(Biostrings::reverseComplement(DNAStringSet(donNeg))))
accNeg<-
as.vector(as.character(Biostrings::reverseComplement(DNAStringSet(accNeg))))
accPos<- as.vector(accPos)
donPos<- as.vector(donPos)
donorSeq<- rep("",nrow(u12Out))
acceptorSeq<- rep("",nrow(u12Out))
donorSeq[u12Out[,4]=="+"]<-donPos
donorSeq[u12Out[,4]=="-"]<-donNeg
acceptorSeq[u12Out[,4]=="+"]<-accPos
acceptorSeq[u12Out[,4]=="-"]<-accNeg
u12OutSeq<- cbind(u12Out, donorSeq, acceptorSeq)
write.table(
u12OutSeq,
file="./AllU12Introns.tsv",
col.names=T, row.names=F, sep='\t', quote=F)
write.table(
u12OutSeq[which(u12OutSeq$transcript_biotype!="protein_coding"),],
file="./nonProtCodingU12Introns.tsv",
col.names=T, row.names=F, sep='\t', quote=F)
U12 annotation comparison
annotateU12()
function may lead to detection of different sets of U12-type introns. Here, we compare the U12-type introns detected from the same annotateU12()
run as mentioned above (on a reference built from HG19) to those reported by Merico, D. et al.Finally
suppl1.html
file from the suppl1.Rmd
file. library("rmarkdown")
render("./report.Rmd")
Reference
- Merico,D. et al. (2015) Compound heterozygous mutations in the noncoding RNU4ATAC cause Roifman Syndrome by disrupting minor intron splicing. Nat Commun, 6, 8718.
Add a comment