@@ -10,6 +10,8 @@ library(dplyr)
# Annotation mapping objects (presumably provided by org.Hs.eg.db, loaded
# earlier in the script) flattened into plain data frames.

# Entrez gene id <-> alias symbol
entrez <- org.Hs.egALIAS2EG |> as.data.frame()

# Entrez gene id <-> gene symbol
sym <- org.Hs.egSYMBOL |> as.data.frame()

# Ensembl gene id <-> Entrez gene id
ens <- org.Hs.egENSEMBL2EG |> as.data.frame()
1517
@@ -22,26 +24,34 @@ ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
# Ensembl gene ids whose biotype is "protein_coding", fetched from the
# `ensembl` mart created above.
tab <- getBM(
  attributes = "ensembl_gene_id",
  filters = "biotype",
  values = "protein_coding",
  mart = ensembl
)
2325
2426
# Alias rows: one row per (Entrez gene, alias symbol), labeled with the gene
# symbol from `sym`. left_join keeps aliases whose gene has no symbol entry
# (gene_symbol is NA for those).
# The join key is spelled out so the join does not depend on dplyr guessing
# the shared column, and each new column name is listed once.
joined.df <- entrez |>
  left_join(sym, by = "gene_id") |>
  dplyr::rename(
    entrez_id = "gene_id",
    gene_symbol = "symbol",
    other_id = "alias_symbol"
  ) |>
  mutate(other_id_source = "entrez_alias")
31+
# Ensembl gene rows: one row per (Entrez gene, Ensembl gene id), labeled with
# the gene symbol. inner_join drops genes that have no Ensembl mapping.
edf <- sym |>
  inner_join(ens, by = "gene_id") |>
  dplyr::rename(
    entrez_id = "gene_id",
    gene_symbol = "symbol",
    other_id = "ensembl_id"
  ) |>
  mutate(other_id_source = "ensembl_gene")
37+
3138
# Ensembl transcript rows: one row per (Entrez gene, transcript id), labeled
# with the gene symbol. `enst` is defined earlier in the script; it is joined
# on `gene_id` and supplies the `trans_id` column.
# Only genes that also appear in the Ensembl gene table (`edf`) are kept.
tdf <- sym |>
  inner_join(enst, by = "gene_id") |>
  dplyr::rename(
    entrez_id = "gene_id",
    gene_symbol = "symbol",
    other_id = "trans_id"
  ) |>
  subset(entrez_id %in% edf$entrez_id) |>
  dplyr::mutate(other_id_source = "ensembl_transcript")
3845
# Ensembl gene rows whose Ensembl id is in the protein-coding list.
prots <- edf[edf$other_id %in% tab$ensembl_gene_id, ]

# Stack alias, Ensembl gene and Ensembl transcript rows (rbind matches the
# columns by name), keep only genes present in `prots`, then drop
# duplicate rows.
full.df <- rbind(joined.df, edf, tdf)
full.df <- full.df[full.df$entrez_id %in% prots$entrez_id, ]
full.df <- distinct(full.df)
4152
# Save to file and version.
# Write the combined, protein-coding-filtered table (`full.df`); `joined.df`
# holds only the alias rows.
write.table(
  full.df,
  '/tmp/genes.csv',
  sep = ',',
  row.names = FALSE,
  quote = TRUE
)
4455
4556# #store this file somewhere!
4657
47-
0 commit comments