# felipealbrecht
# 10/29/2016 - 6:43 AM
#
# Download all BLUEPRINT gene expression data and format as numeric matrix


# Load dependencies
# install DeepBlueR from bioconductor 
# http://bioconductor.org/packages/release/bioc/html/DeepBlueR.html
library(DeepBlueR)
library(dplyr)
library(tidyr)

# Ask DeepBlue for every sample whose "source" metadata is
# "BLUEPRINT Epigenome" and keep only the sample ids.
blueprint_samples <- deepblue_list_samples(
    extra_metadata = list(source = "BLUEPRINT Epigenome")
)
blueprint_samples_ids <- deepblue_extract_ids(blueprint_samples)

# Build a gene-level expression query restricted to those samples.
# Gene names are assigned from the Gencode 22 gene model.
gene_exprs_query <- deepblue_select_expressions(
    expression_type = "gene",
    sample_ids = blueprint_samples_ids,
    gene_model = "gencode v22"
)

# Submit the request. The format string lists the output columns, in order:
# gene id, FPKM value, biosource and sample id.
request <- deepblue_get_regions(
    query_id = gene_exprs_query,
    output_format = "@GENE_ID(gencode v22),FPKM,@BIOSOURCE,@SAMPLE_ID"
)

# Download the result of the request (one row per gene/sample entry).
gene_regions <- deepblue_download_request_data(request)

# Lookup table mapping sample ids to biosources: keep only the two annotation
# columns of the downloaded data and collapse them to their unique rows.
sample_names <- gene_regions %>%
    dplyr::select(`@BIOSOURCE`, `@SAMPLE_ID`) %>%
    dplyr::distinct()

# Identify gene ids that occur more than once within the same sample.
# tidyr::spread() needs every (gene id, sample id) pair to be unique, so these
# genes have to be dropped before reshaping. Duplicates are searched across
# all samples rather than in one hard-coded reference sample, so the script
# keeps working if a particular sample is absent from the download.
duplicated_genes <- gene_regions %>%
    dplyr::count(`@GENE_ID(gencode v22)`, `@SAMPLE_ID`) %>%
    dplyr::filter(n > 1) %>%
    dplyr::distinct(`@GENE_ID(gencode v22)`)

# Pull the ids out as a plain vector. `[[` returns a vector whether the table
# is a data.frame, tibble or data.table, which `%in%` below relies on.
duplicated_genes <- duplicated_genes[["@GENE_ID(gencode v22)"]]

# Drop the duplicated genes and the biosource column, then reshape from long
# format (one row per gene/sample entry) to wide format (one row per gene,
# one FPKM column per sample id).
genes_matrix <- gene_regions %>%
    dplyr::filter(!(`@GENE_ID(gencode v22)` %in% duplicated_genes)) %>%
    dplyr::select(-`@BIOSOURCE`) %>%
    tidyr::spread(key = `@SAMPLE_ID`, value = FPKM)

# Convert the wide gene expression table into a numeric matrix whose row names
# are the gene ids held in the first column.
# `[[1]]` extracts that column as a plain vector for data.frame, tibble and
# data.table alike (`[, 1]` only does so for a base data.frame).
genes <- genes_matrix[[1]]

# Coerce to a base data.frame and keep `drop = FALSE` so the remaining sample
# columns stay a table (with their column names) even if only one is left.
genes_matrix <- data.matrix(
    as.data.frame(genes_matrix)[, -1, drop = FALSE]
)
rownames(genes_matrix) <- genes

### OUTPUT
### genes_matrix : The gene expression matrix for all 276 BLUEPRINT samples
### sample_names : A mapping table from sample id to cell type / biosource