felipealbrecht
10/20/2015 - 9:01 PM

Accessing and downloading experiments from DeepBlue

Accessing and downloading experiments from DeepBlue

# The most important DeepBlue entity is the experiment. 
# The experiments contains the Epigenetic Data with their data description, that we name as metadata.

import xmlrpclib

# Before going further, we must set up the client:
url = "http://deepblue.mpi-inf.mpg.de/xmlrpc"
server = xmlrpclib.Server(url, encoding='UTF-8', allow_none=True)

# You can use the anonymous key or your own user key
user_key = "anonymous_key"

# We can list the experiments with the list_experiments command. 
# The command has 6 parameters: genome, epigenetic_mark, sample, technique, project, user_key. 
# Only user_key is mandatory and the others parameters work as filters.


# From ours previous examples, lets list all experiments that are peaks with the Epigenetic Mark H3K27ac of the ENCODE and BLUEPRINT projects:
(status, experiments) = server.list_experiments("", "peaks", "H3K27ac", "", "", "", "Roadmap Epigenomics", user_key)
print experiments


# We can list the experiments that contains Samples from blood:
(status, experiments) = server.list_experiments("", "", "H3K27ac", "blood", "", "", "Roadmap Epigenomics", user_key)
# Partial output:
# [['e17561', 'E062-H3K27ac.gappedPeak.bed'], ['e17568', 'E062-H3K27ac.narrowPeak.bed'], ['e17573', 'E062-H3K27ac.broadPeak.bed'], ...
 

# In this example, we are only getting the experiments where the Sample has exactly the BioSource blood. 
# We use the get_biosource_children to obtain the biosources that are under the given biosource in the hierarchy tree. 
(status, blood_sub) = server.get_biosource_children('blood', user_key)
biosource_names = server.extract_names(blood_sub)[1]
(status, samples) = server.list_samples(biosource_names, {}, user_key)
samples_ids = server.extract_ids(samples)[1]

# List all experiments that are peaks , has H3k27ac as epigenetic mark and have the select samples from the blood from the blueprint and roadmap projects
(status, experiments) = server.list_experiments("", "peaks", "H3K27ac", "", samples_ids, "", ["Blueprint Epigenome", "Roadmap Epigenomics"], user_key)
# Experiments (partial) :
# ['e54155', 'S00B2JH1.ERX712684.H3K27ac.bwa.GRCh38.20150529.bedgraph'], ['e53518', 'S00GPRH1.ERX712689.H3K27ac.bwa.GRCh38.20150528.bedgraph'], ['e17561', 'E062-H3K27ac.gappedPeak.bed'], ...


# We can use the info command to gather more information about the experiment. 
# The info command can be used to obtain more information about the experiments:
status, e_info = server.info(experiments[0][0], user_key)  # ID of the first experiment
# Printing the information of the sample in the selected experimnt 
print e_info[0]["sample_info"]

# We will select only the experiments that constains the columns: CHROMOSOME, START, END, NAME, SCORE, STRAND, SIGNAL_VALUE, P_VALUE, Q_VALUE, that is the BED 6+3 format.
peak_experiments = []
peak_format = "CHROMOSOME,START,END,NAME,SCORE,STRAND,SIGNAL_VALUE,P_VALUE,Q_VALUE"
for experiment in experiments:
    (status, info) = server.info(experiment[0], user_key) # experiment[0] is the ID of the experiment.
    if info[0]["format"].startswith(peak_format): # We use startwith() because we also want experiments that contain more columns.
        peak_experiments.append(experiment)
   
   
#  Now that we selected some experiments, we can retrieve their content.

# The first step is to build a list of all experiments names.
experiments_names = server.extract_names(peak_experiments)[1]

# For selecting the experiments region we must use the command select_regions .
# This command has the following parameters: experiment_name, genome, epigenetic_mark, sample_id, technique, project, chromosome, start, end, and user_key. 
# As we already have the experiments names, we can just use this list. 
# Also, we will retrieve only the regions from the chromosome 1 from the starting position 0 to 100000000.
(status, query_id) = server.select_regions(experiments_names, None, None, None, None, None, "chr1", 0, 100000000, user_key)

# The 'query_id' is a pointer to the data that we selected.

# We can download the regions using the get_regions command. 
# This command requires the query_id, that was returned by the select_regions and the content format.
# We use some meta-columns, where the @NAME is the experiment name, @BIOSOURCE the BioSource and @SAMPLE_ID the Sample ID of the Region.

download_format = "CHROMOSOME, START, END, NAME, SCORE, @NAME, @BIOSOURCE, @SAMPLE_ID"
(status, request_id) = server.get_regions(query_id, download_format, user_key) 


# The command get_regions returns a request_id.
# Due the fact that all the data is processed dinamically, some requests processed almost immediately, where others may take minutes or even hours. (It depends on the amount of data that will be processed and the server usage.) 
# The request status can be verified with the info command:

(status, request_status) = server.info(request_id, user_key)
print request_status
# Wait for the server processing
(status, info) = server.info(request_id, user_key)
request_status = info[0]["state"]
while request_status != "done" and request_status != "failed":
  time.sleep(1)
  (status, info) = server.info(request_id, user_key)
  request_status = info[0]["state"]


# The request data can be retrieved when the request status be done.
(status, data) = server.get_request_data(request_id, user_key)

print download_format
print data

print "Total of " , len(data.split("\n")), " regions."
# Total of  292112  regions. (this number may changes with the inclusion of new datasets)