lh3
1/16/2016 - 4:23 AM

00README.md

package main

import (
	"os"
	"os/exec"
	"io"
	"fmt"
	"strings"
	"time"
	"path/filepath"
	"bufio"
	"regexp"
	"net/http"
	"strconv"
)

/****************
 * BSD getopt() *
 ****************/

var optind int = 1
var getopt_place int = -1

func getopt(args []string, ostr string) (int, string) {
	if getopt_place == -1 { // update scanning pointer
		if optind >= len(args) || args[optind][0] != '-' {
			getopt_place = -1
			return -1, ""
		}
		if optind < len(args) {
			getopt_place = 0
		}
		if getopt_place + 1 < len(args[optind]) {
			getopt_place += 1
			if args[optind][getopt_place] == '-' { // found "--"
				optind += 1
				getopt_place = -1
				return -1, ""
			}
		}
	}
	optopt := args[optind][getopt_place];
	getopt_place += 1
	oli, arg := strings.IndexByte(ostr, optopt), "";
	if optopt == ':' || oli < 0 {
		if optopt == '-' {
			return -1, ""
		}
		if getopt_place < 0 {
			optind += 1
		}
		return '?', ""
	}
	if oli + 1 >= len(ostr) || ostr[oli+1] != ':' {
		if getopt_place < 0 || getopt_place >= len(args[optind]) {
			optind += 1
			getopt_place = -1
		}
	} else {
		if getopt_place >= 0 && getopt_place < len(args[optind]) {
			arg = args[optind][getopt_place:]
		} else if optind += 1; len(args) <= optind { // no arguments
			getopt_place = -1
			if len(ostr) > 0 && ostr[0] == ':' {
				return ':', ""
			}
			return '?', ""
		} else {
			arg = args[optind]
		}
		getopt_place = -1
		optind += 1
	}
	return int(optopt), arg
}

/***************
 * BAM Crawler *
 ***************/

var hs_samtools = "./samtools";

type Metadata struct {
	ac string
	study string
	sample string
	format string
	asm string
	path string
}

var hs_meta []Metadata;

func hs_crawl(dirs []string) ([]Metadata) {
	var meta []Metadata;
	re, _ := regexp.Compile("^@RG.*\tSM:(\\S+)");
	re_fn, _ := regexp.Compile("([^\\s\\./]+)\\.([^\\s\\./]+)$");
	hash := make(map[string]string);
	for i := 0; i < len(dirs); i += 1 { // traverse each directory
		study := filepath.Base(dirs[i]);
		bams, _ := filepath.Glob(dirs[i] + "/*.bam"); // all BAM files in the directory
		for j := 0; j < len(bams); j += 1 {
			var ac, format string;
			match := re_fn.FindStringSubmatch(bams[j]);
			if len(match) > 1 {
				ac, format = match[1], match[2];
			}
			_, ok := hash[ac];
			if ok == true {
				fmt.Fprintf(os.Stderr, "WARNING: duplicated accessions: %s/%s <=> %s/%s\n", hash[ac], ac, study, ac);
			} else {
				hash[ac] = study;
				cmd := exec.Command(hs_samtools, "view", "-H", bams[j]); // extract the BAM header
				pipe, _ := cmd.StdoutPipe();
				sc := bufio.NewScanner(pipe);
				cmd.Start();
				var tmp []string;
				for sc.Scan() { // read by line
					match := re.FindStringSubmatch(sc.Text());
					if len(match) > 1 {
						present := false;	
						for k := 0 ; k < len(tmp); k += 1 { // FIXME: slooow given many samples in a BAM file; use a hash table instead!
							if tmp[k] == match[1] {
								present = true;
								break;
							}
						}
						if present == false {
							tmp = append(tmp, match[1]);
						}
					}
				}
				cmd.Wait();
				for k := 0; k < len(tmp); k += 1 {
					meta = append(meta, Metadata { ac, study, tmp[k], format, "GRCh37", bams[j] });
				}
			}
		}
	}
	return meta;
}

/*******************
 * Query Callbacks *
 *******************/

func hs_help(w http.ResponseWriter, r *http.Request) {
	fmt.Fprintf(w, "This is a bam-server for demonstration purposes only. It is hosting four BAMs\n");
	fmt.Fprintf(w, "in a 1Mb region 11:10,000,001-11,000,000 from GRCh37. The server has two entry\n");
	fmt.Fprintf(w, "points: /search to discover file accessions and /get to retrieve data (optionally\n");
	fmt.Fprintf(w, "in a region) when we know the file accession(s). The following gives a few examples\n");
	fmt.Fprintf(w, "about how to retrieve data from this web server.\n\n");
	fmt.Fprintf(w, "Examples in a web browser:\n\n");
	fmt.Fprintf(w, "  * List of studies and samples:\n");
	fmt.Fprintf(w, "    http://%s/search\n\n", r.Host);
	fmt.Fprintf(w, "  * Retrieve alignments for SGDP sample S_French-2 and S_Mbuti-1 in a small region:\n");
	fmt.Fprintf(w, "    http://%s/search?study=SGDP&sample=S_French-2,S_Mbuti-1\n", r.Host);
	fmt.Fprintf(w, "    http://%s/get?fmt=sam&ac=EXA00001,EXA00002&chr=11&start=10899000&end=10900000\n\n", r.Host);
	fmt.Fprintf(w, "Command-line examples:\n\n");
	fmt.Fprintf(w, "  * List of studies and samples:\n");
	fmt.Fprintf(w, "    curl -s 'http://%s/search' > study-sample.txt\n\n", r.Host);
	fmt.Fprintf(w, "  * Merged BAM:\n");
	fmt.Fprintf(w, "    curl -s 'http://%s/get?ac=EXA00001,EXA00002&chr=11&start=10300000&end=10500000' > merged.bam\n\n", r.Host);
	fmt.Fprintf(w, "  * BAM download without server-side re-coding (one accession; no region conditions):\n");
	fmt.Fprintf(w, "    wget 'http://%s/get?ac=EXA00001'\n\n", r.Host);
	fmt.Fprintf(w, "  * BAM retrieval without server-side re-coding:\n");
	fmt.Fprintf(w, "    samtools view -b 'http://%s/get?ac=EXA00001' 11:10300000-10500000 > S_French-2_part.bam\n\n", r.Host);
	fmt.Fprintf(w, "  * View alignment with samtools tview (one sample; no region conditions):\n");
	fmt.Fprintf(w, "    samtools tview 'http://%s/get?ac=EXA00001'\n\n", r.Host);
}

func hs_collect_strings(a []string) (map[string]int) {
	h := make(map[string]int);
	for i := 0; i < len(a); i += 1 {
		b := strings.Split(a[i], ",");
		for j := 0; j < len(b); j += 1 {
			h[b[j]] = 1;
		}
	}
	return h;
}

func hs_test_hash(h map[string]int, key string) (bool) {
	if len(h) > 0 {
		_, ok := h[key];
		return ok;
	}
	return true;
}

func hs_search(w http.ResponseWriter, r *http.Request) {
	r.ParseForm();
	ac := hs_collect_strings(r.Form["ac"]);
	study := hs_collect_strings(r.Form["study"]);
	sample := hs_collect_strings(r.Form["sample"]);
	for i := 0; i < len(hs_meta); i += 1 {
		m := hs_meta[i];
		if hs_test_hash(ac, m.ac) && hs_test_hash(study, m.study) && hs_test_hash(sample, m.sample) {
			fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", m.ac, m.study, m.sample, m.format, m.asm);
		}
	}
}

func hs_get(w http.ResponseWriter, r *http.Request) {
	// test if users are requesting to download BAI
	isBAI := false;
	if len(r.URL.RawQuery) > 4 && r.URL.RawQuery[len(r.URL.RawQuery)-4:] == ".bai" {
		isBAI = true;
		r.URL.RawQuery = r.URL.RawQuery[:len(r.URL.RawQuery)-4];
	}

	r.ParseForm();
	if len(r.Form["ac"]) == 0 {
		http.Error(w, "400 Bad Request: 'ac' MUST BE present", 400);
		return;
	}
	ac := hs_collect_strings(r.Form["ac"]);

	region := "";
	if len(r.Form["chr"]) > 0 {
		region = r.Form["chr"][0];
		if len(r.Form["start"]) > 0 {
			region += ":" + r.Form["start"][0];
		} else {
			region += ":1";
		}
		if len(r.Form["end"]) > 0 {
			region += "-" + r.Form["end"][0];
		}
	}

	// get the list of requested files
	var files, acs []string;
	for i := 0; i < len(hs_meta); i += 1 {
		_, ok := ac[hs_meta[i].ac];
		if ok {
			files = append(files, hs_meta[i].path);
			acs = append(acs, hs_meta[i].ac);
		}
	}
	if len(files) == 0 {
		http.Error(w, "400 Bad Request: non-existing 'ac'", 400);
		return;
	}

	// set content-type
	outfmt := "bam";
	if len(r.Form["fmt"]) > 0 {
		outfmt = r.Form["fmt"][0];
	}
	if outfmt == "sam" {
		w.Header().Set("Content-Type", "text/plain");
	} else if outfmt == "bam" {
		w.Header().Set("Content-Type", "application/octet-stream");
		if isBAI {
			w.Header().Set("Content-Disposition", "attachment; filename=\"" + strings.Join(acs, ",") + ".bai" + "\"");
		} else {
			w.Header().Set("Content-Disposition", "attachment; filename=\"" + strings.Join(acs, ",") + "." + outfmt + "\"");
		}
	}

	// write results
	if len(files) == 1 && region == "" && outfmt == "bam" { // copy through
		var offset int64 = 0;
		fn := files[0];
		if isBAI == true {
			fn += ".bai";
		} else if r.Header.Get("Range") != "" { // FIXME: to parse the end of the range
			re, _ := regexp.Compile("bytes=(\\d+)-");
			match := re.FindStringSubmatch(r.Header.Get("Range"));
			if len(match) >= 2 {
				offset, _ = strconv.ParseInt(match[1], 10, 64);
			}
			w.WriteHeader(206);
		}
		f, _ := os.Open(fn);
		if offset > 0 {
			f.Seek(offset, os.SEEK_SET);
		}
		defer f.Close();
		buf := make([]byte, 65536);
		for {
			buf = buf[:cap(buf)];
			n, err := f.Read(buf);
			if n == 0 && err == io.EOF {
				break;
			}
			buf = buf[:n];
			w.Write(buf);
		}
	} else { // use "samtools merge"; this requires relative recent samtools
		var args []string;
		args = append(args, "merge");
		if region != "" {
			args = append(args, "-R"+region);
		}
		args = append(args, "-1O"+outfmt, "-");
		args = append(args, files...);
		cmd := exec.Command(hs_samtools, args...);
		cmd.Stdout = w;
		cmd.Run();
	}
}

func main() {
	var port string = "8000";

	if os.Getenv("PORT") != "" {
		port = os.Getenv("PORT");
	}
	// parse command line options
	for {
		opt, arg := getopt(os.Args, "e:p:");
		if opt == 'p' {
			port = arg;
		} else if opt == 'e' {
			hs_samtools = arg;
		} else if opt < 0 {
			break;
		}
	}
	if optind == len(os.Args) {
		fmt.Fprintln(os.Stderr, "Usage: hts-server [options] <dir1> [...]");
		fmt.Fprintln(os.Stderr, "Options:");
		fmt.Fprintf(os.Stderr, "  -p INT    port number [%s or from $PORT env]\n", port);
		fmt.Fprintf(os.Stderr, "  -e FILE   samtools executable [%s]\n", hs_samtools);
		os.Exit(1);
	}

	hs_meta = hs_crawl(os.Args[optind:]);

	fmt.Fprintf(os.Stderr, "[%d] launched at port %s\n", time.Now().UnixNano(), port);

	http.HandleFunc("/", hs_help);
	http.HandleFunc("/search", hs_search);
	http.HandleFunc("/get", hs_get);
	http.ListenAndServe(fmt.Sprintf(":%s", port), nil);
}
#!/usr/bin/perl -w

use strict;
use warnings;

my $srv = 'http://bamsvr.herokuapp.com';
my ($study, $sample, $chr, $start, $end) = ('SGDP', 'S_French-2', '11', 10890000, 10900000);
# file discovery
my @ac = split /[\t\n]+/, `curl -sd "study=$study&sample=$sample" "$srv/search"|cut -f1|sort|uniq`;
# retrieve alignment data
if (@ac == 1) { # one and only one accession
	open FH, qq(curl -sd "ac=$ac[0]&chr=$chr&start=$start&end=$end&fmt=sam" "$srv/get"|) || die;
	print while (<FH>);
	close(FH);
}

This gist implements a proof-of-concept web server to provide remote access to multiple BAMs. You can launch the server with:

go run hts-server.go -e /path/to/samtools bam-dir1 bam-dir2

and test it with:

curl -s http://127.0.0.1:8000/

The server regards bam-dir1 and bam-dir2 as study accessions. For a BAM file bam-dir1/myfile1.bam, the file accession is myfile1. It is required that every file has a unique file name across all input directories (i.e. even in two directories, two files must be named differently).