esafwan
7/17/2019 - 8:51 PM

Use PHP and cURL to run a scroll query against Elasticsearch and retrieve all the results of a search query.

Use PHP and cURL to run a scroll query against Elasticsearch and retrieve all the results of a search query.

<?php
// Raise the memory ceiling for this script. Only needed when the default
// allocation is too small for the amount of data being pulled.
ini_set('memory_limit', '512M');

// Connection settings shared by every request the script makes.
$url = 'http://elasticsearchhost.com:9200/elasticsearch_index_attendance/_search';
$port = '9200';
$request_type = 'POST';
$headers = [
    'Cache-Control: no-cache',
    'Content-Type: application/json',
];

// Global placeholder for the merged result set. init() accumulates into its
// own local variable of the same name, so nothing in this file writes here.
// Writing each batch straight to a CSV file inside the loop would be faster
// and scale better than merging everything in memory.
$final_data = '';


/**
 * Entry point: runs the initial search with scrolling enabled, then keeps
 * requesting further pages via scroll() until a page comes back empty.
 *
 * Progress (per-page count, total count, number of requests) is printed as
 * HTML. If a response cannot be decoded into the expected shape, an error
 * line is printed and collection stops with whatever was gathered so far.
 *
 * @param string $url          Search endpoint, including the index name.
 * @param string $port         Port handed to curl_op().
 * @param string $request_type HTTP method handed to curl_op().
 * @param array  $headers      HTTP header lines handed to curl_op().
 *
 * @return array All hits collected across the first page and every scrolled
 *               page (empty when the initial search fails).
 */
function init($url,$port,$request_type,$headers){
    //The query for the first request: up to 2000 documents per page.
    $query = "{\n \"size\": \"2000\",\n  \"query\": {\n    \"bool\": {\n      \"must\": [\n          { \"match\": { \"type\":\"engagement\" }}\n      ]\n    }\n  }\n}";

    //Appended to the url: asks for a scroll context kept alive for 3 minutes.
    $params = "?scroll=3m";

    //First call: the actual search.
    $results = curl_op($url,$params,$port,$query,$headers,$request_type);
    $res_arr = json_decode($results);

    //curl_op() returns an error string on failure, which json_decode() turns
    //into null; bail out instead of dereferencing it.
    if (!isset($res_arr->hits->hits, $res_arr->_scroll_id) || !is_array($res_arr->hits->hits)) {
        print "<br>Initial search failed: " . htmlspecialchars((string) $results);
        return array();
    }

    //The first page is part of the result set, so start the collection with it.
    $final_data = $res_arr->hits->hits;
    $count = count($final_data);
    $scroll_id = $res_arr->_scroll_id;

    $ran = 1;  //Number of requests made so far (the initial search counts as one). [Optional]
    $total_count = $count; //Running total of hits received. [Optional]

    //Keep scrolling until a page comes back with zero hits.
    while ($count > 0) {
        $r = scroll($scroll_id,$port,$request_type,$headers);
        $r_arr = json_decode($r);

        if (!isset($r_arr->hits->hits) || !is_array($r_arr->hits->hits)) {
            print "<br>Scroll request failed: " . htmlspecialchars((string) $r);
            break;
        }

        //Hits of the page just fetched (not of the first response).
        $d = $r_arr->hits->hits;
        $count = count($d);

        //The scroll id may change between requests; always continue with the latest one.
        if (isset($r_arr->_scroll_id)) {
            $scroll_id = $r_arr->_scroll_id;
        }

        //Holding everything in memory does not scale to large datasets; writing
        //each page to a csv here would be better. Appending in place at least
        //avoids re-copying the whole collection on every page.
        foreach ($d as $hit) {
            $final_data[] = $hit;
        }

        $total_count = $total_count + $count;
        print "<br>Run:" . $ran . " Count: " . $count;  //Show nth run & current page count. [Optional]
        $ran++;
    }

    print "<hr><br>Total count: " . $total_count . " Total times ran: " . $ran . "<hr> <h4>Full Data</h4>";

    //Deliberately not dumped to the page: with thousands of documents the
    //output would overwhelm the browser.
    return $final_data;
}


/**
 * Fetches the next page of an open scroll context.
 *
 * @param string $scroll_id    Scroll id taken from a previous response.
 * @param string $port         Port handed to curl_op().
 * @param string $request_type HTTP method handed to curl_op().
 * @param array  $headers      HTTP header lines handed to curl_op().
 *
 * @return mixed Whatever curl_op() returns for the request.
 */
function scroll($scroll_id,$port,$request_type,$headers){
    //IMPORTANT: the scroll endpoint is addressed without an index name.
    $endpoint = "http://elasticsearchhost.com:9200/_search";

    //Path suffix appended to the endpoint by curl_op().
    $suffix = "/scroll";

    //Request body: keep the context alive another 3 minutes and pass the id.
    $body = sprintf("{\n    \"scroll\" : \"3m\", \n    \"scroll_id\" : \"%s\" \n}", $scroll_id);

    return curl_op($endpoint,$suffix,$port,$body,$headers,$request_type);
}

/**
 * Performs a single HTTP request with cURL and hands back the outcome.
 *
 * @param string $url          Base url of the request.
 * @param string $params       Text appended directly to $url (query string or path suffix).
 * @param string $port         Remote port to connect to.
 * @param string $query        Request body.
 * @param array  $headers      HTTP header lines.
 * @param string $request_type HTTP method.
 *
 * @return mixed The response body, or the text "cURL Error #:" followed by
 *               the cURL error message when cURL reports an error. Callers
 *               can only tell the two apart by that prefix.
 */
function curl_op($url,$params,$port,$query,$headers,$request_type){
    $options = [
        CURLOPT_PORT => $port,
        CURLOPT_URL => $url . $params,
        CURLOPT_RETURNTRANSFER => true, //return the body instead of printing it
        CURLOPT_ENCODING => "",
        CURLOPT_MAXREDIRS => 10,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
        CURLOPT_CUSTOMREQUEST => $request_type,
        CURLOPT_POSTFIELDS => $query,
        CURLOPT_HTTPHEADER => $headers,
    ];

    $handle = curl_init();
    curl_setopt_array($handle, $options);

    //Read the error text before closing the handle.
    $body = curl_exec($handle);
    $failure = curl_error($handle);
    curl_close($handle);

    if (!$failure) {
        return $body;
    }

    return "cURL Error #:" . $failure;
}


//Kick off the whole process: run the initial search and then scroll through
//the remaining pages, using the global configuration defined at the top.
init($url,$port,$request_type,$headers);