hieu292
10/2/2016 - 8:31 AM

SortingAndMergingBigData

// Reads the file named `nameFile` as a stream into memory (pseudocode stub:
// the actual reading logic is elided below).
// The function hands its result to the caller with `yield`, i.e. it is a
// generator rather than a plain function returning a value.
// NOTE(review): `data` is never assigned in the visible body; presumably each
// yield is meant to deliver one memory-sized chunk of log records - confirm.
func Read_File_Stream_In_Ram(nameFile){
    // read the stream into memory
    // ...reading logic omitted in this sketch
    yield data;
}
// Splits one large log file into sorted chunk files on disk.
//
// Every chunk produced by Read_File_Stream_In_Ram is sorted case-insensitively
// by `name_book` and written out under an index built from the chunk's first
// and last `unix_timestamp`.
//
// Params:  log - record with a `name` (file name) and a `loaded` flag
// Returns: the same `log` record, with `loaded` set to true
func Split_Chunk_Data(log){
    nameFile = log.name
    // Consume the reader chunk by chunk; the loop ends when the reader is
    // exhausted (a `while` over a buffer that is never emptied would spin forever).
    for data in Read_File_Stream_In_Ram(nameFile):
        if(len(data) == 0)
            continue // an empty chunk has no time range and nothing to write
        // Take the time range BEFORE sorting by name: once sorted, the first and
        // last records no longer bound the chunk in time.
        // assumes records arrive in chronological order - TODO confirm
        start_timestamp = data[0]["unix_timestamp"]
        end_timestamp = data[len(data) - 1]["unix_timestamp"]
        index = generate_index(start_timestamp, end_timestamp) //ex: "log-1-startTime-EndTime"
        // sort data chunk by name_book; the comparator must return a negative,
        // zero or positive number - a boolean does not give a reliable order
        dataSorted = data.sort((a, b) => {
            nameA = a.name_book.toUpperCase()
            nameB = b.name_book.toUpperCase()
            if(nameA < nameB) return -1
            if(nameA > nameB) return 1
            return 0
        })
        // write to disk, name file is nameFile and index, data is dataSorted
        Write_File(nameFile, index, dataSorted)
    log.loaded = true;
    return log;
}

// Runs `query` against every chunk file of `logFile` whose time range overlaps
// the query's [startTime, endTime] range.
//
// Params:  query   - record with `startTime` and `endTime` (plus the book name
//                    used by search_book)
//          logFile - name of the log file whose chunks are inspected
// NOTE(review): assumes query times and the times parsed from chunk names are
// directly comparable (same unit/format) - confirm.
func extract_Info(query, logFile){
    listChunkName = get_list_name_of_chunk_file(logFile) // output format: ["log-1-Jan01-Jan30.txt", "log-1-Feb01-Feb30.txt", ...]
    for chunkName in listChunkName:
        start_timestamp = get_startTime_from_ChunkName(chunkName)
        end_timestamp = get_endTime_from_ChunkName(chunkName)
        // Two ranges overlap only when EACH one starts no later than the other
        // ends, so both conditions must hold (&&). With || nearly every chunk
        // would be searched. Bounds are inclusive so a record lying exactly on
        // a boundary is not missed.
        if( query.startTime <= end_timestamp && query.endTime >= start_timestamp):
            //search book on this chunk
            search_book(query, chunkName)
}

// Looks up the queried book in one sorted chunk file and writes the matching
// records to disk.
//
// Params:  query     - record carrying the book name under the `namebook` key
//                      (the key the caller in main() builds)
//          chunkName - name of the chunk file to read
// NOTE(review): the result is written under `query` for every chunk searched;
// whether Write_File appends or overwrites is not visible here - confirm.
func search_book(query, chunkName){
    bookName = query.namebook
    data = Read_File(chunkName)
    lengthData = len(data)
    // search the whole chunk: 0-based, inclusive bounds
    result = search(data, 0, lengthData - 1, bookName)

    //Write Result to disk
    Write_File(query, result)
}

// Yields every record of sortedData[lowerPoint..upperPoint] (0-based, inclusive)
// whose `name_book` equals `bookName`, ignoring case.
//
// Precondition: sortedData is sorted case-insensitively by `name_book`, which
// is the order Split_Chunk_Data sorts each chunk into.
//
// Params:  sortedData - list of records sorted by name_book
//          lowerPoint - first index to consider
//          upperPoint - last index to consider
//          bookName   - book name to look for
func search(sortedData, lowerPoint, upperPoint, bookName){
    if(len(sortedData) == 0)
        return;
    target = bookName.toUpperCase()
    // Step 1: binary search for the FIRST index whose key is >= target.
    // The list is indexed in place - no sub-list is cut out, so the indices
    // always refer to the same array.
    low = lowerPoint
    high = upperPoint + 1 // exclusive upper bound
    while(low < high){
        middle = floor((low + high) / 2) // integer midpoint
        if(sortedData[middle].name_book.toUpperCase() < target)
            low = middle + 1
        else
            high = middle
    }
    // Step 2: equal keys are contiguous in sorted data, so walk forward and
    // yield each match until the key changes or the range ends.
    while(low <= upperPoint && sortedData[low].name_book.toUpperCase() == target){
        yield sortedData[low]
        low = low + 1
    }
}

// Entry point of the sketch:
//   1. split every log file into sorted chunk files,
//   2. run one example query against the chosen log file,
//   3. read back the report written for that query.
func main(){
    // Split large file and sort by Name book
    loaded_Log = []
    // every entry needs the `name` key: Split_Chunk_Data reads log.name
    name_array_log = [{name: 'log-1.txt', loaded: false}, {name: 'log-2.txt', loaded: false},...]
    for log in name_array_log:
        result = Split_Chunk_Data(log)
        loaded_Log.push(result)

    //Check log file and call extract informations
    // same '.txt' extension as the entries above, otherwise the name comparison
    // below can never succeed
    logFile = 'log-n.txt'
    query = { namebook : 'Book x', startTime: 'Aug 10', endTime: 'Aug 12'}
    for log in loaded_Log:
        // only query a log that has been split into chunks
        if(log.name == logFile and log.loaded):
            extract_Info(query, logFile)

    //read report file
    // NOTE(review): the value returned by Read_File is not used here -
    // presumably the report should be printed or returned; confirm.
    Read_File(query)
}

main()