crazy4groovy
4/14/2014 - 12:35 PM

Get photos from one or more DeviantArt galleries, then download them to a local folder or export the URL list to a local file

Get photos from one or more DeviantArt galleries, then download them to a local folder or export the URL list to a local file

/*
 * author:      crazy4groovy
 * description: given a list of 1+ deviant art gallery URLs (semicolon separated),
 *              will parse out a list of all full size images found.
 * license:     MIT, no warranties or guarantees!
 */

import static groovyx.gpars.GParsPool.withPool as parallel

/////////////////////////////////
// ---- Positional command-line arguments (all optional) ----
// args[0]: one or more gallery URLs, separated by semicolons
List urls = args.size() > 0 ?
             args[0].split(';')*.trim() :
             ['http://name_of_account.deviantart.com/gallery/?set=gallery_id_number']
// args[1]: output target; an existing directory means "download the images there",
//          anything else is treated as a file the URL list is appended to
String fileOutPath = args.size() > 1 ? args[1] : /C:\out.txt/
// args[2]: the per-gallery paging loop stops once its image count reaches this value
int maxCount = args.size() > 2 ? args[2].toInteger() : 2000
// args[3]: worker threads for gallery scraping, clamped to 1..4 so that a zero or
//          negative value cannot reach the thread pool
int threads = args.size() > 3 ? Math.max(1, Math.min(args[3].toInteger(), 4)) : 2
// args[4]: text placed between the group name and the image file name when downloading;
//          the default (File.separator) gives every group its own sub-directory
String groupSeparator = args.size() > 4 ? args[4] : File.separator
/////////////////////////////////

println "$fileOutPath $maxCount $threads $groupSeparator"

// step applied to the gallery 'offset' query parameter for each page
int offsetInterval = 24

// group name -> ordered set of image URLs found for that group
Map imgMap = Collections.synchronizedMap([:].withDefault{[] as LinkedHashSet})
// query URL -> number of retries so far. A withDefault map inserts on read, so even
// lookups mutate it; it is shared by the worker threads and must be synchronized
// just like imgMap.
Map retriesMap = Collections.synchronizedMap([:].withDefault{0})
// maximum retries for a single page before it is skipped
int timeoutTries = 5

// Scrape every gallery URL on a small worker pool. Each worker pages through one
// gallery (via the 'offset' query parameter) and stores the 'data-super-img'
// attribute of every matching <a> element in imgMap under the gallery's group name.
// NOTE(review): this depends on Yahoo's public YQL endpoint being reachable -- confirm
// the service is still available before relying on this script.
parallel(threads) {
    urls.eachParallel { url ->
        println "START *** ${url} *****>>"

        String picGroup = getPicGroup(url)
        if (!picGroup) {
          println "Invalid URL, please check it"
          return
        }

        // The gallery URL is embedded inside the YQL query, so it must be URL-encoded.
        // The charset is given explicitly: the single-argument overload is deprecated
        // and falls back to the platform default encoding.
        url = java.net.URLEncoder.encode(url + (!url.contains('?') ? '?' : '&'), 'UTF-8')

        int count = 0          // unique images collected for this gallery so far
        int page = 0
        int skippedPages = 0   // consecutive pages given up on after repeated time-outs

        while (count < maxCount) {
            String u = url + 'offset=' + (page*offsetInterval)
            String yql = """https://query.yahooapis.com/v1/public/yql?diagnostics=true&q=select%20*%20from%20html%20where%20xpath%3D%27%2F%2Fa%5B%40data-super-img%5D%27and%20url%3D'${u}'"""
            println "{{{ $yql }}}"

            // linear back-off: 100 ms base plus one extra second per previous retry
            Thread.sleep(1000 * (retriesMap[yql] ?: 0) + 100)

            def root

            try {
                root = new XmlSlurper().parse(yql)
            }
            catch (java.io.IOException ex) {
                println ("caught IOException! $picGroup -- $ex")
                break;
            }
            catch (Exception ex) {
                println ("caught Exception! $picGroup -- $ex")
                break;
            }

            // a non-empty 'error' attribute in the diagnostics means the fetch failed
            String err = root.diagnostics.url?.@error
            if (err != "") {
                if (retriesMap[yql] < timeoutTries) {
                    println "!  timed out :: $err -- retrying $picGroup pg#$page"
                    retriesMap[yql] = retriesMap[yql] + 1
                }
                else {
                    println "!! timed out :: $err -- SKIPPING $picGroup pg#$page"

                    // only a read time-out is worth skipping past; anything else ends the gallery
                    if (err != "Read timed out") break;

                    // count never grows while pages keep timing out, so without this
                    // bound the loop would page forward indefinitely
                    skippedPages++
                    if (skippedPages >= timeoutTries) {
                        println "!! giving up on $picGroup after $skippedPages consecutive timed-out pages"
                        break;
                    }
                    page++
                }

                continue;
            }
            skippedPages = 0

            List res = root.results.a.collect{it.@'data-super-img'.text()}
            println "found: ${res.size()}"
            if (!res) break;

            int uniqueCount
            int grandTotal
            // Other workers add to their own sets concurrently; hold the map's lock
            // while mutating and while iterating every set for the grand total.
            synchronized (imgMap) {
                Set group = imgMap[picGroup]
                group.addAll(res)
                uniqueCount = group.size()
                grandTotal = imgMap.values().flatten().size()
            }

            // A page that contributed no new image means we are no longer making
            // progress (the set de-duplicates), so stop paging this gallery.
            if (uniqueCount == count) break;
            count = uniqueCount

            println "?? $picGroup pg#$page (${(page*offsetInterval)}) :: found: ${res.size()}, total size: $count >> set grand total size: $grandTotal"

            page++
        }

        println "END : ${url}; #$count"
    }
}

println "*SET GRAND TOTAL img list: (${imgMap.values().flatten().size()})*"

// Output stage: if fileOutPath is an existing directory, download every image into it
// (grouped by gallery); otherwise append the plain URL list to fileOutPath as a file.
File o = new File(fileOutPath)
if (o.isDirectory()) {
    def ant = new AntBuilder()

    imgMap.each { groupName, imgSet ->
        // Sanitize only the group-derived part of the path. Rewriting the whole absolute
        // path would corrupt a user-chosen output directory that legitimately contains
        // one of these characters. ':' is included because group names derived from
        // URLs may carry a scheme, and it is not valid inside a Windows file name.
        String groupPart = (groupName + groupSeparator).replaceAll(/[\*\?"<>|=:]/,'_')
        String rootPath = o.absolutePath + File.separator + groupPart

        // with the default separator each group lives in its own sub-directory
        File groupDir = new File(rootPath)
        if (groupSeparator == File.separator && !groupDir.directory) {
            groupDir.mkdirs()
        }

        parallel(4) {
            imgSet.eachParallel { imgUrl ->
                // file name = last path segment of the image URL
                String imgName = imgUrl.split('/')[-1]
                try {
                    ant.get(src: imgUrl, dest: rootPath + imgName, skipexisting: 'true')
                }
                catch (Exception ex) {
                    // best effort: keep downloading the rest, but report the failure
                    println "!! failed to download $imgUrl -- $ex"
                }
            }
        }
    }
}
else {
    print "Directory $fileOutPath does not exist, writing to file..."
    // '<<' appends, so the list is added to any existing file content
    o << imgMap.values().flatten().join('\n') + '\n'
    println "DONE"
}

/**
 * Derives a group name from a DeviantArt gallery URL.
 *
 * The name is built from the account sub-domain, the path after
 * "deviantart.com" and, when the query string carries a "q=" parameter,
 * that parameter's value in place of the query string. Slashes become
 * underscores, e.g. "http://foo.deviantart.com/gallery/?q=cats" yields
 * "foo_gallery_cats".
 *
 * @param url gallery URL, with or without an http:// or https:// scheme
 * @return the group name, or null when the URL is empty or does not match
 *         the expected "<account>.deviantart.com/<path>" shape
 */
String getPicGroup( String url ) {
    if (!url) return null

    // strip either scheme; previously only "http://" was removed, so https URLs
    // leaked "https:" into the group name
    String hostAndPath = url.replaceFirst('(?i)^https?://', '')

    // dots are escaped so they match literally instead of matching any character
    def m = (hostAndPath =~ /^(.+)\.deviantart\.com(.+?)(\?.*?q=(\w+))?$/)
    if (!m.find()) return null

    // group 4 is null when there is no "q=" parameter
    String name = m.group(1) + m.group(2) + (m.group(4) ?: '')
    return name.replaceAll('/',' ').trim().replaceAll(' ','_')
}