zhasm
5/23/2011 - 7:37 AM

Unshorten shortened URLs

Unshorten shortened URLs

#!/usr/bin/env python                                                                                                                                                                                                                       
# -*- coding: utf-8 -*-                                                                                                                                                                                                                     
#                                                                                                                                                                                                                                           
#author:         rex                                                                                                                                                                                                                        
#blog:           http://iregex.org                                                                                                                                                                                                          
#filename        unshortenurl.py                                                                                                                                                                                                            
#created:        2011-05-23 15:35                                                                                                                                                                                                           
                                                                                                                                                                                                                                            
import pycurl                                                                                                                                                                                                                               
import StringIO                                                                                                                                                                                                                             
import re                                                                                                                                                                                                                                   
                                                                                                                                                                                                                                            
class UnShortenUrl(object):
    """Resolve a shortened URL to the target named in its redirect response.

    A header-only request is issued for ``url`` through pycurl and the
    response headers are captured in an in-memory buffer.  The value of the
    first ``Location`` header found is the resolved URL.  Redirect following
    is not enabled on the handle, so only the first hop is reported.

    ``str(instance)`` performs the request and returns the resolved URL, or
    an empty string when the response carries no ``Location`` header.
    Errors raised by pycurl while performing the request propagate to the
    caller.
    """

    # HTTP allows optional whitespace after the colon, so do not insist on
    # exactly one space.  [ \t] (rather than \s) keeps an empty header value
    # from running on into the next header line.
    _LOCATION_RE = re.compile(r'''(?mi)^Location:[ \t]*(.+)$''')

    def __init__(self, url):
        """Prepare a curl handle and header buffer for resolving ``url``."""
        self.url = url

        c = pycurl.Curl()
        c.setopt(c.NOBODY, True)   # headers only; the body is never needed
        c.setopt(c.HEADER, True)   # send response headers to the write callback
        self.curl = c

        f = StringIO.StringIO()
        c.setopt(c.WRITEFUNCTION, f.write)
        self.f = f

    def resolve(self):
        """Perform the request and return the redirect target.

        Returns:
            The stripped value of the first ``Location`` header, or ``""``
            if the response contains none.
        """
        c = self.curl
        c.setopt(c.URL, self.url)

        # Discard headers left over from a previous call.  Rewinding before
        # truncating clears the buffer regardless of whether truncate()
        # itself moves the write position.
        f = self.f
        f.seek(0)
        f.truncate()

        c.perform()

        found = self._LOCATION_RE.search(f.getvalue())
        if found is None:
            return ""
        return found.group(1).strip()

    def __str__(self):
        """Return the resolved URL (performs a network request)."""
        return self.resolve()
                                                                                                                                                                                                                                            
def UnShortenAll(text):
    """Expand every j.mp / is.gd / goo.gl short link found in ``text``.

    Each match (``http`` or ``https``, host matched case-insensitively,
    followed by at least one non-space character) is resolved through
    ``UnShortenUrl`` and replaced by the resolved URL.  A link that yields
    no redirect target is left in the text unchanged.

    Args:
        text: The string to scan for short links.

    Returns:
        ``text`` with the resolvable short links replaced by their targets.
    """
    def _unshorten(match):
        short_url = match.group(1)
        # An empty result means no redirect target was found; keep the
        # original link instead of deleting it from the text.
        return str(UnShortenUrl(short_url)) or short_url

    return re.sub(
        r"""(?i)(https?://(?:j\.mp|is\.gd|goo\.gl)\S+)""", _unshorten, text)
                                                                                                                                                                                                                                            
def main():
    """Command-line entry point: expand the short URLs in ``sys.argv[1]``.

    Prints the expanded text on success.  With no argument, prints the
    usage line.  If expansion raises, prints the usage line followed by the
    error message instead of a traceback.
    """
    import sys

    # Check the argument count explicitly so a missing argument produces a
    # clean usage message rather than a caught "list index out of range".
    if len(sys.argv) < 2:
        print('''Usage: python %s <URL>''' % sys.argv[0])
        return

    try:
        print(UnShortenAll(sys.argv[1]))
    except Exception as e:
        # Top-level boundary: report the failure (for example an error
        # raised while performing the request) without a traceback.
        print('''Usage: python %s <URL>''' % sys.argv[0])
        print(str(e))
                                                                                                                                                                                                                                            
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()