
PYTHON FILE ENCODING AND CONVERSIONS

  1. charmap error debugging
  2. Detecting encoding in Windows
  3. Detecting encoding with python
  4. Change encoding
  5. Encoding Read/Write
  6. Get Local Encoding
  7. Guess encoding of file
def guess_encoding(csv_file):
    """
    Name: guess_encoding
    Function: guess_encoding(file)
    Input: path to a (csv) file
    Output: the guessed encoding, as a string
    Usage: guess_encoding(file_in)
    """
    import io
    import locale
    with io.open(csv_file, "rb") as f:
        data = f.read(5)
    if data.startswith(b"\xEF\xBB\xBF"):  # UTF-8 with a "BOM"
        return "utf-8-sig"
    elif data.startswith(b"\xFF\xFE") or data.startswith(b"\xFE\xFF"):  # UTF-16 BOM (LE/BE)
        return "utf-16"
    else:  # in Windows, guessing utf-8 doesn't work, so we have to try
        try:
            with io.open(csv_file, encoding="utf-8") as f:
                f.read(222222)  # decode a generous chunk; raises on invalid UTF-8
            return "utf-8"
        except UnicodeDecodeError:
            return locale.getdefaultlocale()[1]
            
            
            
import os
import sys
import locale

file_in = sys.argv[1]  # file to inspect, passed on the command line

print(os.path.abspath(file_in))
#test_hexdump(file_in)
print(guess_encoding(file_in))
print(locale.getdefaultlocale())
def localencoding():
    """
    Name: localencoding
    Function: localencoding()
    Input: None
    Output: Tuple with local encodings
    Usage: print(localencoding())
    Notes: Returns a tuple of (language code, encoding),
           e.g. ('en_US', 'cp1252') on a Windows box.
    """
    import locale
    return locale.getdefaultlocale()
    
    
print(localencoding())
import os
import sys
import shutil


def predict_encoding(file_path, n_lines=20):
    '''Predict a file's encoding using chardet'''
    import chardet
    

    # Open the file as binary data
    with open(file_path, 'rb') as f:
        # Join binary lines for specified number of lines
        rawdata = b''.join([f.readline() for _ in range(n_lines)])

    return chardet.detect(rawdata)['encoding']
    #return chardet.detect(rawdata)
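
# A minimal usage sketch (the file name 'some_data.csv' is hypothetical).
# chardet.detect() returns a dict such as {'encoding': 'utf-8', 'confidence': 0.99, ...};
# the function above returns only its 'encoding' field.
#
# print(predict_encoding('some_data.csv'))              # e.g. 'utf-8'
# print(predict_encoding('some_data.csv', n_lines=100))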



####################################################

def correctSubtitleEncoding(filename, newFilename, encoding_from, encoding_to='ascii'):
    # Re-encode a subtitle file line by line, forcing CRLF line endings.
    # newline='' stops Python translating our '\r\n' again on Windows;
    # note that line[:-1] assumes every line ends with a newline character.
    with open(filename, 'r', encoding=encoding_from) as fr:
        with open(newFilename, 'w', encoding=encoding_to, newline='') as fw:
            for line in fr:
                fw.write(line[:-1] + '\r\n')
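
# Hypothetical usage sketch (file names are made up): convert a windows-1253
# subtitle file to UTF-8. The default encoding_to='ascii' raises
# UnicodeEncodeError on any non-ASCII character, so pass a wider target.
#
# correctSubtitleEncoding('movie.srt', 'movie_utf8.srt', 'windows-1253', encoding_to='utf-8')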
 
def writeConversion(sourceFile):
    import codecs
    import os
    # outputDir and targetFormat are assumed to be defined at module level
    outName = os.path.join(outputDir, os.path.basename(sourceFile.name))
    with codecs.open(outName, 'w', targetFormat) as targetFile:
        for line in sourceFile:
            targetFile.write(line)

def convertFileWithDetection(fileName):

    import codecs

    print("Converting '" + fileName + "'...")
    format = get_encoding_type(fileName)  # detect the source encoding first
    print(format)
    try:
        with codecs.open(fileName, 'r', format) as sourceFile:
            writeConversion(sourceFile)
            print('Done.')
            return
    except UnicodeDecodeError:
        pass

    print("Error: failed to convert '" + fileName + "'.")

def change_enco_blocksize(file_in, block_size, file_out):
    import codecs
    BLOCKSIZE = block_size  # or some other, desired size in bytes
    source_enc = predict_encoding(file_in, n_lines=20)  # detect once, reuse below
    print("Input file: " + str(source_enc))
    with codecs.open(file_in, "r", source_enc) as sourceFile:
        with codecs.open(file_out, "w", "UTF-8") as targetFile:
            while True:
                contents = sourceFile.read(BLOCKSIZE)
                if not contents:
                    break
                targetFile.write(contents)
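
# Hypothetical usage sketch: re-encode a large file to UTF-8 in 1 MB chunks
# so memory use stays flat regardless of file size.
#
# change_enco_blocksize('big_input.txt', 1048576, 'big_output_utf8.txt')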
                





 
def convert_to_utf8(filename):


    import os
    import sys
    import shutil
    
    # gather the encodings you think that the file may be
    # encoded in, inside a list
    encodings = ['windows-1253', 'iso-8859-7', 'macgreek', 'ascii', 'iso-8859-1']
    
    encodings.append(predict_encoding(filename))
    print(encodings)
    
 
    # try to open the file (as raw bytes) and exit if some IOError occurs
    try:
        raw = open(filename, 'rb').read()
    except IOError:
        sys.exit(1)
 
    # now start iterating over our encodings list and try to
    # decode the raw bytes
    for enc in encodings:
        try:
            # try to decode the file with the current encoding;
            # if it succeeds we break out of the loop, and the
            # data variable holds our decoded text
            data = raw.decode(enc)
            break
        except (UnicodeDecodeError, LookupError, TypeError):
            # if this encoding fails, continue with the next one
            # from the list, and so on... until one succeeds.
            # if we reach the last encoding without success,
            # exit the program.
            if enc == encodings[-1]:
                print("Couldn't find encoding")
                sys.exit(1)
            continue
 
    # now get the absolute path of our filename and append .bak
    # to the end of it (for our backup file)
    fpath = os.path.abspath(filename)
    newfilename = fpath + '.bak'
    # and make our backup file with shutil
    shutil.copy(filename, newfilename)
 
    # and at last re-write it as utf-8 (as bytes, so open in 'wb')
    f = open(filename, 'wb')
    try:
        f.write(data.encode('utf-8'))
    except Exception as e:
        print(e)
    finally:
        f.close()
        
        
if __name__ == '__main__':
    convert_to_utf8(sys.argv[1])
    
    
There are various ways to detect encoding in Windows:
  
  1. Standard Notepad: open the file -> click Save As -> the encoding is
     shown in a drop-down list next to the Save button.
     
  2. Git or Cygwin: if you have Git or Cygwin installed you can use the Linux
     'file' command (e.g. file *, file -bi *). With Git: right-click in the
     directory -> "Git Bash Here" -> type "file -bi <input_file>".
     (A subprocess-based version of this appears further down.)
     
  3. A dedicated utility, e.g. a file checker written in C# that detects the encoding.
###############################################
#
# Main module: chardet
# Known issues: Slight incompatibility py2/py3
# Main use: predict_encoding
#
# Sources: https://pypi.org/project/chardet/
# Troubleshooting: http://getpython3.com/diveintopython3/case-study-porting-chardet-to-python-3.html
#
###############################################



import sys

file_in = sys.argv[1]

# import magic

# blob = open('unknown-file').read()
# m = magic.open(magic.MAGIC_MIME_ENCODING)
# m.load()
# encoding = m.buffer(blob)  # "utf-8" "us-ascii" etc


def predict_encoding(file_path, n_lines=20):
    '''Predict a file's encoding using chardet'''
    import chardet
    

    # Open the file as binary data
    with open(file_path, 'rb') as f:
        # Join binary lines for specified number of lines
        rawdata = b''.join([f.readline() for _ in range(n_lines)])

    #return chardet.detect(rawdata)['encoding']
    return chardet.detect(rawdata)
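
# For reference, chardet.detect() returns a dict shaped roughly like
# {'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}
# (exact keys depend on the chardet version), so this variant lets the
# caller inspect the detection confidence as well as the encoding name.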
    

def get_encoding_type(current_file):
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    for line in open(current_file, 'rb'):
        detector.feed(line)
        if detector.done: break
    detector.close()
    return detector.result['encoding']


    
def predict_encoding_sim(file_in):
    import chardet
    # chardet.detect() expects bytes, not a file object
    with open(file_in, mode="rb") as fh:
        sc = chardet.detect(fh.read())
    return sc




def predict_encoding_adv(file_path):
    from chardet.universaldetector import UniversalDetector
    with open(file_path, 'rb') as f:
        detector = UniversalDetector()
        for line in f:  # iterate lazily instead of readlines()
            detector.feed(line)
            if detector.done: break
        detector.close()  # must actually be called, or the result may be incomplete
    return detector.result
    
    
#print(predict_encoding_sim(file_in))
print(predict_encoding(file_in))
print(predict_encoding_adv(file_in))


################################################################
# Different method - using the Linux 'file' command via subprocess

import subprocess

def file_reports_utf8(path):
    # run e.g. "file test.txt" and capture its output
    file_cmd = ['file', path]
    p = subprocess.Popen(file_cmd, stdout=subprocess.PIPE)
    cmd_output = p.stdout.readlines()
    # the output begins with the file type as reported by 'file',
    # e.g. b'test.txt: UTF-8 Unicode text\n'
    x = cmd_output[0].decode().split(": ")[1]
    return x.startswith('UTF-8')

# print(file_reports_utf8('test.txt'))
################################################################
# charmap error debugging - a typical UnicodeEncodeError on Windows (cp1252):

#Traceback (most recent call last):
#  File "C:/Users/Andres/Desktop/scrap/scrap.py", line 444, in <module>
#    dar_p_fisica()
#  File "C:/Users/Andres/Desktop/scrap/scrap.py", line 390, in dar_p_fisica
#    print(datos.text) #.encode().decode('ascii', 'ignore')
#  File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
#    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
#UnicodeEncodeError: 'charmap' codec can't encode character '\u2010' in position 173: character maps to <undefined>
 
 
#In command prompt:
# change the code page
 
chcp 65001
 
# Other code pages are listed in the CHCP notes below


# chcp command
# Change the active console Code Page. The default code page is determined by the Windows Locale.
#
# Syntax:
#   CHCP code_page
#
# Key:
#   code_page   A code page number (e.g. 437)

#This command is rarely required as most GUI programs and PowerShell now support Unicode. 
#When working with characters outside the ASCII range of 0-127, the choice of code page will 
#determine the set of characters displayed. Programs that you start after you assign a new 
#code page will use the new code page, however, programs (except Cmd.exe) that you started 
#before assigning the new code page will use the original code page.
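 
# A minimal Windows-only sketch: query the code pages the current console is
# actually using, from Python via ctypes (GetConsoleCP / GetConsoleOutputCP
# are Win32 kernel32 APIs).
import ctypes
kernel32 = ctypes.windll.kernel32
print(kernel32.GetConsoleCP())        # input code page, e.g. 437 or 65001
print(kernel32.GetConsoleOutputCP())  # output code page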
 
# Code page	Country/ Region/ Language

#437	United States
#850	Multilingual (Latin I)
#852	Slavic (Latin II)
#855	Cyrillic (Russian)
#857	Turkish
#860	Portuguese
#861	Icelandic
#863	Canadian-French
#865	Nordic
#866	Russian
#869	Modern Greek
#1252	West European Latin
#65000	UTF-7 *
#65001	UTF-8 *

#* The 65000/1 code pages are encoded as UTF-7/8 to allow working with unicode 
#data in 7-bit and 8-bit environments.
 
#However, even if you use CHCP to run the Windows Console in a unicode code page, 
#many applications will assume that the default still applies, e.g. Java requires 
#the -Dfile.encoding option: java -Dfile.encoding=UTF-8
 
#Unicode characters will only display if the current console font contains the 
#characters. So use a TrueType font like Lucida Console instead of the CMD default Raster Font.
 
#The CMD Shell (which runs inside the Windows Console)
#CMD.exe only supports two character encodings, ASCII and Unicode (CMD /A and CMD /U)
 
#If you need full unicode support use PowerShell. There is still VERY limited support 
#for unicode in the CMD shell, piping, redirection and most commands are still ANSI only. 
#The only commands that work are DIR, FOR /F and TYPE, this allows reading and writing 
#(UTF-16LE / BOM) files and filenames but not much else.
 
#Defaults
#The default code page in the USA is 437, the default in most of Europe is 850. The number 
#of supported code pages was greatly increased in Windows 7. For a full list of code pages 
#supported on your machine, run NLSINFO (Resource Kit Tools)
 
#Files saved in Windows Notepad will be in ANSI format by default, but can also be saved as 
#Unicode UTF-16LE or UTF-8; for unicode files Notepad will include a BOM. A BOM makes a batch 
#file non-executable on Windows, so batch files must be saved as ANSI, not Unicode.
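 
# A minimal sketch (the file name 'script.bat' is hypothetical): strip a UTF-8
# BOM so cmd.exe can run the batch file. The 'utf-8-sig' codec consumes the BOM
# if present and behaves like plain utf-8 otherwise.
import io
with io.open('script.bat', encoding='utf-8-sig') as src:
    text = src.read()
with io.open('script.bat', 'w', encoding='ascii', errors='replace') as dst:
    dst.write(text)  # non-ASCII characters become '?' placeholders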
 
#Examples:
 
#View the current code page:
chcp
 
#Change the code page to Unicode/65001:
chcp 65001
 
#“Remember that there is no code faster than no code” ~ Taligent's Guide to Designing Programs