def guess_encoding(csv_file):
    """
    Name: guess_encoding
    Function: guess_encoding(file)
    Input: path to the file to examine
    Output: name of a Python codec, e.g. "utf-8-sig", "utf-16", or the locale default
    Usage: guess_encoding(file_in)
    """
    import io
    import locale
    with io.open(csv_file, "rb") as f:
        data = f.read(5)
    if data.startswith(b"\xEF\xBB\xBF"):  # UTF-8 with a "BOM"
        return "utf-8-sig"
    elif data.startswith(b"\xFF\xFE") or data.startswith(b"\xFE\xFF"):
        return "utf-16"
    else:  # on Windows, guessing utf-8 doesn't work, so we have to try decoding
        try:
            with io.open(csv_file, encoding="utf-8") as f:
                f.read(222222)  # decode a sizeable preview; raises on invalid utf-8
            return "utf-8"
        except UnicodeDecodeError:
            return locale.getdefaultlocale()[1]
#test_hexdump(file_in)
import locale
print(guess_encoding(file_in))
print(locale.getdefaultlocale())
def localencoding():
    """
    Name: localencoding
    Function: localencoding()
    Input: None
    Output: Tuple with the locale defaults
    Usage: print(localencoding())
    Notes: Returns a (language code, encoding) pair, e.g. ('en_US', 'cp1252') on Windows.
    """
    import locale
    return locale.getdefaultlocale()
print(localencoding())
import os
import sys
import shutil
def predict_encoding(file_path, n_lines=20):
'''Predict a file's encoding using chardet'''
import chardet
# Open the file as binary data
with open(file_path, 'rb') as f:
# Join binary lines for specified number of lines
rawdata = b''.join([f.readline() for _ in range(n_lines)])
return chardet.detect(rawdata)['encoding']
#return chardet.detect(rawdata)
####################################################
def correctSubtitleEncoding(filename, newFilename, encoding_from, encoding_to='ascii'):
    # Re-encode a subtitle file line by line, normalizing line endings to CRLF.
    # newline='' stops Python from translating the explicit '\r\n' a second time.
    with open(filename, 'r', encoding=encoding_from) as fr:
        with open(newFilename, 'w', encoding=encoding_to, newline='') as fw:
            for line in fr:
                fw.write(line.rstrip('\n') + '\r\n')
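# Hypothetical usage (the filenames and source encoding are placeholders):
# correctSubtitleEncoding('movie.srt', 'movie_utf8.srt', 'cp1253', 'utf-8')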
def writeConversion(sourceFile):
    # expects outputDir and targetFormat to be defined at module level
    import codecs
    import os
    outName = os.path.basename(sourceFile.name)
    with codecs.open(outputDir + '/' + outName, 'w', targetFormat) as targetFile:
        for line in sourceFile:
            targetFile.write(line)
def convertFileWithDetection(fileName):
    import codecs
    print("Converting '" + fileName + "'...")
    format = get_encoding_type(fileName)
    print(format)
    try:
        with codecs.open(fileName, 'r', format) as sourceFile:
            writeConversion(sourceFile)
        print('Done.')
        return
    except UnicodeDecodeError:
        pass
    print("Error: failed to convert '" + fileName + "'.")
def change_enco_blocksize(file_in, block_size, file_out):
    import codecs
    BLOCKSIZE = block_size  # or some other desired size in bytes
    source_encoding = predict_encoding(file_in, n_lines=20)
    print("Input file encoding: " + str(source_encoding))
    with codecs.open(file_in, "r", source_encoding) as sourceFile:
        with codecs.open(file_out, "w", "UTF-8") as targetFile:
            while True:
                contents = sourceFile.read(BLOCKSIZE)
                if not contents:
                    break
                targetFile.write(contents)
def convert_to_utf8(filename):
    import os
    import sys
    import shutil
    # gather the encodings you think the file may be encoded in
    encodings = ['windows-1253', 'iso-8859-7', 'macgreek', 'ascii', 'iso-8859-1']
    encodings.append(predict_encoding(filename))
    print(encodings)
    # read the raw bytes and exit if some IOError occurs
    try:
        raw = open(filename, 'rb').read()
    except Exception:
        sys.exit(1)
    # now iterate over the candidate encodings and try to decode the bytes
    for enc in encodings:
        try:
            # if decoding succeeds we reach break, leaving the loop;
            # the data variable holds the decoded text
            data = raw.decode(enc)
            break
        except Exception:
            # if this encoding fails, continue with the next candidate;
            # if even the last candidate fails, exit the program
            if enc == encodings[-1]:
                print("Couldn't find encoding")
                sys.exit(1)
            continue
    # get the absolute path of the file and append .bak
    # to the end of it (for our backup file)
    fpath = os.path.abspath(filename)
    newfilename = fpath + '.bak'
    # make the backup copy with shutil
    shutil.copy(filename, newfilename)
    # and at last rewrite the file as utf-8
    f = open(filename, 'w', encoding='utf-8')
    try:
        f.write(data)
    except Exception as e:
        print(e)
    finally:
        f.close()
if __name__ == '__main__':
convert_to_utf8(sys.argv[1])
# There are various ways to detect encoding in Windows:
# 1. Standard Notepad: open the file -> click Save As -> the encoding is shown
#    in a drop-down list next to the Save button.
# 2. Git or Cygwin: if you have Git or Cygwin installed you can use the Linux
#    'file' command (e.g. file *, file -bi *). With Git: right-click in the
#    directory -> Open Git Bash here -> type "file -bi <input>". A Python
#    wrapper is sketched just below.
# 3. A dedicated C# utility: a file-checker program that detects the encoding.
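# A minimal sketch of option 2 driven from Python (assumes Git/Cygwin's 'file'
# is on PATH; the function name is mine):
import subprocess
def file_mime_encoding(path):
    # 'file -bi' prints something like "text/plain; charset=utf-8"
    out = subprocess.run(['file', '-bi', path], capture_output=True, text=True).stdout
    return out.split('charset=')[-1].strip()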
###############################################
#
# Main module: chardet
# Known issues: Slight incompatibility py2/py3
# Main use: predict_encoding
#
# Sources: https://pypi.org/project/chardet/
# Troubleshooting: http://getpython3.com/diveintopython3/case-study-porting-chardet-to-python-3.html
#
###############################################
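# Minimal chardet usage for reference: detect() takes bytes and returns a dict
# with at least 'encoding' and 'confidence' keys (exact values vary with input):
import chardet
print(chardet.detect(u'Ελληνικά'.encode('utf-8')))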
import sys
file_in = sys.argv[1]
# import magic
# blob = open('unknown-file').read()
# m = magic.open(magic.MAGIC_MIME_ENCODING)
# m.load()
# encoding = m.buffer(blob) # "utf-8" "us-ascii" etc
def predict_encoding(file_path, n_lines=20):
    '''Predict a file's encoding using chardet'''
    import chardet
    # Open the file as binary data
    with open(file_path, 'rb') as f:
        # Join binary lines for the specified number of lines
        rawdata = b''.join([f.readline() for _ in range(n_lines)])
    # unlike the earlier variant, return the full detection result dict
    #return chardet.detect(rawdata)['encoding']
    return chardet.detect(rawdata)
def get_encoding_type(current_file):
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    with open(current_file, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def predict_encoding_sim(file_in):
    import chardet
    # chardet.detect() expects bytes, so read the file first
    with open(file_in, mode="rb") as fh:
        return chardet.detect(fh.read())
def predict_encoding_adv(file_path):
    from chardet.universaldetector import UniversalDetector
    with open(file_path, 'rb') as f:
        detector = UniversalDetector()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
    return detector.result
#print (predict_encoding_sim(file_in))
print(predict_encoding(file_in))
print(predict_encoding_adv(file_in))
################################################################
# Different method - using the 'file' command on Linux
import subprocess

def check_utf8(file_path):
    # wrapped in a function so the return statement is valid
    file_cmd = ['file', file_path]
    p = subprocess.Popen(file_cmd, stdout=subprocess.PIPE)
    cmd_output = p.stdout.readlines()
    # the output begins with "<name>: <type>", as observed with the 'file' command
    x = cmd_output[0].decode().split(": ")[1]
    return x.startswith('UTF-8')
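# Hypothetical usage, mirroring the 'test.txt' hard-coded in the original snippet:
# print(check_utf8('test.txt'))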
#Traceback (most recent call last):
# File "C:/Users/Andres/Desktop/scrap/scrap.py", line 444, in <module>
# dar_p_fisica()
# File "C:/Users/Andres/Desktop/scrap/scrap.py", line 390, in dar_p_fisica
# print(datos.text) #.encode().decode('ascii', 'ignore')
# File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
# return codecs.charmap_encode(input,self.errors,encoding_table)[0]
#UnicodeEncodeError: 'charmap' codec can't encode character '\u2010' in position 173: character maps to <undefined>
#In command prompt:
# change the character code page (65001 = UTF-8)
chcp 65001
# Other codepages at CHCP.com
# chcp command
#Change the active console Code Page. The default code page is determined by the Windows Locale.
#Syntax
#   CHCP code_page
#Key
#   code_page   A code page number (e.g. 437)
#This command is rarely required as most GUI programs and PowerShell now support Unicode.
#When working with characters outside the ASCII range of 0-127, the choice of code page
#determines the set of characters displayed. Programs that you start after you assign a new
#code page will use the new code page; however, programs (except Cmd.exe) that you started
#before assigning the new code page will use the original code page.
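# Reading the active console code page from Python - a minimal Windows-only sketch
# using the Win32 GetConsoleOutputCP API via ctypes:
import ctypes, sys
if sys.platform == 'win32':
    kernel32 = ctypes.windll.kernel32
    print("Active console output code page:", kernel32.GetConsoleOutputCP())
    # kernel32.SetConsoleOutputCP(65001) would have the same effect as 'chcp 65001'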
# Code page Country/ Region/ Language
#437 United States
#850 Multilingual (Latin I)
#852 Slavic (Latin II)
#855 Cyrillic (Russian)
#857 Turkish
#860 Portuguese
#861 Icelandic
#863 Canadian-French
#865 Nordic
#866 Russian
#869 Modern Greek
#1252 West European Latin
#65000 UTF-7 *
#65001 UTF-8 *
#* The 65000/1 code pages are encoded as UTF-7/8 to allow working with unicode
#data in 7-bit and 8-bit environments.
#Even if you use CHCP to run the Windows Console in a unicode code page, many
#applications will assume that the default still applies, e.g. Java requires
#the -Dfile option: java -Dfile.encoding=UTF-8
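# How this looks from Python's side - a minimal check. Note that since Python 3.6
# (PEP 528) the interactive Windows console uses UTF-8 regardless of chcp, while
# redirected streams still follow the ANSI code page:
import sys, locale
print(sys.stdout.encoding)            # encoding Python uses for stdout
print(locale.getpreferredencoding())  # the ANSI code page Windows reports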
#Unicode characters will only display if the current console font contains the
#characters, so use a TrueType font like Lucida Console instead of the CMD default Raster Font.
#The CMD Shell (which runs inside the Windows Console):
#CMD.exe only supports two character encodings: ASCII and Unicode (CMD /A and CMD /U).
#If you need full unicode support use PowerShell. There is still VERY limited support
#for unicode in the CMD shell; piping, redirection and most commands are still ANSI only.
#The only commands that work are DIR, FOR /F and TYPE; this allows reading and writing
#(UTF-16LE / BOM) files and filenames but not much else.
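# For reference, Python's 'utf-16' codec writes a BOM (little-endian on typical
# Windows machines), matching what TYPE and the commands above expect - a minimal
# sketch, where 'out.txt' is a placeholder name:
import io
with io.open('out.txt', 'w', encoding='utf-16') as f:
    f.write(u'unicode test\n')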
#Defaults
#The default code page in the USA is 437, the default in most of Europe is 850. The number
#of supported code pages was greatly increased in Windows 7. For a full list of code pages
#supported on your machine, run NLSINFO (Resource Kit Tools)
#Files saved in Windows Notepad will be in ANSI format by default, but can also be saved as
#Unicode UTF-16LE or UTF-8; unicode files will include a BOM. A BOM will make a batch
#file not executable on Windows, so batch files must be saved as ANSI, not Unicode.
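# A quick BOM check from Python using the constants in the codecs module
# (a minimal sketch; 'has_bom' is my name for it):
import codecs
def has_bom(path):
    with open(path, 'rb') as f:
        head = f.read(4)
    return any(head.startswith(bom) for bom in
               (codecs.BOM_UTF8, codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))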
#Examples:
#View the current code page:
chcp
#Change the code page to Unicode/65001:
chcp 65001
#“Remember that there is no code faster than no code” ~ Taligent's Guide to Designing Programs