morristech
5/10/2018 - 9:03 AM

Python script to fix common CRLF and Unicode problems when working with Visual Studio and git.

Python script to fix common CRLF and Unicode problems when working with Visual Studio and git.

#!/usr/bin/env python

import sys
# Fail fast on interpreters older than 2.6; this check runs before the
# remaining imports below are attempted.
if sys.version_info < (2, 6):
    raise RuntimeError("Python 2.6+ is required.")

import codecs
import logging
import optparse
import os
import unittest

# Byte patterns for a CRLF in UTF-16 little-endian text. When a byte-oriented
# LF -> CRLF conversion is applied to such a file, a stray '\r' byte is
# inserted in front of the '\n' byte, shifting every following code unit by
# one byte. Bytes literals keep these comparable with file content on both
# Python 2.6+ and Python 3.
CRLF_OFFSET_ERROR = b'\r\0\r\n\0'
CRLF_OFFSET_FIX = b'\r\0\n\0'

# The same corruption and its repair for UTF-16 big-endian text.
_CRLF_OFFSET_ERROR_BE = b'\0\r\0\r\n'
_CRLF_OFFSET_FIX_BE = b'\0\r\0\n'


def fsckByteString(content=None, log=None):
  """Return `content` re-encoded as BOM-less UTF-8 bytes.

  Three cases are handled, checked in this order:

  * Content starting with a UTF-16 BOM (either byte order): the shifted
    CRLF byte pattern is repaired if present, then the content is decoded
    as UTF-16 and re-encoded as UTF-8.
  * Content starting with a UTF-8 BOM: the BOM is stripped.
  * Anything else is returned unchanged.

  Args:
    content: The raw bytes of a file (a byte string, not decoded text).
    log: Optional logger-like object providing info/warning/error; when
      falsy, nothing is logged.

  Returns:
    The corrected byte string.

  Raises:
    ValueError: If `content` is None or empty.
    UnicodeError: If content carrying a UTF-16 BOM cannot be decoded.
  """
  if not content:
    # The original raised an undefined name here, which surfaced as a
    # NameError; ValueError is the intended "bad argument" signal.
    raise ValueError('Content must not be empty.')

  # Pick the repair pattern from the BOM actually present in the data rather
  # than from the byte order of the machine running the script.
  if content.startswith(codecs.BOM_UTF16_LE):
    crlf_error, crlf_fix = CRLF_OFFSET_ERROR, CRLF_OFFSET_FIX
  elif content.startswith(codecs.BOM_UTF16_BE):
    crlf_error, crlf_fix = _CRLF_OFFSET_ERROR_BE, _CRLF_OFFSET_FIX_BE
  else:
    crlf_error = crlf_fix = None

  if crlf_error is not None:
    if log: log.info('Detected UTF-16 BOM.')

    if crlf_error in content:
      if log: log.error('Byte shift due to improper line ending conversion!')
      if log: log.info('Correcting line endings...')
      content = content.replace(crlf_error, crlf_fix)

    if log: log.info('Converting to UTF-8...')
    # The "utf16" codec consumes the BOM to select the byte order, so the
    # decoded text (and the UTF-8 result) carries no BOM.
    return content.decode("utf16").encode("utf8")

  if content.startswith(codecs.BOM_UTF8):
    # Logger.warn is a deprecated alias; warning() exists on 2.6+ as well.
    if log: log.warning('Detected unneccessary UTF-8 BOM.')
    if log: log.info('Removing BOM...')

    return content[len(codecs.BOM_UTF8):]

  if log: log.info('No action required.')
  return content


class fscker(unittest.TestCase):
  """Unit tests for fsckByteString covering each supported input encoding."""

  # Explicit unicode literal so that .encode() behaves the same on Python 2
  # and Python 3.
  DATA = u"simple\r\ntest\r\nof\r\nencodings"
  # Every test expects plain UTF-8 bytes without a BOM.
  EXPECTED = DATA.encode("utf8")

  def test_valid_utf8(self):
    """BOM-less UTF-8 input is returned unchanged."""
    value  = self.DATA.encode("utf8")
    actual = fsckByteString(value)
    self.assertEqual(self.EXPECTED, actual)

  def test_valid_utf8_with_bom(self):
    """A leading UTF-8 BOM is stripped."""
    value  = codecs.BOM_UTF8 + self.DATA.encode("utf8")
    actual = fsckByteString(value)
    self.assertEqual(self.EXPECTED, actual)

  def test_valid_utf16_to_utf8(self):
    """Well-formed UTF-16 input (with BOM) is converted to UTF-8."""
    value  = self.DATA.encode("utf16")
    actual = fsckByteString(value)
    self.assertEqual(self.EXPECTED, actual)

  def test_invalid_utf16_to_utf8(self):
    """UTF-16 input damaged by a byte-wise LF -> CRLF conversion is repaired."""
    # Build the little-endian form explicitly so the simulated corruption does
    # not depend on the byte order of the machine running the tests, and use
    # bytes literals because the encoded value is a byte string.
    value  = codecs.BOM_UTF16_LE + self.DATA.encode("utf-16-le")
    value  = value.replace(b'\n', b'\r\n')
    actual = fsckByteString(value)
    self.assertEqual(self.EXPECTED, actual)


if __name__ == '__main__':
  # Command-line entry point: either run the bundled test suite (--test) or
  # check and fix each file named on the command line in place.
  parser = optparse.OptionParser(usage='Usage: %prog [options] file1 [... fileN]')
  parser.add_option('--test', dest='is_testing', action='store_true', default=False, help='run test suite')

  options, files = parser.parse_args()
  logging.basicConfig(format='%(name)s  %(levelname)s: %(message)s', level=logging.INFO)

  # print('') emits one blank line on Python 2 and Python 3 alike; a bare
  # `print` is a no-op expression on Python 3.
  print('')
  if options.is_testing:
    unittest.TextTestRunner(verbosity=2).run(unittest.TestSuite(
      unittest.defaultTestLoader.loadTestsFromTestCase(fscker)
    ))
  elif files:
    for fname in files:
      # One logger per file so every message is prefixed with the file name.
      log = logging.getLogger(os.path.basename(fname))
      try:
        with open(fname, 'rb') as f:
          original = f.read()
        content = fsckByteString(original, log)
        # Only rewrite the file when something was actually fixed, so that
        # untouched files are not rewritten.
        if content != original:
          with open(fname, 'wb') as f:
            f.write(content)
      except Exception as e:
        # Top-level boundary: report the failure and carry on with the
        # remaining files. The `as` form is valid on Python 2.6+ and 3.
        log.error('"%s" could not be checked.', fname)
        log.error(e)
      print('')
  else:
    parser.print_help()