draco1023
11/17/2016 - 3:19 AM

Read text file with encoding detection

Read text file with encoding detection

using System.Collections.Generic;
using System.IO;
using System.Text;

namespace ConsoleApplication1
{
    ///<summary>
    /// EncodingDetector. Detects the Encoding used in byte arrays
    /// or files by testing the start of the file for a Byte Order Mark
    /// (called 'preamble' in .NET).
    ///
    /// Use ReadAllText() to read a file using a detected encoding.
    ///
    /// All encodings that have a preamble are supported.
    ///</summary>
    public class EncodingDetector
    {
        ///<summary>
        /// Helper class to store information about encodings
        /// with a preamble
        ///</summary>
        protected class PreambleInfo
        {
            protected Encoding _encoding;
            protected byte[] _preamble;

            ///<summary>
            /// Property Encoding (Encoding).
            ///</summary>
            public Encoding Encoding
            {
                get { return this._encoding; }
            }

            ///<summary>
            /// Property Preamble (byte[]).
            ///</summary>
            public byte[] Preamble
            {
                get { return this._preamble; }
            }

            ///<summary>
            /// Constructor with preamble and encoding
            ///</summary>
            ///<param name="encoding"></param>
            ///<param name="preamble"></param>
            public PreambleInfo(Encoding encoding, byte[] preamble)
            {
                this._encoding = encoding;
                this._preamble = preamble;
            }
        }

        // The list of encodings with a preamble,
        // sorted longest preamble first.
        protected static SortedList<int, PreambleInfo> _preambles = null;

        // Maximum length of all preamles
        protected static int _maxPreambleLength = 0;

        ///<summary>
        /// Read the contents of a text file as a string. Scan for a preamble first.
        /// If a preamble is found, the corresponding encoding is used.
        /// If no preamble is found, the supplied defaultEncoding is used.
        ///</summary>
        ///<param name="filename">The name of the file to read</param>
        ///<param name="defaultEncoding">The encoding to use if no preamble present</param>
        ///<param name="usedEncoding">The actual encoding used</param>
        ///<returns>The contents of the file as a string</returns>
        public static string ReadAllText(string filename, Encoding defaultEncoding, out Encoding usedEncoding)
        {
            // Read the contents of the file as an array of bytes
            byte[] bytes = File.ReadAllBytes(filename);

            // Detect the encoding of the file:
            usedEncoding = DetectEncoding(bytes);

            // If none found, use the default encoding.
            // Otherwise, determine the length of the encoding markers in the file
            int offset;
            if (usedEncoding == null)
            {
                offset = 0;
                usedEncoding = defaultEncoding;
            }
            else
            {
                offset = usedEncoding.GetPreamble().Length;
            }

            // Now interpret the bytes according to the encoding,
            // skipping the preample (if any)
            return usedEncoding.GetString(bytes, offset, bytes.Length - offset);
        }

        ///<summary>
        /// Detect the encoding in an array of bytes.
        ///</summary>
        ///<param name="bytes"></param>
        ///<returns>The encoding found, or null</returns>
        public static Encoding DetectEncoding(byte[] bytes)
        {
            // Scan for encodings if we haven't done so
            if (_preambles == null)
                ScanEncodings();

            // Try each preamble in turn
            foreach (PreambleInfo info in _preambles.Values)
            {
                // Match all bytes in the preamble
                bool match = true;

                if (bytes.Length >= info.Preamble.Length)
                {
                    for (int i = 0; i < info.Preamble.Length; i++)
                    {
                        if (bytes[i] != info.Preamble[i])
                        {
                            match = false;
                            break;
                        }
                    }
                    if (match)
                    {
                        return info.Encoding;
                    }
                }
            }

            return null;
        }

        ///<summary>
        /// Detect the encoding of a file. Reads just enough of
        /// the file to be able to detect a preamble.
        ///</summary>
        ///<param name="filename">The path name of the file</param>
        ///<returns>The encoding detected, or null if no preamble found</returns>
        public static Encoding DetectEncoding(string filename)
        {
            // Scan for encodings if we haven't done so
            if (_preambles == null)
                ScanEncodings();

            using (FileStream stream = File.OpenRead(filename))
            {
                // Never read more than the length of the file
                // or the maximum preamble length
                long n = stream.Length;

                // No bytes? No encoding!
                if (n == 0)
                    return null;

                // Read the minimum amount necessary
                if (n > _maxPreambleLength)
                    n = _maxPreambleLength;

                byte[] bytes = new byte[n];

                stream.Read(bytes, 0, (int)n);

                // Detect the encoding from the byte array
                return DetectEncoding(bytes);
            }
        }

        ///<summary>
        /// Loop over all available encodings and store those
        /// with a preamble in the _preambles list.
        /// The list is sorted by preamble length,
        /// longest preamble first. This prevents
        /// a short preamble 'masking' a longer one
        /// later in the list.
        ///</summary>
        protected static void ScanEncodings()
        {
            // Create a new sorted list of preambles
            _preambles = new SortedList<int, PreambleInfo>();

            // Loop over all encodings
            foreach (EncodingInfo encodingInfo in Encoding.GetEncodings())
            {
                // Do we have a preamble?
                byte[] preamble = encodingInfo.GetEncoding().GetPreamble();
                if (preamble.Length > 0)
                {
                    // Add it to the collection, inversely sorted by preamble length
                    // (and code page, to keep the keys unique)
                    _preambles.Add(-(preamble.Length * 1000000 + encodingInfo.CodePage),
                       new PreambleInfo(encodingInfo.GetEncoding(), preamble));

                    // Update the maximum preamble length if this one's longer
                    if (preamble.Length > _maxPreambleLength)
                    {
                        _maxPreambleLength = preamble.Length;
                    }
                }
            }
        }
    }
}