tegi
2/11/2018 - 9:37 AM

A markdown-ish parser written in Swift 2.0

A markdown-ish parser written in Swift 2.0

/*
  Markdown-ish parser. Regular expressions are not allowed in here!

  The parser works in several stages:

  1. split up the text into lines
  2. combine lines into logical blocks
  3. tokenize the contents of the blocks

  The result of this process is a tree structure that describes the contents of
  the file. This tree can then be rendered to HTML, for example.
 */

import Foundation

// MARK: - String Methods

extension String {
  func replace(s1: String, with s2: String) -> String {
    return self.stringByReplacingOccurrencesOfString(s1, withString: s2)
  }

  /*
   * HTML escapes a string.
   */
  func escape() -> String {
    var s = self
    s = s.replace("&", with: "&")
    s = s.replace("\"", with: """)
    s = s.replace("'", with: "'")
    s = s.replace("<", with: "&lt;")
    s = s.replace(">", with: "&gt;")
    return s
  }
}

// MARK: - Types

struct Markdown {
  /*
   * The text from a Fragment is converted to a stream of Tokens. This allows
   * for the separation of actual text from the control characters that modify
   * the appearance of the text.
   */
  enum Token {
    case Spacing                 // any amount of whitespace
    case Text(text: String)      // words
    case Escape(symbol: String)  // --, <, >, ', & and so on
    case Open(symbol: String)    // ", *, **, ~~, ` spans
    case Close(symbol: String)
    case Link(fragment: Fragment, url: String)
    case Image(caption: String, url: String)
  }

  /*
   * Roughly speaking, each line in the input document corresponds to a fragment,
   * but the fragment will have any whitespace trimmed off. The last newline is
   * also stripped. If the line starts a new block, such as `- list item` then
   * the `-` is also stripped from the fragment.
   *
   * Note: Source code is not tokenized at the moment, but could be in the future
   * to support syntax highlighting. (Note: .Code blocks consist of only a single
   * fragment, so such fragments may include newlines.)
   */
  enum Fragment {
    case Text(tokens: [Token])
    case Code(code: String)
  }

  /*
   * The possible types of top-level blocks.
   */
  enum BlockType {
    case Empty                     // just for parsing
    case Header(level: Int)        // #
    case Text                      // regular paragraph of text
    case Quote                     // >
    case Code(language: String)    // ```language
    case CodeIndented              // 4 spaces or tab
    case ListItem(ordered: Bool)   // 1. or -
  }

  /*
   * Describes a top-level block. The lines from the input document are grouped
   * into such blocks. Each block will have one or more Fragments.
   */
  struct Block {
    var type: BlockType
    var fragments: [Fragment] = []

    init(type: BlockType) {
      self.type = type
    }
  }

  private let input: String          // the Markdown text
  private var blocks: [Block] = []   // the top-level blocks
}

// MARK: - Public API

extension Markdown {
  init(string input: String) {
    self.input = input
    blocks = parseBlocks(parseLines())
  }

  func tree() -> [Block] {
    return blocks
  }

  mutating func removeBlockAtIndex(index: Int) {
    blocks.removeAtIndex(index)
  }
}

// MARK: - Workarounds for Swift Issues

/*
 * For debugging only. Letting Swift do this automatically doesn't work
 * very dependably yet (Xcode 7 beta 5).
 */
extension Markdown.Token: CustomStringConvertible {
  var description: String {
    switch self {
    case .Spacing: return "Spacing"
    case .Text(let text): return text
    case .Escape(let symbol): return "Escape(\(symbol))"
    case .Open(let symbol): return "Open(\(symbol))"
    case .Close(let symbol): return "Close(\(symbol))"
    case .Link(let fragment, let url): return "Link(\(fragment),\(url))"
    case .Image(let caption, let url): return "Image(\(caption),\(url))"
    }
  }
}

extension Markdown.BlockType: CustomStringConvertible {
  var description: String {
    switch self {
    case .Empty: return "@Empty"
    case .Header(let level): return "@Header:\(level)"
    case .Text: return "@Text"
    case .Code(let language): return "@Code:\(language)"
    case .CodeIndented: return "@CodeIndented"
    case .Quote: return "@Quote"
    case .ListItem(let ordered): return "@ListItem:\(ordered)"
    }
  }
}

/*
 * These helper methods are necessary because `if case` cannot be combined with
 * other conditions.
 *
 * In Swift 2.0 you can't write: if case .Quote = foo && bar { ... }
 *                           or: if !case .Empty
 *                           or: if case .Empty || case .Header || case .Code
 *
 * But at least you can now write `if foo.isQuote() && bar { ... }`
 */
private extension Markdown.BlockType {
  func isEmpty() -> Bool {
    if case .Empty = self { return true } else { return false }
  }

  func isCode() -> Bool {
    if case .Code = self { return true } else { return false }
  }

  func isQuote() -> Bool {
    if case .Quote = self { return true } else { return false }
  }

  func isListItem() -> Bool {
    if case .ListItem = self { return true } else { return false }
  }

  func isOrderedList() -> Bool {
    if case ListItem(let ordered) = self { return ordered } else { return false }
  }

  func isEmpty_Header_Code() -> Bool {
    switch self {
    case .Empty, .Header, .Code: return true
    default: return false
    }
  }

  func isText_CodeIndented() -> Bool {
    switch self {
    case .Text, .CodeIndented: return true
    default: return false
    }
  }

  func shouldTrimWhitespace() -> Bool {
    switch self {
    case .Code, .CodeIndented: return false
    default: return true
    }
  }
}

// MARK: - Supporting Methods

private extension Character {
  func isWhitespace() -> Bool {
    return self == " " || self == "\t" || self == "\r"
  }

  func isWhitespaceOrNewline() -> Bool {
    return isWhitespace() || self == "\n"
  }
}

private extension Markdown {
  func eatLeadingWhitespace(startIndex: String.Index, _ endIndex: String.Index) -> String.Index {
    for var i = startIndex; i < endIndex; i = i.successor() {
      if !input[i].isWhitespaceOrNewline() { return i }
    }
    return endIndex
  }

  func eatTrailingWhitespace(startIndex: String.Index, _ endIndex: String.Index) -> String.Index {
    for var i = endIndex.predecessor(); i >= startIndex; i = i.predecessor() {
      if !input[i].isWhitespaceOrNewline() { return i.successor() }
    }
    return startIndex
  }
}

// MARK: - Splitting Into Lines

private extension Markdown {
  /*
   * Determine the indices in the input document at which new lines begin.
   */
  func parseLines() -> [String.Index] {
    var lines: [String.Index] = []
    var i = input.startIndex
    while i < input.endIndex {
      let c = input[i]
      i = i.successor()

      // For convenience, the end of the string is recorded twice so the block
      // scanning logic doesn't need a separate check for end-of-text. When the
      // end is reached, it simply sees one final .Empty block.
      // It may not be immediately obvious, but the check for endIndex here
      // makes sure this happens whether the text ends with a newline or not.
      if c == "\n" || i == input.endIndex {
        lines.append(i)
      }
    }
    lines.append(i)  // add the endIndex again
    return lines
  }
}

// MARK: - Top-Level Blocks

private extension Markdown {
  /*
   * Determine the top-level blocks in the file.
   */
  func parseBlocks(lines: [String.Index]) -> [Block] {
    var blocks: [Block] = []
    var lineStart = input.startIndex
    var fragmentStart = lineStart
    var fragmentEnd = lineStart
    var block = Block(type: .Empty)
    var count = 0

    for lineEnd in lines {
      // This looks at the next line. If the type of this line is different from
      // the current block, then we may need to end the block and make a new one.
      // Exactly how depends on the particular block type. The "adjusted" start
      // index is for skipping the symbol that identifies the line.
      var (nextType, adjustedLineStart) = identify(lineStart, lineEnd)

      // A fragment should not have leading or trailing whitespace or a newline.
      if nextType.shouldTrimWhitespace() {
        adjustedLineStart = eatLeadingWhitespace(adjustedLineStart, lineEnd)
      }
      let adjustedLineEnd = eatTrailingWhitespace(adjustedLineStart, lineEnd)

      //print("LINE '" + input.substringWithRange(lineStart ..< lineEnd) + "' is type \(nextType)")

      // Because the endIndex appears twice in the array, we can easily detect
      // whether scanning has reached the end of the input. (By the way, the 
      // type of that last "line" is .Empty, so when checking for the Empty state
      // we don't need to look at endOfInput also.)
      let endOfInput = (adjustedLineStart == input.endIndex)

      // It may be a bit weird to define inner functions here, but this allows
      // them to use variables such as `nextType` and `block`, without passing
      // those as parameters.

      func beginNewBlock() {
        block = Block(type: nextType)
        fragmentStart = adjustedLineStart
        count = 0
      }

      func addTextFragment() {
        let tokenized = tokenize(startIndex: fragmentStart, endIndex: fragmentEnd)
        let fragment = Fragment.Text(tokens: tokenized)
        block.fragments.append(fragment)
      }

      func addCodeFragment() {
        let s = input.substringWithRange(fragmentStart ..< fragmentEnd)
        let fragment = Fragment.Code(code: s)
        block.fragments.append(fragment)
      }

      func finishBlock() {
        blocks.append(block)
      }

      // The current fragment always refers to the previous line(s). It is given
      // by `fragmentStart` and `lineStart`, which is the end of the previous line
      // and also the start of this one. So we don't immediately add new fragments,
      // we always want to look at the next line first.

      switch block.type {
      case .Empty:
        if !nextType.isEmpty() {
          beginNewBlock()
        }

      case .Header:
        // A header is always just one line, so we can immediately add this block.
        addTextFragment()
        finishBlock()
        beginNewBlock()

      case .Text:
        addTextFragment()

        // A text block ends when the next line is empty, a header, or code.
        if nextType.isEmpty_Header_Code() {
          finishBlock()
          beginNewBlock()
        } else {
          fragmentStart = lineStart
        }

      case .Quote:
        // Each line in a quote is added as a new fragment, and we strip off
        // the leading > character.
        addTextFragment()

        // A quote ends when the next line is empty, a header, or code.
        if nextType.isEmpty_Header_Code() {
          blocks.append(block)
          beginNewBlock()
        } else {
          // Any other type of line also gets added to the quote. If it
          // starts with >, we strip that off.
          if case .Quote = nextType {
            fragmentStart = adjustedLineStart
          } else {
            fragmentStart = lineStart
          }
        }

      case .Code:
        // A code block ends after a closing line of ``` backticks.
        if nextType.isCode() || endOfInput {
          addCodeFragment()
          finishBlock()
          block = Block(type: .Empty)
        }

      case .CodeIndented:
        // Each line in an indented code block is added as a new fragment,
        // allowing us to strip off the leading spaces/tabs.
        if count == 0 {
          addCodeFragment()
        }

        // If the next line is empty, keep going. If the empty line(s) is/are
        // followed by more code, then we'll insert empty fragments.
        if nextType.isEmpty() && !endOfInput {
          ++count
          fragmentStart = lineEnd.predecessor()
        } else if case .CodeIndented = nextType {
          // If the next line is also an indented code block, then keep going.
          // If we've seen empty lines, then add an empty fragment for each line.
          if count > 0 {
            fragmentEnd = fragmentStart
            for _ in 1...count { addCodeFragment() }
            count = 0
          }
          fragmentStart = adjustedLineStart
        } else {
          // If the next line is any other kind of block, then the indented 
          // code block has ended.
          finishBlock()
          beginNewBlock()
        }

      case .ListItem:
        // Each line in a list item is added as a new fragment, and we strip 
        // off the leading - character.
        addTextFragment()

        // If the next line is text or indented code, then interpret this as
        // another fragment that also belongs to this list item.
        if nextType.isText_CodeIndented() {
          fragmentStart = adjustedLineStart
        } else {
          blocks.append(block)
          beginNewBlock()
        }
      }

      lineStart = lineEnd
      fragmentEnd = adjustedLineEnd
    }
    return blocks
  }
}

// MARK: - Line Identification

private extension Markdown {
  typealias LineType = (BlockType, String.Index)

  /*
   * Scans the beginning of the line in order to identify what sort of line
   * this is. 
   *
   * Returns a new String.Index that points at the beginning of the actual text,
   * having skipped the identifying characters (#, .1, -) but not necessarily
   * any whitespace.
   */
  func identify(startIndex: String.Index, _ endIndex: String.Index) -> LineType {
    var i = startIndex

    func header() -> LineType {
      let textStart = i
      var count = 1
      i = i.successor()
      for ; i < endIndex; i = i.successor() {
        if input[i] == "#" {
          ++count
        } else {
          let headerStart = i
          i = i.successor()
          for ; i < endIndex; i = i.successor() {
            if !input[i].isWhitespaceOrNewline() {
              return (.Header(level: count), headerStart)
            }
          }
          break
        }
      }
      return (.Text, textStart)  // a # by itself
    }

    func code() -> LineType {
      let textStart = i
      i = i.successor()
      if i < endIndex && input[i] == "`" {
        i = i.successor()
        if i < endIndex && input[i] == "`" {
          i = i.successor()
          let j = endIndex.predecessor()  // not true for very last line if no newline
          if i < j {
            return (.Code(language: input.substringWithRange(i ..< j)), endIndex)
          } else {
            return (.Code(language: ""), endIndex)
          }
        }
      }
      return (.Text, textStart)
    }

    func unorderedListItem() -> LineType {
      let textStart = i
      i = i.successor()
      if i < endIndex && input[i].isWhitespace() {
        return (.ListItem(ordered: false), i)
      }
      return (.Text, textStart)
    }

    func orderedListItem() -> LineType {
      let textStart = i
      i = i.successor()
      if i < endIndex && input[i] == "." {
        i = i.successor()
        if i < endIndex && input[i].isWhitespace() {
          return (.ListItem(ordered: true), i.successor())
        }
      }
      return (.Text, textStart)
    }

    func identifier() -> LineType {
      switch input[i] {
      case "#":
        return header()
      case ">":
        return (.Quote, i.successor())
      case "`":
        return code()
      case "-":
        return unorderedListItem()
      case "1", "2", "3", "4", "5", "6", "7", "8", "9":
        return orderedListItem()
      default:
        return (.Text, i)
      }
    }

    func whitespace() -> LineType {
      var count = 0
      var codeStart = i
      for ; i < endIndex; i = i.successor() {
        switch input[i] {
        case "\n":
          break
        case " ", "\r":
          count += 1
        case "\t":
          count += 4
        default:
          if count >= 4 {
            return (.CodeIndented, codeStart)
          } else {
            return identifier()
          }
        }
        if count == 4 {
          codeStart = i.successor()
        }
      }
      return (.Empty, endIndex)
    }

    if i == endIndex || input[i] == "\n" {
      return (.Empty, endIndex)
    } else {
      switch input[i] {
      case " ", "\t", "\r":
        return whitespace()
      default:
        return identifier()
      }
    }
  }
}

// MARK: - Tokenization of Fragments

private extension Markdown {
  /*
   * The tokenizer takes in a text fragment and outputs a stream of `Token` objects.
   *
   * For example, the input:
   *    aaa  **x**
   *    I've
   *
   * becomes the following stream of tokens: Text(aaa) Spacing Open(bold) Text(x)
   * Close(Bold) Newline Text(I) Escape(') Text(ve)
   *
   * Multiple spaces get combined into a single token.
   */
  func tokenize(startIndex startIndex: String.Index, endIndex: String.Index) -> [Token] {
    var tokens = [Token]()
    var i = startIndex      // the lookahead character

    func addTextToken(s: String) {
      tokens.append(Token.Text(text: s))
    }

    func addEscapeToken(s: String) {
      tokens.append(Token.Escape(symbol: s))
    }

    func spacing() {
      for ; i < endIndex; i = i.successor() {
        if !input[i].isWhitespace() { break }
      }
      tokens.append(Token.Spacing)
    }

    func escapeDash() {
      if i < endIndex && input[i] == "-" {
        addEscapeToken("--")
        i = i.successor()
      } else {
        addTextToken("-")
      }
    }

    func escapeEllipsis() {
      if i < endIndex && input[i] == "." {
        i = i.successor()
        if i < endIndex && input[i] == "." {
          i = i.successor()
          addEscapeToken("...")
        } else {
          addTextToken("..")
        }
      } else {
        addTextToken(".")
      }
    }

    func escape() {
      let c = input[i]
      i = i.successor()
      if c == "-" {
        escapeDash()
      } else if c == "." {
        escapeEllipsis()
      } else {
        addEscapeToken(String(c))
      }
    }

    var seenOpen = [String: Bool]()

    func addOpenOrCloseToken(symbol: String, strict: Bool = false) {
      // The official rules are that *word* and *word1 word2* will work but
      // not * word *, *word *, or * word*. To keep things simple, we only
      // require that the opening * is followed by non-whitespace; where the
      // closing * is doesn't matter.
      if let open = seenOpen[symbol] where open {
        seenOpen[symbol] = false
        tokens.append(Token.Close(symbol: symbol))
      } else if strict && (i == endIndex || input[i].isWhitespace()) {
        addTextToken(symbol)
      } else {
        seenOpen[symbol] = true
        tokens.append(Token.Open(symbol: symbol))
      }
    }

    func strikethrough() {
      if i < endIndex && input[i] == "~" {
        i = i.successor()
        addOpenOrCloseToken("~~")
      } else {
        addTextToken("~")
      }
    }

    // Note that italics, bold, etc, do no work across multiple lines.
    // That happens because we tokenize each fragment individually and we reset
    // the tokenization state with each new fragment.
    func italicsOrBold() {
      if i < endIndex && input[i] == "*" {
        i = i.successor()
        addOpenOrCloseToken("**", strict: true)
      } else {
        addOpenOrCloseToken("*", strict: true)
      }
    }

    func span() {
      let c = input[i]
      i = i.successor()
      if c == "*" {
        italicsOrBold()
      } else if c == "~" {
        strikethrough()
      } else {
        addOpenOrCloseToken(String(c))
      }
    }

    func backslash() {
      i = i.successor()
      if i < endIndex {
        let c = input[i]
        if c == "*" || c == "[" || c == "]" {
          addTextToken(String(c))
          i = i.successor()
          return
        }
      }
      addTextToken("\\")
    }

    func parseLink() -> ((String.Index, String.Index), (String.Index, String.Index))? {
      let textStart = i.successor()

      // Loop until we find ] followed by ( followed by ). It's not a real link
      // unless it has text and a valid URL, but we're not that picky.
      // To be honest, this is where a regexp is the simpler solution. ;-)
      for ; i < endIndex; i = i.successor() {
        if input[i] == "]" {
          let textEnd = i
          i = i.successor()
          if i < endIndex && input[i] == "(" {
            i = i.successor()
            let urlStart = i
            for ; i < endIndex; i = i.successor() {
              if input[i] == ")" {
                let urlEnd = i
                i = i.successor()
                return ((eatLeadingWhitespace(textStart, textEnd),
                         eatTrailingWhitespace(textStart, textEnd)),
                        (eatLeadingWhitespace(urlStart, urlEnd),
                         eatTrailingWhitespace(urlStart, urlEnd)))
              }
            }
          }
        }
      }

      // This does not appear to be a validly formatted link.
      addTextToken("[")
      i = textStart
      return nil
    }

    func link() {
      if let ((textStart, textEnd), (urlStart, urlEnd)) = parseLink() {
        let tokenized = tokenize(startIndex: textStart, endIndex: textEnd)
        let fragment = Fragment.Text(tokens: tokenized)
        let url = input.substringWithRange(urlStart ..< urlEnd)
        let token = Token.Link(fragment: fragment, url: url)
        tokens.append(token)
      }
    }

    func image() {
      i = i.successor()
      if i < endIndex && input[i] == "[" {
        if let ((captionStart, captionEnd), (urlStart, urlEnd)) = parseLink() {
          let caption = input.substringWithRange(captionStart ..< captionEnd)
          let url = input.substringWithRange(urlStart ..< urlEnd)
          let token = Token.Image(caption: caption, url: url)
          tokens.append(token)
        }
      } else {
        addTextToken("!")
      }
    }

    func endsWord(c: Character) -> Bool {
      return c == " "  || c == "\t" || c == "\n" || c == "\r" ||                          // whitespace
             c == "'"  || c == "<"  || c == ">"  || c == "&"  || c == "-" || c == "." ||  // escapes
             c == "`"  || c == "\"" || c == "*"  || c == "~"  ||                          // span
             c == "\\" || c == "["  || c == "!"
    }

    func word() {
      let wordStart = i
      for ; i < endIndex; i = i.successor() {
        if endsWord(input[i]) { break }
      }
      addTextToken(input.substringWithRange(wordStart ..< i))
    }

    while i < endIndex {
      switch input[i] {
      case " ", "\t", "\r":
        spacing()
      case "'", "<", ">", "&", "-", ".":
        escape()
      case "`", "\"", "*", "~":
        span()
      case "\\":
        backslash()
      case "[":
        link()
      case "!":
        image()
      case "\n":
        fatalError("fragments should not contain newlines")
      default:
        word()
      }
    }
    return tokens
  }
}

// MARK: - HTML Rendering

private let escapeTable = [
  "'": "&#8217;",
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "-": "&#8211;",
  "--": "&#8212;",
  "...": "&#8230;",
]

private let openTable = [
  "`": "<code>",
  "\"": "&#8220;",
  "*": "<em>",
  "**": "<strong>",
  "~~": "<del>",
]

private let closeTable = [
  "`": "</code>",
  "\"": "&#8221;",
  "*": "</em>",
  "**": "</strong>",
  "~~": "</del>",
]

private extension Markdown.Token {
  func toHTML() -> String {
    switch self {
    case .Spacing: return " "
    case .Text(let text): return text
    case .Escape(let symbol): return escapeTable[symbol]!
    case .Open(let symbol): return openTable[symbol]!
    case .Close(let symbol): return closeTable[symbol]!
    case .Link(let fragment, let url):
      return "<a href=\"\(url.escape())\">" + fragment.toHTML() + "</a>"
    case .Image(let caption, let url):
      return "<img src=\"\(url.escape())\" alt=\"\(caption.escape())\">"
    }
  }
}

private extension Markdown.Fragment {
  func toHTML() -> String {
    switch self {
    case .Text(let tokens):
      var s = ""
      for token in tokens {
        s += token.toHTML()
      }
      return s
    case .Code(let string):
      return string.escape()
    }
  }

  // If the text block contains only one image token, then we turn it into
  // a <figure> instead of a <p> paragraph.
  func isImageOnly() -> Bool {
    if case .Text(let tokens) = self where tokens.count == 1, case .Image = tokens[0] {
      return true
    } else {
      return false
    }
  }
}

private extension Markdown.Block {
  func isImageOnly() -> Bool {
    return fragments.count == 1 && fragments.first!.isImageOnly()
  }

  func formatFragments(separator: String = "<br>\n") -> String {
    precondition(fragments.count > 0)
    if fragments.count == 1 {
      return fragments.first!.toHTML()
    } else {
      return separator.join(fragments.map { $0.toHTML() })
    }
  }

  func toHTML() -> String {
    switch type {
    case .Empty:
      fatalError("should not happen")

    case .Header(let level):
      return "<h\(level)>" + formatFragments() + "</h\(level)>\n\n"

    case .Text:
      if isImageOnly() {
        return "<figure class=\"image\">" + formatFragments() + "</figure>\n\n"
      } else {
        return "<p>" + formatFragments("\n") + "</p>\n\n"
      }

    case .Quote:
      return "<p>" + formatFragments() + "</p>\n"

    case .Code(let language):
      var s = "<figure class=\"code\"><pre><code"
      if !language.isEmpty {
        s += " class=\"\(language.escape())\""
      }
      s += ">" + formatFragments() + "</code></pre></figure>\n\n"
      return s

    case .CodeIndented:
      return "<figure class=\"code\"><pre><code>" + formatFragments("\n") + "</code></pre></figure>\n\n"

    case .ListItem:
      return "<li>" + formatFragments() + "</li>\n"
    }
  }
}

extension Markdown {
  func renderHTML() -> String {
    var s = ""
    var lastType = BlockType.Empty
    var wasOrderedList = false

    for block in blocks {
      if !lastType.isQuote() && block.type.isQuote() {
        s += "<blockquote>\n"
      } else if lastType.isQuote() && !block.type.isQuote() {
        s += "</blockquote>\n\n"
      }

      if !lastType.isListItem() && block.type.isListItem() {
        wasOrderedList = block.type.isOrderedList()
        s += wasOrderedList ? "<ol>\n" : "<ul>\n"
      } else if lastType.isListItem() && !block.type.isListItem() {
        s += wasOrderedList ? "</ol>\n\n" : "</ul>\n\n"
      }

      s += block.toHTML()
      lastType = block.type
    }
    return s
  }
}

Barfdown: a Markdown-ish Parser Written in Swift

Goals for this project:

  • Parse a simplified version of Markdown that is good enough for writing my blog posts.
  • Be reasonably efficient. This means the parser shouldn't copy substrings around if not necessary. This is done by storing all the elements as indexes into the original text.
  • Be small and therefore be easy to understand.
  • No regular expressions. They are the lazy person's solution to parsing. ;-)

This is just a toy project for me to experiment with writing parsers in Swift. Because why not?

There may be bugs.

Supported Markdown

The Markdown tags that are currently supported:

# Headers

*italics*
**bold**
~~strikethrough~~
`code`

1. numbered
2. list

- unordered
- list

> quote

[Link text](http://url)

![Alt text](images/image.png)

    source code (indented 1 tab or 4 spaces)

 ```language

\*   literal asterisk
\[   literal [
\]   literal ]

Not supported are:

  • Tables
  • Horizontal rules
  • Nested lists
  • Unordered lists starting with *
  • Headers that are underlined
  • Line breaks (line ending in two spaces)
  • Probably lots of other stuff...

This could really do with a good test suite. ;-)

How to use it

Create a new Markdown instance and give it a String. The parser creates a tree structure that describes the Markdown document. You can either step through that tree yourself or simply call renderHTML() to convert it to HTML.

if let data = NSData(contentsOfFile: path) {
  if let text = NSString(data: data, encoding: NSUTF8StringEncoding) {
    let m = Markdown(string: text as String)
    let s = m.renderHTML()
    print("<!DOCTYPE html><html><head><meta charset=\"utf-8\"/></head><body>\n")
    print(s, appendNewline: false)
    print("</body></html>\n")
  }
}

Why not regexps?

They definitely make parsing easier but I don't like throwing a handful of regexps at a parsing problem.

When you use a regular expression, it is turned into a state machine by the regex parser. Here I've basically "unrolled" all those state machines by hand.