A markdown-ish parser written in Swift 2.0
/*
Markdown-ish parser. Regular expressions are not allowed in here!
The parser works in several stages:
1. split up the text into lines
2. combine lines into logical blocks
3. tokenize the contents of the blocks
The result of this process is a tree structure that describes the contents of
the file. This tree can then be rendered to HTML, for example.
*/
import Foundation
// MARK: - String Methods
extension String {
func replace(s1: String, with s2: String) -> String {
return self.stringByReplacingOccurrencesOfString(s1, withString: s2)
}
/*
* HTML escapes a string.
*/
func escape() -> String {
var s = self
s = s.replace("&", with: "&")
s = s.replace("\"", with: """)
s = s.replace("'", with: "'")
s = s.replace("<", with: "<")
s = s.replace(">", with: ">")
return s
}
}
// MARK: - Types
struct Markdown {
/*
* The text from a Fragment is converted to a stream of Tokens. This allows
* for the separation of actual text from the control characters that modify
* the appearance of the text.
*/
enum Token {
case Spacing // any amount of whitespace
case Text(text: String) // words
case Escape(symbol: String) // --, <, >, ', & and so on
case Open(symbol: String) // ", *, **, ~~, ` spans
case Close(symbol: String)
case Link(fragment: Fragment, url: String)
case Image(caption: String, url: String)
}
/*
* Roughly speaking, each line in the input document corresponds to a fragment,
* but the fragment will have any whitespace trimmed off. The last newline is
* also stripped. If the line starts a new block, such as `- list item` then
* the `-` is also stripped from the fragment.
*
* Note: Source code is not tokenized at the moment, but could be in the future
* to support syntax highlighting. (Note: .Code blocks consist of only a single
* fragment, so such fragments may include newlines.)
*/
enum Fragment {
case Text(tokens: [Token])
case Code(code: String)
}
/*
* The possible types of top-level blocks.
*/
enum BlockType {
case Empty // just for parsing
case Header(level: Int) // #
case Text // regular paragraph of text
case Quote // >
case Code(language: String) // ```language
case CodeIndented // 4 spaces or tab
case ListItem(ordered: Bool) // 1. or -
}
/*
* Describes a top-level block. The lines from the input document are grouped
* into such blocks. Each block will have one or more Fragments.
*/
struct Block {
var type: BlockType
var fragments: [Fragment] = []
init(type: BlockType) {
self.type = type
}
}
private let input: String // the Markdown text
private var blocks: [Block] = [] // the top-level blocks
}
// MARK: - Public API
extension Markdown {
init(string input: String) {
self.input = input
blocks = parseBlocks(parseLines())
}
func tree() -> [Block] {
return blocks
}
mutating func removeBlockAtIndex(index: Int) {
blocks.removeAtIndex(index)
}
}
// MARK: - Workarounds for Swift Issues
/*
* For debugging only. Letting Swift do this automatically doesn't work
* very dependably yet (Xcode 7 beta 5).
*/
extension Markdown.Token: CustomStringConvertible {
var description: String {
switch self {
case .Spacing: return "Spacing"
case .Text(let text): return text
case .Escape(let symbol): return "Escape(\(symbol))"
case .Open(let symbol): return "Open(\(symbol))"
case .Close(let symbol): return "Close(\(symbol))"
case .Link(let fragment, let url): return "Link(\(fragment),\(url))"
case .Image(let caption, let url): return "Image(\(caption),\(url))"
}
}
}
extension Markdown.BlockType: CustomStringConvertible {
var description: String {
switch self {
case .Empty: return "@Empty"
case .Header(let level): return "@Header:\(level)"
case .Text: return "@Text"
case .Code(let language): return "@Code:\(language)"
case .CodeIndented: return "@CodeIndented"
case .Quote: return "@Quote"
case .ListItem(let ordered): return "@ListItem:\(ordered)"
}
}
}
/*
* These helper methods are necessary because `if case` cannot be combined with
* other conditions.
*
* In Swift 2.0 you can't write: if case .Quote = foo && bar { ... }
* or: if !case .Empty
* or: if case .Empty || case .Header || case .Code
*
* But at least you can now write `if foo.isQuote() && bar { ... }`
*/
private extension Markdown.BlockType {
func isEmpty() -> Bool {
if case .Empty = self { return true } else { return false }
}
func isCode() -> Bool {
if case .Code = self { return true } else { return false }
}
func isQuote() -> Bool {
if case .Quote = self { return true } else { return false }
}
func isListItem() -> Bool {
if case .ListItem = self { return true } else { return false }
}
func isOrderedList() -> Bool {
if case ListItem(let ordered) = self { return ordered } else { return false }
}
func isEmpty_Header_Code() -> Bool {
switch self {
case .Empty, .Header, .Code: return true
default: return false
}
}
func isText_CodeIndented() -> Bool {
switch self {
case .Text, .CodeIndented: return true
default: return false
}
}
func shouldTrimWhitespace() -> Bool {
switch self {
case .Code, .CodeIndented: return false
default: return true
}
}
}
// MARK: - Supporting Methods
private extension Character {
func isWhitespace() -> Bool {
return self == " " || self == "\t" || self == "\r"
}
func isWhitespaceOrNewline() -> Bool {
return isWhitespace() || self == "\n"
}
}
private extension Markdown {
func eatLeadingWhitespace(startIndex: String.Index, _ endIndex: String.Index) -> String.Index {
for var i = startIndex; i < endIndex; i = i.successor() {
if !input[i].isWhitespaceOrNewline() { return i }
}
return endIndex
}
func eatTrailingWhitespace(startIndex: String.Index, _ endIndex: String.Index) -> String.Index {
for var i = endIndex.predecessor(); i >= startIndex; i = i.predecessor() {
if !input[i].isWhitespaceOrNewline() { return i.successor() }
}
return startIndex
}
}
// MARK: - Splitting Into Lines
private extension Markdown {
/*
* Determine the indices in the input document at which new lines begin.
*/
func parseLines() -> [String.Index] {
var lines: [String.Index] = []
var i = input.startIndex
while i < input.endIndex {
let c = input[i]
i = i.successor()
// For convenience, the end of the string is recorded twice so the block
// scanning logic doesn't need a separate check for end-of-text. When the
// end is reached, it simply sees one final .Empty block.
// It may not be immediately obvious, but the check for endIndex here
// makes sure this happens whether the text ends with a newline or not.
if c == "\n" || i == input.endIndex {
lines.append(i)
}
}
lines.append(i) // add the endIndex again
return lines
}
}
// MARK: - Top-Level Blocks
private extension Markdown {
/*
* Determine the top-level blocks in the file.
*/
func parseBlocks(lines: [String.Index]) -> [Block] {
var blocks: [Block] = []
var lineStart = input.startIndex
var fragmentStart = lineStart
var fragmentEnd = lineStart
var block = Block(type: .Empty)
var count = 0
for lineEnd in lines {
// This looks at the next line. If the type of this line is different from
// the current block, then we may need to end the block and make a new one.
// Exactly how depends on the particular block type. The "adjusted" start
// index is for skipping the symbol that identifies the line.
var (nextType, adjustedLineStart) = identify(lineStart, lineEnd)
// A fragment should not have leading or trailing whitespace or a newline.
if nextType.shouldTrimWhitespace() {
adjustedLineStart = eatLeadingWhitespace(adjustedLineStart, lineEnd)
}
let adjustedLineEnd = eatTrailingWhitespace(adjustedLineStart, lineEnd)
//print("LINE '" + input.substringWithRange(lineStart ..< lineEnd) + "' is type \(nextType)")
// Because the endIndex appears twice in the array, we can easily detect
// whether scanning has reached the end of the input. (By the way, the
// type of that last "line" is .Empty, so when checking for the Empty state
// we don't need to look at endOfInput also.)
let endOfInput = (adjustedLineStart == input.endIndex)
// It may be a bit weird to define inner functions here, but this allows
// them to use variables such as `nextType` and `block`, without passing
// those as parameters.
func beginNewBlock() {
block = Block(type: nextType)
fragmentStart = adjustedLineStart
count = 0
}
func addTextFragment() {
let tokenized = tokenize(startIndex: fragmentStart, endIndex: fragmentEnd)
let fragment = Fragment.Text(tokens: tokenized)
block.fragments.append(fragment)
}
func addCodeFragment() {
let s = input.substringWithRange(fragmentStart ..< fragmentEnd)
let fragment = Fragment.Code(code: s)
block.fragments.append(fragment)
}
func finishBlock() {
blocks.append(block)
}
// The current fragment always refers to the previous line(s). It is given
// by `fragmentStart` and `lineStart`, which is the end of the previous line
// and also the start of this one. So we don't immediately add new fragments,
// we always want to look at the next line first.
switch block.type {
case .Empty:
if !nextType.isEmpty() {
beginNewBlock()
}
case .Header:
// A header is always just one line, so we can immediately add this block.
addTextFragment()
finishBlock()
beginNewBlock()
case .Text:
addTextFragment()
// A text block ends when the next line is empty, a header, or code.
if nextType.isEmpty_Header_Code() {
finishBlock()
beginNewBlock()
} else {
fragmentStart = lineStart
}
case .Quote:
// Each line in a quote is added as a new fragment, and we strip off
// the leading > character.
addTextFragment()
// A quote ends when the next line is empty, a header, or code.
if nextType.isEmpty_Header_Code() {
blocks.append(block)
beginNewBlock()
} else {
// Any other type of line also gets added to the quote. If it
// starts with >, we strip that off.
if case .Quote = nextType {
fragmentStart = adjustedLineStart
} else {
fragmentStart = lineStart
}
}
case .Code:
// A code block ends after a closing line of ``` backticks.
if nextType.isCode() || endOfInput {
addCodeFragment()
finishBlock()
block = Block(type: .Empty)
}
case .CodeIndented:
// Each line in an indented code block is added as a new fragment,
// allowing us to strip off the leading spaces/tabs.
if count == 0 {
addCodeFragment()
}
// If the next line is empty, keep going. If the empty line(s) is/are
// followed by more code, then we'll insert empty fragments.
if nextType.isEmpty() && !endOfInput {
++count
fragmentStart = lineEnd.predecessor()
} else if case .CodeIndented = nextType {
// If the next line is also an indented code block, then keep going.
// If we've seen empty lines, then add an empty fragment for each line.
if count > 0 {
fragmentEnd = fragmentStart
for _ in 1...count { addCodeFragment() }
count = 0
}
fragmentStart = adjustedLineStart
} else {
// If the next line is any other kind of block, then the indented
// code block has ended.
finishBlock()
beginNewBlock()
}
case .ListItem:
// Each line in a list item is added as a new fragment, and we strip
// off the leading - character.
addTextFragment()
// If the next line is text or indented code, then interpret this as
// another fragment that also belongs to this list item.
if nextType.isText_CodeIndented() {
fragmentStart = adjustedLineStart
} else {
blocks.append(block)
beginNewBlock()
}
}
lineStart = lineEnd
fragmentEnd = adjustedLineEnd
}
return blocks
}
}
// MARK: - Line Identification
private extension Markdown {
typealias LineType = (BlockType, String.Index)
/*
* Scans the beginning of the line in order to identify what sort of line
* this is.
*
* Returns a new String.Index that points at the beginning of the actual text,
* having skipped the identifying characters (#, .1, -) but not necessarily
* any whitespace.
*/
func identify(startIndex: String.Index, _ endIndex: String.Index) -> LineType {
var i = startIndex
func header() -> LineType {
let textStart = i
var count = 1
i = i.successor()
for ; i < endIndex; i = i.successor() {
if input[i] == "#" {
++count
} else {
let headerStart = i
i = i.successor()
for ; i < endIndex; i = i.successor() {
if !input[i].isWhitespaceOrNewline() {
return (.Header(level: count), headerStart)
}
}
break
}
}
return (.Text, textStart) // a # by itself
}
func code() -> LineType {
let textStart = i
i = i.successor()
if i < endIndex && input[i] == "`" {
i = i.successor()
if i < endIndex && input[i] == "`" {
i = i.successor()
let j = endIndex.predecessor() // not true for very last line if no newline
if i < j {
return (.Code(language: input.substringWithRange(i ..< j)), endIndex)
} else {
return (.Code(language: ""), endIndex)
}
}
}
return (.Text, textStart)
}
func unorderedListItem() -> LineType {
let textStart = i
i = i.successor()
if i < endIndex && input[i].isWhitespace() {
return (.ListItem(ordered: false), i)
}
return (.Text, textStart)
}
func orderedListItem() -> LineType {
let textStart = i
i = i.successor()
if i < endIndex && input[i] == "." {
i = i.successor()
if i < endIndex && input[i].isWhitespace() {
return (.ListItem(ordered: true), i.successor())
}
}
return (.Text, textStart)
}
func identifier() -> LineType {
switch input[i] {
case "#":
return header()
case ">":
return (.Quote, i.successor())
case "`":
return code()
case "-":
return unorderedListItem()
case "1", "2", "3", "4", "5", "6", "7", "8", "9":
return orderedListItem()
default:
return (.Text, i)
}
}
func whitespace() -> LineType {
var count = 0
var codeStart = i
for ; i < endIndex; i = i.successor() {
switch input[i] {
case "\n":
break
case " ", "\r":
count += 1
case "\t":
count += 4
default:
if count >= 4 {
return (.CodeIndented, codeStart)
} else {
return identifier()
}
}
if count == 4 {
codeStart = i.successor()
}
}
return (.Empty, endIndex)
}
if i == endIndex || input[i] == "\n" {
return (.Empty, endIndex)
} else {
switch input[i] {
case " ", "\t", "\r":
return whitespace()
default:
return identifier()
}
}
}
}
// MARK: - Tokenization of Fragments
private extension Markdown {
/*
* The tokenizer takes in a text fragment and outputs a stream of `Token` objects.
*
* For example, the input:
* aaa **x**
* I've
*
* becomes the following stream of tokens: Text(aaa) Spacing Open(bold) Text(x)
* Close(Bold) Newline Text(I) Escape(') Text(ve)
*
* Multiple spaces get combined into a single token.
*/
func tokenize(startIndex startIndex: String.Index, endIndex: String.Index) -> [Token] {
var tokens = [Token]()
var i = startIndex // the lookahead character
func addTextToken(s: String) {
tokens.append(Token.Text(text: s))
}
func addEscapeToken(s: String) {
tokens.append(Token.Escape(symbol: s))
}
func spacing() {
for ; i < endIndex; i = i.successor() {
if !input[i].isWhitespace() { break }
}
tokens.append(Token.Spacing)
}
func escapeDash() {
if i < endIndex && input[i] == "-" {
addEscapeToken("--")
i = i.successor()
} else {
addTextToken("-")
}
}
func escapeEllipsis() {
if i < endIndex && input[i] == "." {
i = i.successor()
if i < endIndex && input[i] == "." {
i = i.successor()
addEscapeToken("...")
} else {
addTextToken("..")
}
} else {
addTextToken(".")
}
}
func escape() {
let c = input[i]
i = i.successor()
if c == "-" {
escapeDash()
} else if c == "." {
escapeEllipsis()
} else {
addEscapeToken(String(c))
}
}
var seenOpen = [String: Bool]()
func addOpenOrCloseToken(symbol: String, strict: Bool = false) {
// The official rules are that *word* and *word1 word2* will work but
// not * word *, *word *, or * word*. To keep things simple, we only
// require that the opening * is followed by non-whitespace; where the
// closing * is doesn't matter.
if let open = seenOpen[symbol] where open {
seenOpen[symbol] = false
tokens.append(Token.Close(symbol: symbol))
} else if strict && (i == endIndex || input[i].isWhitespace()) {
addTextToken(symbol)
} else {
seenOpen[symbol] = true
tokens.append(Token.Open(symbol: symbol))
}
}
func strikethrough() {
if i < endIndex && input[i] == "~" {
i = i.successor()
addOpenOrCloseToken("~~")
} else {
addTextToken("~")
}
}
// Note that italics, bold, etc, do no work across multiple lines.
// That happens because we tokenize each fragment individually and we reset
// the tokenization state with each new fragment.
func italicsOrBold() {
if i < endIndex && input[i] == "*" {
i = i.successor()
addOpenOrCloseToken("**", strict: true)
} else {
addOpenOrCloseToken("*", strict: true)
}
}
func span() {
let c = input[i]
i = i.successor()
if c == "*" {
italicsOrBold()
} else if c == "~" {
strikethrough()
} else {
addOpenOrCloseToken(String(c))
}
}
func backslash() {
i = i.successor()
if i < endIndex {
let c = input[i]
if c == "*" || c == "[" || c == "]" {
addTextToken(String(c))
i = i.successor()
return
}
}
addTextToken("\\")
}
func parseLink() -> ((String.Index, String.Index), (String.Index, String.Index))? {
let textStart = i.successor()
// Loop until we find ] followed by ( followed by ). It's not a real link
// unless it has text and a valid URL, but we're not that picky.
// To be honest, this is where a regexp is the simpler solution. ;-)
for ; i < endIndex; i = i.successor() {
if input[i] == "]" {
let textEnd = i
i = i.successor()
if i < endIndex && input[i] == "(" {
i = i.successor()
let urlStart = i
for ; i < endIndex; i = i.successor() {
if input[i] == ")" {
let urlEnd = i
i = i.successor()
return ((eatLeadingWhitespace(textStart, textEnd),
eatTrailingWhitespace(textStart, textEnd)),
(eatLeadingWhitespace(urlStart, urlEnd),
eatTrailingWhitespace(urlStart, urlEnd)))
}
}
}
}
}
// This does not appear to be a validly formatted link.
addTextToken("[")
i = textStart
return nil
}
func link() {
if let ((textStart, textEnd), (urlStart, urlEnd)) = parseLink() {
let tokenized = tokenize(startIndex: textStart, endIndex: textEnd)
let fragment = Fragment.Text(tokens: tokenized)
let url = input.substringWithRange(urlStart ..< urlEnd)
let token = Token.Link(fragment: fragment, url: url)
tokens.append(token)
}
}
func image() {
i = i.successor()
if i < endIndex && input[i] == "[" {
if let ((captionStart, captionEnd), (urlStart, urlEnd)) = parseLink() {
let caption = input.substringWithRange(captionStart ..< captionEnd)
let url = input.substringWithRange(urlStart ..< urlEnd)
let token = Token.Image(caption: caption, url: url)
tokens.append(token)
}
} else {
addTextToken("!")
}
}
func endsWord(c: Character) -> Bool {
return c == " " || c == "\t" || c == "\n" || c == "\r" || // whitespace
c == "'" || c == "<" || c == ">" || c == "&" || c == "-" || c == "." || // escapes
c == "`" || c == "\"" || c == "*" || c == "~" || // span
c == "\\" || c == "[" || c == "!"
}
func word() {
let wordStart = i
for ; i < endIndex; i = i.successor() {
if endsWord(input[i]) { break }
}
addTextToken(input.substringWithRange(wordStart ..< i))
}
while i < endIndex {
switch input[i] {
case " ", "\t", "\r":
spacing()
case "'", "<", ">", "&", "-", ".":
escape()
case "`", "\"", "*", "~":
span()
case "\\":
backslash()
case "[":
link()
case "!":
image()
case "\n":
fatalError("fragments should not contain newlines")
default:
word()
}
}
return tokens
}
}
// MARK: - HTML Rendering
private let escapeTable = [
"'": "’",
"<": "<",
">": ">",
"&": "&",
"-": "–",
"--": "—",
"...": "…",
]
private let openTable = [
"`": "<code>",
"\"": "“",
"*": "<em>",
"**": "<strong>",
"~~": "<del>",
]
private let closeTable = [
"`": "</code>",
"\"": "”",
"*": "</em>",
"**": "</strong>",
"~~": "</del>",
]
private extension Markdown.Token {
func toHTML() -> String {
switch self {
case .Spacing: return " "
case .Text(let text): return text
case .Escape(let symbol): return escapeTable[symbol]!
case .Open(let symbol): return openTable[symbol]!
case .Close(let symbol): return closeTable[symbol]!
case .Link(let fragment, let url):
return "<a href=\"\(url.escape())\">" + fragment.toHTML() + "</a>"
case .Image(let caption, let url):
return "<img src=\"\(url.escape())\" alt=\"\(caption.escape())\">"
}
}
}
private extension Markdown.Fragment {
func toHTML() -> String {
switch self {
case .Text(let tokens):
var s = ""
for token in tokens {
s += token.toHTML()
}
return s
case .Code(let string):
return string.escape()
}
}
// If the text block contains only one image token, then we turn it into
// a <figure> instead of a <p> paragraph.
func isImageOnly() -> Bool {
if case .Text(let tokens) = self where tokens.count == 1, case .Image = tokens[0] {
return true
} else {
return false
}
}
}
private extension Markdown.Block {
func isImageOnly() -> Bool {
return fragments.count == 1 && fragments.first!.isImageOnly()
}
func formatFragments(separator: String = "<br>\n") -> String {
precondition(fragments.count > 0)
if fragments.count == 1 {
return fragments.first!.toHTML()
} else {
return separator.join(fragments.map { $0.toHTML() })
}
}
func toHTML() -> String {
switch type {
case .Empty:
fatalError("should not happen")
case .Header(let level):
return "<h\(level)>" + formatFragments() + "</h\(level)>\n\n"
case .Text:
if isImageOnly() {
return "<figure class=\"image\">" + formatFragments() + "</figure>\n\n"
} else {
return "<p>" + formatFragments("\n") + "</p>\n\n"
}
case .Quote:
return "<p>" + formatFragments() + "</p>\n"
case .Code(let language):
var s = "<figure class=\"code\"><pre><code"
if !language.isEmpty {
s += " class=\"\(language.escape())\""
}
s += ">" + formatFragments() + "</code></pre></figure>\n\n"
return s
case .CodeIndented:
return "<figure class=\"code\"><pre><code>" + formatFragments("\n") + "</code></pre></figure>\n\n"
case .ListItem:
return "<li>" + formatFragments() + "</li>\n"
}
}
}
extension Markdown {
func renderHTML() -> String {
var s = ""
var lastType = BlockType.Empty
var wasOrderedList = false
for block in blocks {
if !lastType.isQuote() && block.type.isQuote() {
s += "<blockquote>\n"
} else if lastType.isQuote() && !block.type.isQuote() {
s += "</blockquote>\n\n"
}
if !lastType.isListItem() && block.type.isListItem() {
wasOrderedList = block.type.isOrderedList()
s += wasOrderedList ? "<ol>\n" : "<ul>\n"
} else if lastType.isListItem() && !block.type.isListItem() {
s += wasOrderedList ? "</ol>\n\n" : "</ul>\n\n"
}
s += block.toHTML()
lastType = block.type
}
return s
}
}
Goals for this project:
This is just a toy project for me to experiment with writing parsers in Swift. Because why not?
There may be bugs.
The Markdown tags that are currently supported:
# Headers
*italics*
**bold**
~~strikethrough~~
`code`
1. numbered
2. list
- unordered
- list
> quote
[Link text](http://url)
![Alt text](images/image.png)
source code (indented 1 tab or 4 spaces)
```language
\* literal asterisk
\[ literal [
\] literal ]
Not supported are:
*
This could really do with a good test suite. ;-)
Create a new Markdown
instance and give it a String
. The parser creates a tree structure that describes the Markdown document. You can either step through that tree yourself or simply call renderHTML()
to convert it to HTML.
if let data = NSData(contentsOfFile: path) {
if let text = NSString(data: data, encoding: NSUTF8StringEncoding) {
let m = Markdown(string: text as String)
let s = m.renderHTML()
print("<!DOCTYPE html><html><head><meta charset=\"utf-8\"/></head><body>\n")
print(s, appendNewline: false)
print("</body></html>\n")
}
}
They definitely make parsing easier but I don't like throwing a handful of regexps at a parsing problem.
When you use a regular expression, it is turned into a state machine by the regex parser. Here I've basically "unrolled" all those state machines by hand.