dbs67
1/21/2016 - 4:24 PM

Scala code to parse Udacity's subtitle text files for CS6250 Networking

Scala code to parse Udacity's subtitle text files for CS6250 Networking

import java.io.{File, IOException, PrintWriter}
import java.security.MessageDigest
import java.time.{LocalDateTime, ZoneId, LocalTime}
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale, UUID}

import scala.collection.mutable
import scala.concurrent.duration.{FiniteDuration, _}
import scala.io.{BufferedSource, Source}
import scala.math.BigDecimal.RoundingMode
import scala.util.Try
import scala.xml.Elem

/**
 * Converts a folder of transcripts (*.srt) from Udacity to
 * readable notes in html, which can be pasted to Google Docs.
 *
 * Ex.
 * runMain UdacityNotes -i "/Users/kefuzhou/Downloads/P3L5 Non-Functional Reqs & Arch Styles Subtitles"
 * runMain UdacityNotes -i "/Users/kefuzhou/Downloads/Software Architecture & Design Subtitles" -cf -r 1
 */
object UdacityNotes {

  case class Data(title: String, body: String, duration: Option[FiniteDuration])

  val isDebug: Boolean = false
  println(s"Debugging is ${isDebug}")
  def debug[T](block: => T): Unit = {
    if (isDebug) block
  }

  def main(args: Array[String]): Unit = {
    val path = getArgumentValue("-i", args)
    val isCourseFolder = args.contains("-cf")
    val rotateCount = Try(getArgumentValue("-r", args)).map(_.toInt).getOrElse(0)

    val lessonFolders = {
      if (isCourseFolder) {
        val dir = new File(path)
        assert(dir.isDirectory, s"Dir should be a directory of lesson directories: ${path}")
        val lessonFolders = dir.listFiles().filter(_.isDirectory).toList
        val result = rotate(rotateCount, sortFiles(lessonFolders.toList))
        result
      } else {
        List(new File(path))
      }
    }

    println(s"Processing ${lessonFolders.size} lessons")

    val folderTitle = new File(path).getName
    val pw = new PrintWriter(new File(s"UdacityNotes_${folderTitle}.html"))
    pw.println("<html><body>")
    pw.println(<h1 class="folderTitleHeader">{s"Folder Title: ${folderTitle}"}</h1>)
    val totalVideoTimeString = {
      val parts = lessonFolders.flatMap(_.listFiles()).map(f => getEndTime(f).map(_.toMillis))
      val numUnknowns = parts.count(_.isEmpty)
      val totalMillis = parts.collect{case Some(endTime) => endTime}.sum.millis
      val localTime = LocalTime.ofNanoOfDay(totalMillis.toNanos)
      val unknownString = {
        if (numUnknowns == 0) ""
        else if (numUnknowns == 1) " + 1 Unknown"
        else s" + ${numUnknowns} Unknowns"
      }
      localTime.toString + unknownString
    }

    pw.println(<div>{"Total Video Time: %s".format(totalVideoTimeString)}</div>)
    val timeFormatter = DateTimeFormatter.ofPattern("MM/dd/yyyy h:mm:ssa z").withZone(ZoneId.of("America/New_York"))
    val updateTimeString = timeFormatter.format(LocalDateTime.now())
    pw.println(<div>{"Updated: %s".format(updateTimeString)}</div>)

    def surroundText(in: String) = s"\n$in\n"

    val tableOfContents = createTableOfContents(lessonFolders)
    pw.println(tableOfContents)

    lessonFolders.foreach { folder =>
      val sectionDatas = getFilesSorted(folder.getAbsolutePath).toVector.map(parseFile)
        .map { data =>
        data.copy(body = parseSubtitleText(data.body))
      }
      val numUnknowns = sectionDatas.count(_.duration.isEmpty)
      val totalLessonFolderTime = sectionDatas.map(_.duration).collect { case Some(d) => d.toMillis }.sum.millis
      val unknownString = {
        numUnknowns match {
          case 0 => ""
          case 1 => " + 1 Unknown"
          case n => s" + ${n} Unknowns"
        }
      }

      val lessonXml: Elem = {
        <div class="lesson">
          <div class="lessonTitle">
            <h2 class="lessonTitleHeader" id={getLessonId(folder.getName)}>
              <a href={"#%s".format(getTableOfContentsId(folder.getName, ""))}>{surroundText(s"Lesson: ${folder.getName} (${sprintEndTime(totalLessonFolderTime)}${unknownString})")}</a>
            </h2>
          </div>{
          sectionDatas.map { data =>
            <div class="sectionData">
              <h3 class="sectionTitleHeader" id={getSectionId(folder.getName, data.title)}>
                <a href={"#%s".format(getTableOfContentsId(folder.getName, data.title))}>
                  {surroundText(s"Section: ${data.title} ${data.duration.map(a => "(%s)".format(sprintEndTime(a))).getOrElse("")}")}
                </a>
              </h3>
            </div>
            <br/>
            <div>{surroundText(wordWrap(data.body, 120))}</div>
            <br/>
          }}</div>
      }
//      val prettyXml = xmlPrettyPrinter.format(lessonXml)
      pw.println(lessonXml)
    }

    pw.println("</body></html>")
    pw.close()

  }

  private def rotate[A](n: Int, ls: List[A]): List[A] = {
    val nBounded = if (ls.isEmpty) 0 else n % ls.length
    if (nBounded < 0) rotate(nBounded + ls.length, ls)
    else (ls drop nBounded) ::: (ls take nBounded)
  }

  val xmlPrettyPrinter = new scala.xml.PrettyPrinter(120, 2)

  def getArgumentValue(flag: String, args: Array[String]): String = {
    val i = args.indexWhere(_ == flag)
    if (i == -1) {
      throw new IOException(s"Please provide flag ${flag} [value] in argument")
    } else if (i + 1 >= args.length) {
      throw new IOException(s"Please provide value to flag ${flag}")
    } else {
      args(i + 1)
    }
  }


  def parseSubtitleText(input: String): String = {

    def isNumberLine(line: String) = Try(line.toDouble).isSuccess

    val lines = Source.fromString(input).getLines().toVector
    val textLines = StringBuilder.newBuilder
    var i = 0
    while (i < lines.size) {
      val line = lines(i).trim
      if (line.isEmpty || isNumberLine(line) || isRangeLine(line) ) {
      } else {
        textLines.append(s"$line ")
      }
      i += 1
    }
    textLines.toString()
  }

  private def isRangeLine(line: String) = {
    line.take(8).forall(c => "0123456789:,.".contains(c))
  }

  def getFilesSorted(path: String): List[File] = {
    val file = new File(path)
    if (!file.exists()) {
      throw new IOException(s"File not found at: ${file.getAbsolutePath}")
    }
    val files = file.listFiles()
    sortFiles(files)
  }

  def sortFiles(files: Seq[File]): List[File] = {
    val possibleNumbers = Try(files.map(_.getName.split("-",2).head.filter(c => !c.isWhitespace).toDouble).toVector)
    possibleNumbers match {
      case util.Success(numbers) =>
        files.zip(numbers).sortBy(_._2).map(_._1).toList
      case _ =>
        files.sortBy(_.getName).toList
    }
  }

  def parseFile(file: File): Data = {
    val title = getSectionTitleFromFile(file)
    val body = useThenCloseFile(file, _.getLines().mkString("\n"))
    val duration = getEndTime(file)
    Data(title, body, duration)
  }

  private def useThenCloseFile[A](file: File, fn: BufferedSource => A) = {
    val b = Source.fromFile(file)
    val result = fn(b)
    b.close()
    result
  }

  def getSectionTitleFromFile(file: File): String = {
    val pattern = """\d+ - (.*)\.srt""".r
    val title = pattern.findFirstMatchIn(file.getName).get.group(1).trim
    title
  }

  def wordWrap(text: String, maxLength: Int): String = {
    wordWrap(text.split(" "), maxLength)
  }

  def wordWrap(tokens: Seq[String], maxLength: Int): String = {
    var spaceLeft = maxLength
    val spaceWidth = 1
    val sb = StringBuilder.newBuilder
    tokens.foreach { word  =>
      if (word.length + spaceWidth > spaceLeft) {
        sb.append(s"\n$word ")
        spaceLeft = maxLength - word.length - spaceWidth
      } else {
        sb.append(s"$word ")
        spaceLeft -= (word.length + spaceWidth)
      }
    }
    val out = sb.toString()
    debug {
      assert(Source.fromString(out).getLines().forall(_.length <= maxLength), "word wrap violation")
    }
    out
  }

  def createUUIDString(): String = {
    UUID.randomUUID().toString.replace("-", "")
  }

  def convertByteArrayToHexString(arrayBytes: Array[Byte]): String = {
    val sb = mutable.StringBuilder.newBuilder
    arrayBytes.indices.foreach { i =>
      sb.append(Integer.toString((arrayBytes(i) & 0xff) + 0x100, 16)
        .substring(1))
    }
    sb.toString()
  }

  def sha1Hash(prefix: String, lessonName: String, sectionName: String): String = {
    val input = s"$lessonName$sectionName"
    val bytes = MessageDigest.getInstance("SHA-1").digest(input.getBytes)
    "%s%s".format(prefix,convertByteArrayToHexString(bytes))
  }

  def getSectionId(lessonName: String, sectionName: String) = {
    sha1Hash("s_", lessonName, sectionName)
  }
  
  def getLessonId(lessonName: String) = {
    sha1Hash("l_", lessonName, "")
  }
  
  def getTableOfContentsId(lessonName: String, sectionName: String) = {
    sha1Hash("t_", lessonName, sectionName)
  }

  def createTableOfContents(lessonFolders: Seq[File]): String = {
    val xml = <div><ul>{
      lessonFolders.map { lessonFolder =>
        val (totalLessonFolderTime, numUnknowns) = {
          val parts = lessonFolder.listFiles().map(f => getEndTime(f).map(_.toMillis))
          val numUnknowns = parts.count(_.isEmpty)
          (parts.collect{case Some(endTime) => endTime}.sum.millis, numUnknowns)
        }
        val unknownString = {
          numUnknowns match {
            case 0 => ""
            case 1 => " + 1 Unknown"
            case n => s" + ${n} Unknowns"
          }
        }

        <div>
          <li>
            <div id={getTableOfContentsId(lessonFolder.getName, "")}>
              <a href={s"#${getLessonId(lessonFolder.getName)}"}>{s"${lessonFolder.getName} (${sprintEndTime(totalLessonFolderTime)}${unknownString})"}</a>
            </div>
          </li>
          <ul>{
            sortFiles(lessonFolder.listFiles().toList).map { sectionFile =>
              val sectionTitle = getSectionTitleFromFile(sectionFile)
              val title = s"$sectionTitle ${getEndTime(sectionFile).map(a => "(%s)".format(sprintEndTime(a))).getOrElse("")}"
              <li>
                <a id={getTableOfContentsId(lessonFolder.getName, getSectionTitleFromFile(sectionFile))}
                   href={s"#${getSectionId(lessonFolder.getName, getSectionTitleFromFile(sectionFile))}"}>{title}</a>
              </li>
            }
          }</ul>
        </div>
      }
    }</ul></div>
//    xmlPrettyPrinter.format(xml)
    xml.toString
  }

  def getEndTime(sectionFile: File): Option[FiniteDuration] = {
    val pattern = """(\d+:\d+:[\d\.,]+)$""".r
    val candidateLinesFromLast: Vector[String] = {
      useThenCloseFile(sectionFile, _.getLines().filter(isRangeLine).toVector.reverse)
    }
    val optTotal: Option[FiniteDuration] = {
      candidateLinesFromLast.view.map(line => pattern.findFirstMatchIn(line).map(_.group(0))).collectFirst {
        case Some(endTimeString) =>
          val Array(hh, mm, ss) = endTimeString.replace(",", ".").split(":").map(_.toDouble)
          val total = hh.hours + mm.minutes + ss.seconds
          total
      }
    }
    optTotal
  }

  def sprintEndTime(duration: FiniteDuration): String = {
    val value = BigDecimal(duration.toMillis / 6e4d).setScale(2, RoundingMode.HALF_EVEN).toString
    s"$value minutes"
  }

}