hayio
9/1/2014 - 12:30 PM

wypisuje duplikaty

wypisuje duplikaty

import java.io.{FileOutputStream, File, FileInputStream}

import com.google.protobuf.ByteString
import com.sentihub.Model.Content
import com.sentihub.Model.Content.Id
import org.slf4j.{LoggerFactory, Logger}

/**
 * Reads a fixed list of delimited `Content` files, collects duplicate statistics about URLs and
 * ids, and writes the first record seen for every URL to `data/cleaned/content_0000`.
 *
 * The counting maps are public members of the object and are filled as a side effect of
 * [[Main.main]].
 *
 * Created by Raphael Hazan on 8/28/2014.
 */
object Main {
  final val logger: Logger = LoggerFactory.getLogger(Main.getClass)

  // Number of records seen per URL.
  val urlToCount = scala.collection.mutable.Map[String, Int]()
  // Number of records with inner type "topic" seen per URL.
  val topicUrlToCount = scala.collection.mutable.Map[String, Int]()
  // "last" means the last part of the id.
  val lastIdToCount = scala.collection.mutable.Map[ByteString, Int]()
  // Same as lastIdToCount, restricted to records with inner type "text".
  val textLastIdToCount = scala.collection.mutable.Map[ByteString, Int]()
  // If a URL shows up again with a different id, nothing is stored; it is only reported on the console.
  val urlToFullId = scala.collection.mutable.Map[String, Id]()

  /** Adds one to the counter stored under `key`, starting from zero for an unseen key. */
  private def increment[K](counts: scala.collection.mutable.Map[K, Int], key: K): Unit =
    counts(key) = counts.getOrElse(key, 0) + 1

  /**
   * Returns the last text part of `id` when it has any, otherwise its last container part.
   * Yields `None` when the id has neither, instead of indexing with -1.
   */
  private def lastIdPart(id: Id): Option[ByteString] =
    if (id.getTextCount > 0) Some(id.getText(id.getTextCount - 1))
    else if (id.getContainerCount > 0) Some(id.getContainer(id.getContainerCount - 1))
    else None

  /**
   * Updates every statistic for one record and appends the record to `output` when its URL has
   * not been seen before.
   */
  private def record(content: Content, output: FileOutputStream): Unit = {
    val url = content.getUrl
    increment(urlToCount, url)
    if (content.getInnerType == "topic") {
      increment(topicUrlToCount, url)
    }

    lastIdPart(content.getId) match {
      case Some(lastPart) =>
        increment(lastIdToCount, lastPart)
        if (content.getInnerType == "text") {
          increment(textLastIdToCount, lastPart)
        }
      case None =>
        // Only the id statistics are skipped; the record still takes part in URL deduplication.
        logger.error("content without any id part: " + url)
    }

    urlToFullId.get(url) match {
      case None =>
        // First occurrence of this URL: remember its id and keep the record.
        urlToFullId += ((url, content.getId))
        content.writeDelimitedTo(output)
      case Some(fullId) =>
        if (content.getId != fullId) {
          println("--------------------------")
          logger.error("id sie roznia: " + url)
          logger.error(fullId.toString)
          logger.error(content.getId.toString)
          println("--------------------------")
        }
    }
  }

  /**
   * Streams every record of `contentFile` through [[record]], logging progress every 300000
   * records. The input stream is closed even when reading or writing fails.
   */
  private def processFile(contentFile: File, output: FileOutputStream): Unit = {
    val step = 300000
    val input = new FileInputStream(contentFile)
    try {
      // A null result from parseDelimitedFrom ends the loop, as in the original while loop.
      Iterator
        .continually(Content.parseDelimitedFrom(input))
        .takeWhile(_ != null)
        .zipWithIndex
        .foreach { case (content, index) =>
          val seen = index + 1
          if (seen % step == 0) {
            logger.info("step " + seen)
          }
          record(content, output)
        }
    } finally {
      input.close()
    }
  }

  /** Logs every id whose counter is greater than one. */
  private def logRepeated(counts: scala.collection.mutable.Map[ByteString, Int]): Unit =
    for ((id, count) <- counts if count > 1) {
      logger.error(id.toStringUtf8 + "  | " + count)
    }

  /**
   * Entry point: processes the existing input files in order, then prints the summary counters.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val contentFiles = List("data/content_0000", "data/content_0001", "data/content_0002")
    val uniqueContentsFile = new File("data/cleaned/content_0000")
    // Flip to true to also log every repeated id after the run.
    val printAdditionalInfo = false

    val output = new FileOutputStream(uniqueContentsFile)
    try {
      for (contentFile <- contentFiles.map(new File(_)) if contentFile.exists()) {
        processFile(contentFile, output)
      }
    } finally {
      // Release the output handle even when a file fails half-way through.
      output.close()
    }

    if (printAdditionalInfo) {
      logRepeated(lastIdToCount)
      logRepeated(textLastIdToCount)
    }
    println("Liczba różnych urli: " + urlToCount.size)
    println("Liczba duplikatów urli: " + urlToCount.count(_._2 > 1))
    println("Liczba różnych topicow: " + topicUrlToCount.size)
    println("Liczba duplikatów topicow (urli): " + topicUrlToCount.count(_._2 > 1))
    println("Liczba różnych id koncowych: " + lastIdToCount.size)
    println("Liczba duplikatów id koncowych: " + lastIdToCount.count(_._2 > 1))
    println("Liczba różnych id tekstów: " + textLastIdToCount.size)
    println("Liczba duplikatów id tekstów: " + textLastIdToCount.count(_._2 > 1))
    println("done")
  }
}