// Prints duplicates.
import java.io.{FileOutputStream, File, FileInputStream}
import com.google.protobuf.ByteString
import com.sentihub.Model.Content
import com.sentihub.Model.Content.Id
import org.slf4j.{LoggerFactory, Logger}
/**
 * Created by Raphael Hazan on 8/28/2014.
 *
 * Reads length-delimited `Content` records from a fixed list of input files, counts how often
 * each URL and each trailing id part occurs, and writes the first record seen for every URL to
 * `data/cleaned/content_0000`. Summary statistics are printed to the console at the end.
 */
object Main {
  final val logger: Logger = LoggerFactory.getLogger(Main.getClass)

  /** Number of records seen per URL. */
  val urlToCount = scala.collection.mutable.Map[String, Int]()

  /** Number of records seen per URL, restricted to records whose inner type is "topic". */
  val topicUrlToCount = scala.collection.mutable.Map[String, Int]()

  // "last" means the last part of the id.
  /** Number of records seen per last id part. */
  val lastIdToCount = scala.collection.mutable.Map[ByteString, Int]()

  /** Number of records seen per last id part, restricted to records whose inner type is "text". */
  val textLastIdToCount = scala.collection.mutable.Map[ByteString, Int]()

  // If a URL shows up again with a different id, the record is not saved anywhere;
  // the conflict is only reported on the console.
  /** Full id of the first record seen for each URL. */
  val urlToFullId = scala.collection.mutable.Map[String, Id]()

  /**
   * Entry point: processes every existing input file, then prints the statistics.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val contentFiles = List("data/content_0000", "data/content_0001", "data/content_0002")
    val uniqueContentsFile = new File("data/cleaned/content_0000")
    val printAdditionalInfo = false
    // Progress is logged every `step` records; the counter restarts for each input file.
    val step = 300000

    val output = new FileOutputStream(uniqueContentsFile)
    try {
      // Input files that do not exist are skipped silently.
      for (contentFile <- contentFiles if new File(contentFile).exists()) {
        processFile(contentFile, output, step)
      }
    } finally {
      // Close the output even when reading or writing fails part-way through.
      output.close()
    }

    if (printAdditionalInfo) {
      logRepeated(lastIdToCount)
      logRepeated(textLastIdToCount)
    }

    println("Liczba różnych urli: " + urlToCount.size)
    println("Liczba duplikatów urli: " + urlToCount.count(_._2 > 1))
    println("Liczba różnych topicow: " + topicUrlToCount.size)
    println("Liczba duplikatów topicow (urli): " + topicUrlToCount.count(_._2 > 1))
    println("Liczba różnych id koncowych: " + lastIdToCount.size)
    println("Liczba duplikatów id koncowych: " + lastIdToCount.count(_._2 > 1))
    println("Liczba różnych id tekstów: " + textLastIdToCount.size)
    println("Liczba duplikatów id tekstów: " + textLastIdToCount.count(_._2 > 1))
    println("done")
  }

  /**
   * Reads every record from one input file and feeds it to [[record]].
   *
   * @param contentFile path of the file holding length-delimited `Content` records
   * @param output      stream that receives the first record seen for each URL
   * @param step        a progress line is logged after every `step` records
   */
  private def processFile(contentFile: String, output: FileOutputStream, step: Int): Unit = {
    val input: FileInputStream = new FileInputStream(new File(contentFile))
    try {
      // Reading stops at the first null returned by parseDelimitedFrom
      // (presumably the end of the stream -- see the protobuf documentation).
      Iterator
        .continually(Content.parseDelimitedFrom(input))
        .takeWhile(_ != null)
        .zipWithIndex
        .foreach { case (content, index) =>
          val i = index + 1
          if (i % step == 0) {
            logger.info("step " + i)
          }
          record(content, output)
        }
    } finally {
      // Release the file handle even if parsing or writing throws.
      input.close()
    }
  }

  /**
   * Updates all counters for a single record and writes it to `output` when its URL has not
   * been seen before.
   */
  private def record(content: Content, output: FileOutputStream): Unit = {
    val url = content.getUrl

    increment(urlToCount, url)
    if (content.getInnerType == "topic") {
      increment(topicUrlToCount, url)
    }

    lastIdPart(content.getId) match {
      case Some(last) =>
        increment(lastIdToCount, last)
        if (content.getInnerType == "text") {
          increment(textLastIdToCount, last)
        }
      case None =>
        // Indexing with -1 would throw here; skip only the id counters for such a record.
        logger.warn("id has neither text nor container parts: " + url)
    }

    urlToFullId.get(url) match {
      case None =>
        // First occurrence of this URL: remember its id and keep the record.
        urlToFullId += ((url, content.getId))
        content.writeDelimitedTo(output)
      case Some(fullId) =>
        // Repeated URL: the record is dropped; a differing id is reported.
        if (content.getId != fullId) {
          println("--------------------------")
          logger.error("id sie roznia: " + url)
          logger.error(fullId.toString)
          logger.error(content.getId.toString)
          println("--------------------------")
        }
    }
  }

  /**
   * Returns the last text part of `id`, or the last container part when there are no text
   * parts, or `None` when the id has neither.
   */
  private def lastIdPart(id: Id): Option[ByteString] =
    if (id.getTextCount > 0) Some(id.getText(id.getTextCount - 1))
    else if (id.getContainerCount > 0) Some(id.getContainer(id.getContainerCount - 1))
    else None

  /** Adds one to the counter stored under `key`, starting from zero for an unseen key. */
  private def increment[K](counts: scala.collection.mutable.Map[K, Int], key: K): Unit =
    counts(key) = counts.getOrElse(key, 0) + 1

  /** Logs every id part that was counted more than once, together with its count. */
  private def logRepeated(counts: scala.collection.mutable.Map[ByteString, Int]): Unit =
    for ((id, count) <- counts if count > 1) {
      logger.error(id.toStringUtf8 + " | " + count)
    }
}