MattJermyWright
12/9/2014 - 10:53 PM

Bayesian Classifier - classify mobile scripts

Bayesian Classifier - classify mobile scripts

#!/usr/bin/env python
import fileinput

import re
import math
import pprint
import sqlite3

# Compiled once at import time. The pattern must be ``\W+`` (one or more):
# ``\W*`` can match the empty string, and since Python 3.7 re.split() splits on
# empty matches, which breaks the text into single characters.
_WORD_SPLITTER = re.compile(r"\W+")


def getWords(str):
	"""Split a piece of text into lower-cased word features.

	The text is broken on runs of non-word characters; tokens are kept only
	when they are 3 to 39 characters long.

	Args:
		str: the text to tokenize (the name shadows the builtin but is kept
			so that existing keyword callers continue to work).

	Returns:
		A list of lower-cased words, in order of appearance (duplicates kept).
	"""
	return [token.lower() for token in _WORD_SPLITTER.split(str) if 2 < len(token) < 40]

class classifier:
	"""Feature/category count store for a Bayesian classifier, kept in SQLite.

	Two tables are used, both prefixed with the classifier name:

	* ``<name>_features``   - one row per feature and one integer column per
	  category holding how often the feature was counted for that category.
	* ``<name>_categories`` - one row per category with ``freq``, the running
	  total of all amounts counted for that category.

	Table and column names are built from ``name`` and the category names, so
	they are always emitted as quoted SQL identifiers; values (features,
	category names, amounts) are always bound as parameters.
	"""

	def __init__(self,name,getfeatures,filename=None,dbName = "bayesianFilter.db"):
		"""Open the database and (re)create the classifier's tables.

		Args:
			name: classifier name, used as the table-name prefix.
			getfeatures: callable that extracts features from an item; only
				stored here.
			filename: unused; kept for backward compatibility.
			dbName: SQLite database path (``":memory:"`` is accepted).

		NOTE: checkTables(True) is called, so existing tables for this
		classifier name are dropped and recreated empty.
		"""
		# The name of the classifier
		self.name = name
		# Legacy in-memory count dictionaries; the counts now live in SQLite.
		# The attributes are kept so external code reading them does not break.
		self.fc={}
		self.cc={}
		self.getfeatures = getfeatures
		# Initialize the SQL Database
		self.conn = sqlite3.connect(dbName)
		self.checkTables(True)

	@staticmethod
	def _quoteIdentifier(identifier):
		"""Return identifier as a double-quoted SQL identifier (quotes escaped)."""
		return '"' + identifier.replace('"', '""') + '"'

	def _tableName(self, suffix):
		"""Return the quoted name of this classifier's table with the given suffix."""
		return self._quoteIdentifier(self.name + "_" + suffix)

	def _fetchCount(self, sql, parameters, errorLabel):
		"""Run a single-value query and return it as a float (0.0 for NULL/no row/error)."""
		try:
			row = self.conn.execute(sql, parameters).fetchone()
		except sqlite3.Error:
			print("ERROR: " + errorLabel)
			return 0.0
		# sum() over zero rows yields NULL, which arrives here as None.
		if row is None or row[0] is None:
			return 0.0
		return float(row[0])

	def sqlExecuteAndIgnoreErrors(self,sql):
		"""Execute and commit one statement, deliberately ignoring SQLite errors."""
		try:
			self.conn.execute(sql)
			self.conn.commit()
		except sqlite3.Error:
			pass

	def checkTables(self, createTablesFlag):
		"""Check whether the tables exist and optionally reset them.

		Args:
			createTablesFlag: when true, any existing tables are dropped and
				fresh, empty ones are created.

		Returns:
			True if the features table existed before the call, else False.
		"""
		features = self._tableName("features")
		categories = self._tableName("categories")
		tablesExistFlag = True
		try:
			# fetchall() finishes the statement so it cannot block the drop below.
			self.conn.execute("select count(*) from " + features).fetchall()
		except sqlite3.Error:
			tablesExistFlag = False
		if createTablesFlag:
			# "if exists" also clears a leftover table when only one of the two is present.
			self.sqlExecuteAndIgnoreErrors("drop table if exists " + features)
			self.sqlExecuteAndIgnoreErrors("drop table if exists " + categories)
			self.sqlExecuteAndIgnoreErrors("create table " + features + " (feature varchar(50))")
			self.sqlExecuteAndIgnoreErrors("create table " + categories + " (category varchar(50), freq bigint default 0)")
		return tablesExistFlag

	def addCategoryIfNotExists(self,categoryName):
		"""Register a category: add its column to the features table and its
		row (freq 0) to the categories table, unless it is already known.

		Failures are reported on stdout and otherwise ignored.
		"""
		features = self._tableName("features")
		categories = self._tableName("categories")
		try:
			row = self.conn.execute("select count(*) from " + categories + " where category = ?", (categoryName,)).fetchone()
			found = int(row[0]) > 0
		except sqlite3.Error:
			found = False
		if found:
			return
		try:
			self.conn.execute("alter table " + features + " add column " + self._quoteIdentifier(categoryName) + " bigint DEFAULT 0")
			# Name the column explicitly: the table has two columns and freq uses its default.
			self.conn.execute("insert into " + categories + " (category) values (?)", (categoryName,))
			self.conn.commit()
		except sqlite3.Error:
			self.conn.rollback()
			print("ERROR: Issue with inserting category")

	def incrementFeature(self,feature,category,amount):
		"""Add amount to the count of feature within category.

		The category and the feature row are created on first use. The
		category's ``freq`` total is increased by the same amount. Failures
		are reported on stdout and the partial change is rolled back.
		"""
		self.addCategoryIfNotExists(category)
		features = self._tableName("features")
		categories = self._tableName("categories")
		column = self._quoteIdentifier(category)
		try:
			row = self.conn.execute("select count(*) from " + features + " where feature = ?", (feature,)).fetchone()
			if row[0] == 0:
				self.conn.execute("insert into " + features + " (feature) values (?)", (feature,))
			# Increment inside SQL so no separate read of the current value is needed.
			self.conn.execute("update " + features + " set " + column + " = coalesce(" + column + ", 0) + ? where feature = ?", (amount, feature))
			self.conn.execute("update " + categories + " set freq = coalesce(freq, 0) + ? where category = ?", (amount, category))
			self.conn.commit()
		except sqlite3.Error:
			self.conn.rollback()
			print("ERROR: Issue with inserting feature")

	def fcount(self,feature,category):
		"""Return, as a float, how often feature was counted for category (0.0 if never)."""
		sql = "select sum(" + self._quoteIdentifier(category) + ") from " + self._tableName("features") + " where feature = ?"
		return self._fetchCount(sql, (feature,), "fcount")

	def catcount(self,category):
		"""Return, as a float, the total amount counted for category (0.0 if unknown)."""
		sql = "select sum(freq) from " + self._tableName("categories") + " where category = ?"
		return self._fetchCount(sql, (category,), "catcount")

	def totalcount(self):
		"""Return, as a float, the total amount counted over all categories."""
		sql = "select sum(freq) from " + self._tableName("categories")
		return self._fetchCount(sql, (), "totalcount")

	def categories(self):
		"""Return the list of known category names (empty list on error)."""
		try:
			rows = self.conn.execute("select category from " + self._tableName("categories")).fetchall()
		except sqlite3.Error:
			print("ERROR: categories")
			return []
		return [row[0] for row in rows]

def main():
	"""Pretty-print the word list of every input line.

	Lines come from fileinput.input(), i.e. the files named on the command
	line or standard input when none are given; each line is tokenized with
	getWords and printed with a 4-space-indent PrettyPrinter.
	"""
	printer = pprint.PrettyPrinter(indent=4)
	# Generator expression keeps the processing lazy: one line in, one list out.
	tokenized = (getWords(text) for text in fileinput.input())
	for tokens in tokenized:
		printer.pprint(tokens)

# Script entry point. The call is unguarded (no ``if __name__ == "__main__"``),
# so it executes whenever this module is loaded, including on import.
main()