Makistos
10/28/2013 - 7:42 AM

This is an example of how to traverse web pages in Python: it fetches pages with urllib and extracts data from the HTML with regular expressions. #data-mining #web-harvest #python #urllib

This is an example of how to traverse web pages in Python: it fetches pages with urllib and extracts data from the HTML with regular expressions. #data-mining #web-harvest #python #urllib

#!/usr/bin/python
# -*- coding: iso-8859-15 -*-

from urllib import urlopen
import re
import sys

class PlayerSeasonInfo:
    """One scraped row of a player's season table.

    The first three cells of the row are exposed as ``team``, ``league`` and
    ``season``; the complete list of cells is kept as-is in ``record``.

    NOTE(review): an earlier draft (removed commented-out code) mapped the
    further cells to regular-season and playoff statistics (GP, G, A, P,
    +/-, PIM). The column offsets in that draft disagreed with the live
    code, so the remaining cells are left unnamed here -- confirm the page
    layout before naming them.
    """

    # Class-level defaults, kept so the attributes also exist on the class
    # itself. __init__ always rebinds all four on the instance.
    team = ''
    league = ''
    season = 0
    record = []

    # Number of leading cells that are mapped to named attributes.
    _NAMED_FIELDS = 3

    def __init__(self, record):
        """Store the scraped cells and name the first three of them.

        record -- sequence of cell strings in page order.

        A row with fewer than three cells used to raise IndexError; the
        missing names are now set to the empty string so that one malformed
        row does not abort a whole scrape.
        """
        self.record = record
        # Work on a copy so the caller's list is never padded or modified.
        named = list(record[:self._NAMED_FIELDS])
        named += [''] * (self._NAMED_FIELDS - len(named))
        self.team, self.league, self.season = named

    def printSeason(self):
        """Print team, league, season and the raw record to stdout."""
        # print is called with one parenthesised argument so it behaves the
        # same as the print statement used by the rest of this file.
        print("Team: " + self.team)
        print("League: " + self.league)
        print("Season: " + self.season)
        print("Record: " + repr(self.record))
        # The season is echoed once more on a line of its own; kept so the
        # output stays identical to what the script has always printed.
        print(self.season)
        
class Player:
    """A player scraped from eurohockey.net, with one entry per season row.

    Constructing a Player downloads the player's page and fills ``name`` and
    ``seasons``. The other descriptive attributes (``born``, ``nationality``,
    ``height``, ...) are never assigned by the code in this class and keep
    their class-level defaults.
    """

    uid = 0
    name = ''
    born = ''
    nationality = ''
    height = ''
    weight = ''
    position = ''
    shoots = ''
    drafted = ''
    notes = ''
    # Class-level default only; every parse rebinds ``seasons`` on the
    # instance, so this shared list is never mutated.
    seasons = []

    _PLAYER_URL = "http://www.eurohockey.net/players/show_player.cgi?serial="
    # Compiled once instead of on every line of every page.
    _NAME_RE = re.compile('<h1>(.+)</h1>')
    _CELL_RE = re.compile('<td>(.+)</td>')
    # A table row carrying one of these class attributes starts a record.
    _ROW_OPENERS = ('<tr class="even"', '<tr class="odd"')
    # PlayerSeasonInfo names the first three cells (team, league, season).
    _MIN_SEASON_FIELDS = 3

    def __init__(self, player_id):
        """Download and parse the page of the player with id *player_id*.

        player_id -- the ``serial`` query value; a string or an integer
                     (it is converted with str() when the URL is built).

        Errors raised by urlopen propagate unchanged. The connection is
        closed even when parsing fails.
        """
        self.uid = player_id

        url_address = self._PLAYER_URL + str(player_id)
        # Single parenthesised argument: same output as the print statement
        # used by the rest of this file.
        print(url_address)
        page = urlopen(url_address)
        try:
            self._parse_page(page)
        finally:
            page.close()

    def _parse_page(self, lines):
        """Fill ``name`` and ``seasons`` from an iterable of HTML lines."""
        seasons = []
        fields = []
        in_row = False
        for line in lines:
            if in_row and '</tr' in line:
                # End of a record row: turn the collected cells into a season.
                in_row = False
                # Rows with fewer cells than the named fields cannot be
                # season lines and would make PlayerSeasonInfo fail; skip.
                if len(fields) >= self._MIN_SEASON_FIELDS:
                    seasons.append(PlayerSeasonInfo(fields))
                fields = []
                continue
            if in_row:
                # NOTE(review): only a plain, non-empty <td>...</td> on one
                # line matches; any other cell is skipped, which shifts the
                # later columns -- confirm against the real page markup.
                cell = self._CELL_RE.search(line.strip())
                if cell:
                    fields.append(cell.group(1))
            if '<h1>' in line:
                # The heading holds the player name. A heading that is not
                # closed on the same line does not match and is ignored
                # (previously this raised AttributeError on None).
                heading = self._NAME_RE.search(line.strip())
                if heading:
                    self.name = heading.group(1)
            if any(opener in line for opener in self._ROW_OPENERS):
                in_row = True
                continue
        self.seasons = seasons

    def printInfo(self):
        """Print the descriptive attributes of the player to stdout."""
        print("Name: " + self.name)
        print("Born: " + self.born)
        print("Nationality: " + self.nationality)
        print("Height/Weight: " + self.height + "/" + self.weight)
        print("Position: " + self.position)
        print("Shoots: " + self.shoots)
        print("Drafted: " + self.drafted)

    def printSeasons(self):
        """Print every season entry of the player."""
        for season in self.seasons:
            season.printSeason()

    def printSeason(self, season):
        """Print the entries whose ``season`` equals *season*."""
        for entry in self.seasons:
            if entry.season == season:
                entry.printSeason()

    def printTeamSeason(self, season, team):
        """Print the entries matching both *season* and *team*."""
        for entry in self.seasons:
            if entry.season == season and entry.team == team:
                entry.printSeason()

    def printAllSeasons(self):
        """Print every season entry; same output as printSeasons()."""
        self.printSeasons()
            
class Team:
    """A team roster scraped from eurohockey.net.

    Constructing a Team downloads the roster page and then builds one Player
    (one further download each) for every player link found on it.
    ``name`` is never assigned by the code in this class.
    """

    name = ''
    # Class-level default only; __init__ rebinds ``players`` on the instance.
    players = []

    _ROSTER_URL = 'http://www.eurohockey.net/players/show_roster.cgi?team='
    # Cheap substring test applied before the regular expression.
    _PLAYER_LINK_MARKER = '/players/show_player.cgi?serial='
    # Raw string so that \? and \d reach the regex engine unchanged.
    # Group 1 is the player id, group 2 the link text.
    _PLAYER_LINK_RE = re.compile(
        r'players/show_player\.cgi\?serial=(\d+)">(.+)</a></td>')
    # Team name printSeason() filters on when none is given. '\xe4' is the
    # letter a-umlaut in ISO-8859-15, the encoding declared for this file.
    DEFAULT_TEAM_NAME = 'K\xe4rp\xe4t Oulu'

    def __init__(self, teamid, league, season):
        """Download the roster and load every player listed on it.

        teamid, league, season -- values of the ``team``, ``league`` and
        ``season`` query parameters; strings or integers.

        Errors raised by urlopen or by Player propagate unchanged.
        """
        url_address = (self._ROSTER_URL + str(teamid)
                       + '&league=' + str(league)
                       + '&season=' + str(season))
        page = urlopen(url_address)
        try:
            player_ids = self._player_ids(page)
        finally:
            # Close the roster connection before the per-player downloads.
            page.close()
        self.players = [Player(player_id) for player_id in player_ids]

    @classmethod
    def _player_ids(cls, lines):
        """Return the player ids linked from the roster *lines*, in order."""
        ids = []
        for line in lines:
            if cls._PLAYER_LINK_MARKER not in line:
                continue
            link = cls._PLAYER_LINK_RE.search(line.strip())
            if link:
                ids.append(link.group(1))
        return ids

    def printSeason(self, season, team=DEFAULT_TEAM_NAME):
        """Print, for every player, the entries for *season* with *team*.

        team -- team name to match exactly; defaults to the name that was
                previously hard-coded, so existing calls behave as before.
        """
        for player in self.players:
            player.printTeamSeason(season, team)
            
class TeamSeason:
    """Plain container pairing a team with a season and its players.

    Nothing in the visible part of this file instantiates it yet.
    """

    # Class-level defaults, kept so the attributes also exist on the class.
    team = ''
    season = ''
    players = []

    def __init__(self, team='', season='', players=None):
        """Create a container; all arguments are optional.

        players -- iterable of players; it is copied so that neither the
                   caller's list nor the class-level default is shared
                   between instances.
        """
        self.team = team
        self.season = season
        self.players = [] if players is None else list(players)

def main():
    """Load one roster and print each player's entries for season '08/09'.

    The three ids are passed to Team as its teamid, league and season
    arguments.
    """
    team_id, league_id, season_id = '91', '4', '109'
    Team(team_id, league_id, season_id).printSeason('08/09')
#    player = Player(4722)

#    player.printSeasons()

# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()