This is an example of how to traverse web pages using Python's urllib and re modules. #data-mining #web-harvest #python #urllib
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
from urllib import urlopen
import re
import sys
class PlayerSeasonInfo:
    """One row of a player's season table.

    The first three cells of the row are exposed as ``team``, ``league`` and
    ``season``; the complete, unmodified row stays available as ``record``.
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    team = ''
    league = ''
    season = 0
    record = []

    def __init__(self, record):
        """Store the row and pick out its leading team/league/season cells.

        record -- sequence of cell strings for one table row.  Rows with
                  fewer than three cells are accepted; the missing leading
                  fields are set to '' instead of raising IndexError.
        """
        self.record = record
        # Pad a copy (never the caller's list) so short rows still unpack.
        leading = (list(record[:3]) + ['', '', ''])[:3]
        self.team, self.league, self.season = leading
        # NOTE(review): the author's disabled code suggests the remaining
        # cells are regular-season and playoff GP/G/A/P/+-/PIM figures.
        # That mapping is not verified here; read them through self.record.

    def printSeason(self):
        """Print team, league, season and the raw record to stdout."""
        print("Team: " + self.team)
        print("League: " + self.league)
        print("Season: " + self.season)
        print("Record: " + repr(self.record))
        # The season is echoed once more on its own line; kept so that the
        # script's output stays exactly as before.
        print(self.season)
class Player:
    """A player's name and season history scraped from eurohockey.net.

    Constructing an instance downloads the player's page, takes the name
    from the ``<h1>`` heading and turns every ``<tr class="even">`` /
    ``<tr class="odd">`` table row into a ``PlayerSeasonInfo``.
    """

    # Class-level defaults.  Only uid, name and seasons are filled in by
    # __init__; the remaining fields are never populated by this class and
    # therefore print as empty strings.
    uid = 0
    name = ''
    born = ''
    nationality = ''
    height = ''
    weight = ''
    position = ''
    shoots = ''
    drafted = ''
    notes = ''
    seasons = []

    @staticmethod
    def _player_url(player_id):
        """Return the profile URL for player_id (string or number)."""
        # %-formatting accepts ints as well as strings, so Player(4722)
        # works just like Player('4722').
        return ("http://www.eurohockey.net/players/show_player.cgi?serial="
                + "%s" % (player_id,))

    def __init__(self, player_id):
        """Download and parse the page of the player with id player_id.

        player_id -- the site's serial number, as a string or a number.
        Prints the requested URL to stdout before fetching it.
        """
        self.uid = player_id
        # Per-instance list so instances never share the class default.
        self.seasons = []
        url_address = self._player_url(self.uid)
        print(url_address)
        page = urlopen(url_address)
        try:
            self._parse_page(page)
        finally:
            # Release the connection even if parsing fails.
            page.close()

    def _parse_page(self, lines):
        """Fill self.name and self.seasons from the page's lines."""
        in_row = False
        fields = []
        for line in lines:
            if in_row:
                if '</tr' in line:
                    # Row finished: turn the collected cells into a season.
                    in_row = False
                    self.seasons.append(PlayerSeasonInfo(fields))
                    fields = []
                    continue
                cell = re.search('<td>(.+)</td>', line.strip())
                if cell:
                    fields.append(cell.group(1))
            if '<h1>' in line:
                # Player name; skipped when the heading is not closed on
                # the same line instead of failing on a missing match.
                heading = re.search('<h1>(.+)</h1>', line.strip())
                if heading:
                    self.name = heading.group(1)
            if '<tr class="even"' in line or '<tr class="odd"' in line:
                # Cells of this row start on the following lines.
                in_row = True

    def printInfo(self):
        """Print the player's profile fields to stdout."""
        print("Name: " + self.name)
        print("Born: " + self.born)
        print("Nationality: " + self.nationality)
        print("Height/Weight: " + self.height + "/" + self.weight)
        print("Position: " + self.position)
        print("Shoots: " + self.shoots)
        print("Drafted: " + self.drafted)

    def printSeasons(self):
        """Print every season of the player."""
        for season in self.seasons:
            season.printSeason()

    def printSeason(self, season):
        """Print the entries whose season equals the given season."""
        for tmp_season in self.seasons:
            if tmp_season.season == season:
                tmp_season.printSeason()

    def printTeamSeason(self, season, team):
        """Print the entries matching both the given season and team."""
        for tmp_season in self.seasons:
            if tmp_season.season == season and tmp_season.team == team:
                tmp_season.printSeason()

    def printAllSeasons(self):
        """Print every season; same output as printSeasons."""
        self.printSeasons()
class Team:
    """Roster of one team for a given league and season on eurohockey.net.

    Constructing an instance downloads the roster page and builds a
    ``Player`` (one further download each) for every player link found.
    """

    # Class-level defaults; name is never set by this class.
    name = ''
    players = []

    def __init__(self, teamid, league, season):
        """Fetch the roster page and load every listed player.

        teamid, league, season -- the site's identifiers, passed through to
        the roster URL; strings or numbers are accepted.
        """
        # Per-instance list so instances never share the class default.
        self.players = []
        url_address = (
            'http://www.eurohockey.net/players/show_roster.cgi?team='
            + '%s' % (teamid,)
            + '&league=' + '%s' % (league,)
            + '&season=' + '%s' % (season,)
        )
        page = urlopen(url_address)
        try:
            for line in page:
                if '/players/show_player.cgi?serial=' not in line:
                    continue
                # Raw string: same pattern as before, without relying on
                # Python passing unknown escapes such as \d through.
                player_id = re.search(
                    r'players/show_player.cgi\?serial=(\d+)">(.+)</a></td>',
                    line.strip())
                if player_id:
                    self.players.append(Player(player_id.group(1)))
        finally:
            # Release the connection even if a player page fails to load.
            page.close()

    def printSeason(self, season, team='K\xe4rp\xe4t Oulu'):
        """Print each player's entries for the given season and team.

        season -- season label compared against each entry, e.g. '08/09'.
        team   -- team name to match; defaults to the previously
                  hard-coded 'K\xe4rp\xe4t Oulu'.
        """
        for player in self.players:
            player.printTeamSeason(season, team)
class TeamSeason:
    """Placeholder pairing a team with a season and a list of players.

    Only class-level defaults are declared; the class has no methods.
    """

    # Empty defaults: two empty strings and an empty list.
    players = list()
    team = season = str()
def main():
    """Load the roster for team '91', league '4', season '109' and print
    the 08/09 entries of its players."""
    team_id, league_id, season_id = '91', '4', '109'
    roster = Team(team_id, league_id, season_id)
    roster.printSeason('08/09')
    # Single-player alternative left disabled by the author:
    #   player = Player(4722)
    #   player.printSeasons()


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()