In this exercise we collect historical match data from ESPN. The follow-up exercise, where the data is used to predict future matches, can be found here: http://www.hicksonanalytics.com/Rugby.html

PRELIMINARIES

The XML library is used to fetch ESPN HTML pages and parse the match stats out of the HTML.

require(XML)

PART 1: FINDING THE PAGES ON ESPN

Each page contains the following stats for a given game:

  • Match Events
  • Kick/Pass/Run
  • Attacking
  • Set Pieces
  • Defending
  • Discipline

We need to parametrise the gameID= so that we can run code for more than just one game at a time.

league= is the ID for the league. In this code it can stay static, as we are pulling data only from the Aviva Premiership.

We want data for all matches in each season, across 9 seasons.

By manually navigating to the first game of each season on ESPN we can find the gameID of the first game of each season:

http://www.espn.co.uk/rugby/matchstats?gameId=28141&league=267979 2008/09

http://www.espn.co.uk/rugby/matchstats?gameId=99170&league=267979 2009/10

http://www.espn.co.uk/rugby/matchstats?gameId=118616&league=267979 2010/11

http://www.espn.co.uk/rugby/matchstats?gameId=142486&league=267979 2011/12

http://www.espn.co.uk/rugby/matchstats?gameId=166464&league=267979 2012/13

http://www.espn.co.uk/rugby/matchstats?gameId=188681&league=267979 2013/14

http://www.espn.co.uk/rugby/matchstats?gameId=231903&league=267979 2014/15

http://www.espn.co.uk/rugby/matchstats?gameId=267707&league=267979 2015/16

http://www.espn.co.uk/rugby/matchstats?gameId=289993&league=267979 2016/17

Define a vector Y which contains the gameID for the start of each season.

Y = c(28141, 99170, 118616, 142486, 166464, 188681, 231903, 267707, 289993)

We use the loop below to generate all the gameIDs for each season. There are 132 games (22 * 6) in each season, so we add 131 consecutive IDs to each entry of Y.

loop <- NULL;
for (i in c(1:length(Y))){
 loop = c(loop,Y[i]:(Y[i]+131))
}
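As a quick check, the loop vector should contain 9 * 132 = 1188 gameIDs, starting from the first game of 2008/09:

# Each of the 9 seasons contributes 132 consecutive gameIDs
length(loop)      # 1188
head(loop, 3)     # 28141 28142 28143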

PART 2: PARSING A PAGE

First we clear any match stats objects left over from previous runs of the code.

rm(Summary,KickPass,Attack)
## Warning in rm(Summary, KickPass, Attack): object 'Summary' not found
## Warning in rm(Summary, KickPass, Attack): object 'KickPass' not found
## Warning in rm(Summary, KickPass, Attack): object 'Attack' not found
Overall_Stats <- NULL;
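The warnings above are expected on a first run, since the objects do not exist yet. A minor variation (an assumption, not part of the original code) removes only objects that actually exist and so avoids the warnings:

# Remove only the objects that exist, avoiding 'object not found' warnings
for (obj in c("Summary", "KickPass", "Attack")) {
  # rm(list = ...) takes object names as strings, so exists() can guard it
  if (exists(obj)) rm(list = obj)
}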

Taking the first game in the loop, i is set to 1.

i=1;
message('Processing match ', i, '/',length(loop))   
## Processing match 1/1188

The first gameID is taken out of the loop vector and used to build the required URL.

u = paste("http://www.espn.co.uk/rugby/matchstats?gameId=",loop[i],"&league=267979",sep="");

htmlParse fetches the HTML code and imports it into R.

doc = htmlParse(u);
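Some pages in the gameID range turn out to be empty or unreachable (see the notes at the end of Appendix 1). As a hedged sketch, not part of the original code, the parse could be wrapped in tryCatch so a single bad page does not stop a long run:

# Sketch only: catch download/parse errors and return NULL instead of stopping
doc = tryCatch(htmlParse(u),
               error = function(e) {
                 message("failed to parse ", u, ": ", conditionMessage(e))
                 NULL
               })
# inside the full loop of Part 3 a NULL doc would be skipped with `next`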

getNodeSet extracts the match title and all the required tables. Tackle data is not stored in tables, so it is taken separately.

titleNodes = getNodeSet(doc, "//title")
tableNodes = getNodeSet(doc, "//table")
tackleNodes = getNodeSet(doc, "//span[@class='home-team']")

Using the Title we can extract the Home Team, Away Team and Year.

Match = capture.output(titleNodes[[1]])
HomeTeam = substr(Match,8,regexpr("vs",Match)-2)
AwayTeam = substr(Match,regexpr("vs",Match)+3,regexpr(" - ",Match)-1)
Year = substr(Match,regexpr("20",Match),regexpr("20",Match)+3)
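For illustration (the exact title markup here is an assumption), a title of the form below shows how the substr()/regexpr() calls pick out the three pieces:

# Hypothetical title string, for illustration only
Match = "<title>London Irish vs Wasps - Aviva Premiership 2009</title>"
substr(Match, 8, regexpr("vs", Match) - 2)                          # "London Irish"
substr(Match, regexpr("vs", Match) + 3, regexpr(" - ", Match) - 1)  # "Wasps"
substr(Match, regexpr("20", Match), regexpr("20", Match) + 3)       # "2009"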

readHTMLTable extracts all the required match stats.

Summary = readHTMLTable(tableNodes[[1]])
KickPass = readHTMLTable(tableNodes[[2]])
Attack = readHTMLTable(tableNodes[[3]])

Tackles is extracted separately.

Tackles = capture.output(tackleNodes[1:2])
Tackles = substr(Tackles[c(2,5)],regexpr(">",Tackles[c(2,5)])+1,regexpr("</",Tackles[c(2,5)])-1);
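The result is a "made/attempted" string per team, such as "133/150". A small sketch (not in the original code; the helper name split_tackles is hypothetical) that splits it into two numbers:

# Sketch: split a "made/attempted" tackle string into two numeric values
split_tackles <- function(x) {
  parts <- as.numeric(strsplit(x, "/")[[1]])
  c(made = parts[1], attempted = parts[2])
}
split_tackles("133/150")   # made = 133, attempted = 150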

The code checks whether the page returned an empty title (no data); if so, the row is filled with nulls, otherwise all the match stats are combined into one table.

if (Match == "<title/> ") {
  message("no data found, processing as NULL")
  Stats = matrix(rep("null", 51), 3, 17)
} else {
  Stats = t(rbind(matrix(c(Year, "Year", Year), 1, 3), Summary, KickPass, Attack,
                  matrix(c(Tackles[1], "Tackles", Tackles[2]), 1, 3)))
}

Perform some final manipulations: use the stat names as column names, drop the row of names, and label the rows with the team names.

colnames(Stats) <- Stats[2,]
Stats = Stats[-c(2),]
rownames(Stats) = c(HomeTeam, AwayTeam)
Overall_Stats = rbind(Overall_Stats,Stats)

PART 3: PARSING MULTIPLE PAGES AND OUTPUTTING RESULTS TO CSV

Previously i was set to 1 to process only one page in PART 2.

A loop for (i in c(1:length(loop))){ can be used to run the code for all games in all seasons.

Once all the game stats are parsed they are written to a text file.

# Display Overall_Stats for the game processed above.
Overall_Stats
##              Year   Tries Conversion Goals Penalty Goals
## London Irish "2009" "2"   "2"              "4"          
## Wasps        "2009" "2"   "2"              "0"          
##              Kick Percent Success Kicks From Hand Passes Runs 
## London Irish "100%"               "45"            "81"   "69" 
## Wasps        "100%"               "25"            "114"  "108"
##              Possession 1H/2H Territory 1H/2H Clean Breaks
## London Irish "0% / 0%"        "0% / 0%"       "10"        
## Wasps        "0% / 0%"        "0% / 0%"       "2"         
##              Defenders Beaten Offload Rucks Won        Mauls Won   
## London Irish "18"             "5"     "52 / 52 (100%)" "0 / 0 (0%)"
## Wasps        "17"             "5"     "77 / 78 (98%)"  "0 / 0 (0%)"
##              Turnovers Conceded Tackles  
## London Irish "10"               "133/150"
## Wasps        "16"               "74/92"
# Write Overall_Stats to a text file.
write.table(Overall_Stats, "Overall_Stats.txt")
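The Part 3 heading mentions CSV; if a comma-separated file is preferred, write.csv is a drop-in alternative to the write.table call above (a minor variation, not the original output step):

# Alternative: write a CSV instead of the space-separated text file
write.csv(Overall_Stats, "Overall_Stats.csv")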

APPENDIX 1

The final code looks as below. Note that i now runs over the full loop vector.

# the XML library is used to data scrape match stats from ESPN HTML pages 
require(XML)

#  Example of a page used is:
# http://www.espn.co.uk/rugby/matchstats?gameId=99170&league=267979

# each number in vector Y corresponds to the first game of each season 2008 to 2017
# Years 2009,  2010,  2011,   2012,   2013,   2014,   2015,   2016,   2017
# Y = c(28141, 99164, 118616, 142486, 166464, 188681, 231903, 267707, 289993)

Y = c(28141, 99170, 118616, 142486, 166464, 188681, 231903, 267707, 289993)

# generate the loop vector used to generate all the page ids in each html page on ESPN
loop <- NULL;
for (i in c(1:length(Y))){
 loop = c(loop,Y[i]:(Y[i]+131))
}

# create the Overall_Stats data frame using a loop for each HTML page
rm(Summary,KickPass,Attack)
Overall_Stats <- NULL;
for (i in c(1:length(loop))){   

    message('Processing match ', i, '/',length(loop))       
    u = paste("http://www.espn.co.uk/rugby/matchstats?gameId=",loop[i],"&league=267979",sep="")
    doc = htmlParse(u)
    titleNodes = getNodeSet(doc, "//title")
    tableNodes = getNodeSet(doc, "//table")
    tackleNodes = getNodeSet(doc, "//span[@class='home-team']")
    
    Match = capture.output(titleNodes[[1]])
    HomeTeam = substr(Match,8,regexpr("vs",Match)-2)
    AwayTeam = substr(Match,regexpr("vs",Match)+3,regexpr(" - ",Match)-1)
    Year = substr(Match,regexpr("20",Match),regexpr("20",Match)+3)
    
    Summary = readHTMLTable(tableNodes[[1]])
    KickPass = readHTMLTable(tableNodes[[2]])
    Attack = readHTMLTable(tableNodes[[3]])
    Tackles = capture.output(tackleNodes[1:2])
    Tackles = substr(Tackles[c(2,5)],regexpr(">",Tackles[c(2,5)])+1,regexpr("</",Tackles[c(2,5)])-1)
    
    if (Match == "<title/> ") {
        message("no data found, processing as NULL")
        Stats = matrix(rep("null", 51), 3, 17)
    } else {
        Stats = t(rbind(matrix(c(Year, "Year", Year), 1, 3), Summary, KickPass, Attack,
                        matrix(c(Tackles[1], "Tackles", Tackles[2]), 1, 3)))
    }
    colnames(Stats) <- Stats[2,]
    Stats = Stats[-c(2),]
    rownames(Stats) = c(HomeTeam, AwayTeam)
    Overall_Stats = rbind(Overall_Stats,Stats);
}

# Write the Overall_Stats data frame to a text file.
write.table(Overall_Stats, "Overall_Stats.txt")

# -------- NOTES IN BELOW COMMENTS
# 184, 262:264, 425, 436, 528 empty
# game ids 231999 (BATH QUINS 28/11/14)  empty

FURTHER WORK AND CURRENT ISSUES

Fix 2016 data issues

Fix data gaps for territory and possession

Convert tackles, mauls and rucks into pairs of absolute numbers rather than percentages (a sketch is given after this list)

Automate the CSV output as required to avoid manual manipulation of spreadsheets
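For the third item above, a hedged sketch (the helper name split_won_total is hypothetical) that turns a "won / total (pct%)" string such as "52 / 52 (100%)" into two absolute numbers:

# Sketch: extract the absolute won/total counts from a "won / total (pct%)" string
split_won_total <- function(x) {
  nums <- as.numeric(regmatches(x, gregexpr("[0-9]+", x))[[1]])
  c(won = nums[1], total = nums[2])   # nums[3] is the percentage, ignored here
}
split_won_total("52 / 52 (100%)")    # won = 52, total = 52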