library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.1     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding

Web scraping

Web scraping is a technique for extracting useful information from webpages. But why do you need it? Not all companies provide an API, so sometimes you will need to scrape information directly from their websites.

HTML and XML

Here is an example of a simple HTML page:

<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h1>This is a Heading</h1>
<p>This is a paragraph.</p>
</body>
</html>

Of course, there are a lot of different tags. Go to https://www.w3schools.com/html/ to see some html basics. Nevertheless, all these tags are predefined so your browser knows what each tag means.

(We only need to know a bit of HTML in order to do web scraping.)

Look at web page source code

It is important to identify the thing that you want to scrape in the webpage. The best way to do it is to use the inspect function in the Chrome browser.

CSS selector

A CSS selector is the most common way to locate an object in a webpage.

  • tag: the name of the tag, for example <td> and <a> tags
  • .class: class of an object
  • #id: id of an object

imdb example

Suppose we want to get the list of top-rated movies from https://www.imdb.com/chart/top/?ref_=nv_mv_250

We see that all movie names are under the <td>/<a> nodes. The <td> nodes have the class name titleColumn.

# Download and parse the IMDb Top 250 chart page.
html <- read_html("https://www.imdb.com/chart/top/?ref_=nv_mv_250")
# Every <td> node on the page; this is broader than needed, because we only
# want the cells with class `titleColumn`.
td_nodes <- html_nodes(html, "td")
# Narrow the selection to the <td> nodes that carry class `titleColumn`.
title_columns <- html_nodes(html, "td.titleColumn")
# Look up an <a> node inside each of those title cells.
a_nodes <- html_node(title_columns, "a")
# html_text() extracts the text between <a> and </a>.
movie_names <- html_text(a_nodes)
head(movie_names)
## [1] "The Shawshank Redemption" "The Godfather"           
## [3] "The Godfather: Part II"   "The Dark Knight"         
## [5] "12 Angry Men"             "Schindler's List"

Put everything together

# One pipeline: title cells -> <a> node in each cell -> link text.
movie_names <-
  html %>%
  html_nodes(css = "td.titleColumn") %>%
  html_node(css = "a") %>%
  html_text()

or even (faster but less readable)

# A single descendant selector ("td.titleColumn a") replaces the two-step
# lookup of cells followed by links.
movie_names <- html_text(html_nodes(html, "td.titleColumn a"))

Now, we also want to capture the ratings.

# Rating cells -> the <strong> node in each cell -> its text. The ratings are
# kept as character strings exactly as html_text() returns them.
imdb_ratings <- html_text(
  html_node(html_nodes(html, "td.ratingColumn.imdbRating"), "strong")
)
# Pairing the two vectors by position only works if there are no missing
# values, i.e. every movie contributes both a title and a rating.
tibble(title = movie_names, rating = imdb_ratings)
## # A tibble: 250 x 2
##    title                                             rating
##    <chr>                                             <chr> 
##  1 The Shawshank Redemption                          9.2   
##  2 The Godfather                                     9.1   
##  3 The Godfather: Part II                            9.0   
##  4 The Dark Knight                                   9.0   
##  5 12 Angry Men                                      8.9   
##  6 Schindler's List                                  8.9   
##  7 The Lord of the Rings: The Return of the King     8.9   
##  8 Pulp Fiction                                      8.8   
##  9 The Good, the Bad and the Ugly                    8.8   
## 10 The Lord of the Rings: The Fellowship of the Ring 8.8   
## # … with 240 more rows

What if you also want to get the years? There is also a handy function, html_table().

# html_table() converts the whole chart <table> into a data frame in one call.
# The "Rank & Title" column is then split on newlines into rank, title and
# year.
html %>%
  html_node(css = "table.chart.full-width") %>%
  html_table() %>%
  select(rank_and_title = `Rank & Title`, rating = `IMDb Rating`) %>%
  separate(
    col = rank_and_title,
    into = c("rank", "title", "year"),
    sep = "\n"
  )
##    rank                                                   title           year
## 1    1.                                The Shawshank Redemption         (1994)
## 2    2.                                           The Godfather         (1972)
## 3    3.                                  The Godfather: Part II         (1974)
## 4    4.                                         The Dark Knight         (2008)
## 5    5.                                            12 Angry Men         (1957)
## 6    6.                                        Schindler's List         (1993)
## 7    7.           The Lord of the Rings: The Return of the King         (2003)
## 8    8.                                            Pulp Fiction         (1994)
## 9    9.                          The Good, the Bad and the Ugly         (1966)
## 10  10.       The Lord of the Rings: The Fellowship of the Ring         (2001)
## 11  11.                                              Fight Club         (1999)
## 12  12.                                            Forrest Gump         (1994)
## 13  13.                                               Inception         (2010)
## 14  14.                   The Lord of the Rings: The Two Towers         (2002)
## 15  15.          Star Wars: Episode V - The Empire Strikes Back         (1980)
## 16  16.                                              The Matrix         (1999)
## 17  17.                                              Goodfellas         (1990)
## 18  18.                         One Flew Over the Cuckoo's Nest         (1975)
## 19  19.                                           Seven Samurai         (1954)
## 20  20.                                                   Se7en         (1995)
## 21  21.                                       Life Is Beautiful         (1997)
## 22  22.                                             City of God         (2002)
## 23  23.                                The Silence of the Lambs         (1991)
## 24  24.                                   It's a Wonderful Life         (1946)
## 25  25.                      Star Wars: Episode IV - A New Hope         (1977)
## 26  26.                                     Saving Private Ryan         (1998)
## 27  27.                                          The Green Mile         (1999)
## 28  28.                                           Spirited Away         (2001)
## 29  29.                                            Interstellar         (2014)
## 30  30.                                                Parasite         (2019)
## 31  31.                                  Léon: The Professional         (1994)
## 32  32.                                               Hara-Kiri         (1962)
## 33  33.                                      The Usual Suspects         (1995)
## 34  34.                                           The Lion King         (1994)
## 35  35.                                             The Pianist         (2002)
## 36  36.                                      Back to the Future         (1985)
## 37  37.                              Terminator 2: Judgment Day         (1991)
## 38  38.                                      American History X         (1998)
## 39  39.                                            Modern Times         (1936)
## 40  40.                                               Gladiator         (2000)
## 41  41.                                                  Psycho         (1960)
## 42  42.                                            The Departed         (2006)
## 43  43.                                             City Lights         (1931)
## 44  44.                                        The Intouchables         (2011)
## 45  45.                                                Whiplash         (2014)
## 46  46.                                  Grave of the Fireflies         (1988)
## 47  47.                                            The Prestige         (2006)
## 48  48.                            Once Upon a Time in the West         (1968)
## 49  49.                                              Casablanca         (1942)
## 50  50.                                         Cinema Paradiso         (1988)
##    rating
## 1     9.2
## 2     9.1
## 3     9.0
## 4     9.0
## 5     8.9
## 6     8.9
## 7     8.9
## 8     8.8
## 9     8.8
## 10    8.8
## 11    8.8
## 12    8.8
## 13    8.7
## 14    8.7
## 15    8.7
## 16    8.6
## 17    8.6
## 18    8.6
## 19    8.6
## 20    8.6
## 21    8.6
## 22    8.6
## 23    8.6
## 24    8.6
## 25    8.6
## 26    8.6
## 27    8.5
## 28    8.5
## 29    8.5
## 30    8.5
## 31    8.5
## 32    8.5
## 33    8.5
## 34    8.5
## 35    8.5
## 36    8.5
## 37    8.5
## 38    8.5
## 39    8.5
## 40    8.5
## 41    8.5
## 42    8.5
## 43    8.5
## 44    8.5
## 45    8.5
## 46    8.5
## 47    8.5
## 48    8.4
## 49    8.4
## 50    8.4
##  [ reached 'max' / getOption("max.print") -- omitted 200 rows ]

An alternative approach without html_table.

# All <tr> nodes in the body of the chart table.
rows <- html_nodes(html, "table.chart.full-width tbody tr")
# Build a one-row tibble from each <tr> and row-bind the results.
map_dfr(rows, function(tr) {
  title_text <- html_text(html_node(tr, "td.titleColumn a"))
  rating_cell <- html_node(tr, "td.ratingColumn.imdbRating")
  rating_text <- html_text(html_node(rating_cell, "strong"))
  tibble(title = title_text, imdb_ratings = rating_text)
})
## # A tibble: 250 x 2
##    title                                             imdb_ratings
##    <chr>                                             <chr>       
##  1 The Shawshank Redemption                          9.2         
##  2 The Godfather                                     9.1         
##  3 The Godfather: Part II                            9.0         
##  4 The Dark Knight                                   9.0         
##  5 12 Angry Men                                      8.9         
##  6 Schindler's List                                  8.9         
##  7 The Lord of the Rings: The Return of the King     8.9         
##  8 Pulp Fiction                                      8.8         
##  9 The Good, the Bad and the Ugly                    8.8         
## 10 The Lord of the Rings: The Fellowship of the Ring 8.8         
## # … with 240 more rows

Now, we want to get the URL link to the movie “The Shawshank Redemption”.

# The <a> node found inside each title cell.
title_links <- html_node(html_nodes(html, "td.titleColumn"), "a")
# Logical mask: TRUE where the link text is exactly the movie title.
is_shawshank <- html_text(title_links) == "The Shawshank Redemption"
# Keep only the matching link(s) and read their href attribute.
shawshank_url <- html_attr(keep(title_links, is_shawshank), "href")

Alternatively, we could use the CSS pseudo-class :contains().

# Let the selector do the filtering: :contains() matches on the link text, so
# no separate filtering step is needed before reading the href.
shawshank_selector <- 'td.titleColumn a:contains("The Shawshank Redemption")'
shawshank_url <- html_attr(html_node(html, shawshank_selector), "href")

But it is not the complete URL; we need to combine it with the site's base URL.

# Resolve the scraped href against the site root. url_absolute() joins the two
# parts correctly whether or not the href starts with "/", whereas pasting
# "https://www.imdb.com/" in front of a root-relative href would produce a
# double slash ("https://www.imdb.com//title/...").
shawshank_full_url <- xml2::url_absolute(shawshank_url, "https://www.imdb.com/")

Then we could further scrape things from shawshank_full_url.

Stackoverflow example

Besides using node class, you could also search a node by its id.

Here we are first extracting the div node with id="questions".

# Newest questions tagged "r", 50 per page.
so_url <- "https://stackoverflow.com/questions/tagged/r?tab=newest&pagesize=50"
# Start from the <div> whose id is "questions" ...
question_list <- html_node(read_html(so_url), "div#questions")
# ... then take each summary block inside it ...
summaries <- html_nodes(question_list, "div.summary")
# ... and print the text of the <a> inside each block's <h3>.
html_text(html_node(html_node(summaries, "h3"), "a"))
##  [1] "“Dynamic” Function to Calculate Multiple Columns Simultaneously in R"                                                                 
##  [2] "correlation at the end of a pipe operation"                                                                                           
##  [3] "Not able to use ggplot and gives an error"                                                                                            
##  [4] "How to use my dataset to create suitable mixed linear model and pass model check? [migrated]"                                         
##  [5] "Nested lappy in r"                                                                                                                    
##  [6] "Add levels missing in one group to summary table using dplyr"                                                                         
##  [7] "comparing groups and filter common observations using dplyr"                                                                          
##  [8] "Resize table with pictures in Rmarkdown"                                                                                              
##  [9] "tmap is creating a legend (range of values?) for a cropped rasterlayer"                                                               
## [10] "How to set maximum number of fails in testthat?"                                                                                      
## [11] "Implementation of average power function of noise (APFN)"                                                                             
## [12] "How to set column names in R by repeating character? [duplicate]"                                                                     
## [13] "R - 'for' loop initial declarations are only allowed in C99 mode when installing processx package on linux redhat distribution"       
## [14] "Keep station identifier with leading zero numeric"                                                                                    
## [15] "Forcing a predictor into a forward step linear model in r"                                                                            
## [16] "How can I create a loop to make my code more efficient in R?"                                                                         
## [17] "Error in rexp(n, rate) : argument “rate” is missing, with no default"                                                                 
## [18] "RStudio Sweave error -> exit code -1073740791"                                                                                        
## [19] "Prevent hypervolume calculations into negative trait space in R"                                                                      
## [20] "Dynamic vertex attribute in ndtv"                                                                                                     
## [21] "Removing \\\" from string with cat()"                                                                                                 
## [22] "extract specific value by matching name"                                                                                              
## [23] "Function to discord minimum value from a vector not working"                                                                          
## [24] "OMPR- trouble adding constraint"                                                                                                      
## [25] "How to find X coordinate when Y = 0.3?"                                                                                               
## [26] "Saving list with 120 dataframes (11.1 GB)"                                                                                            
## [27] "R - identify sequences in a vector"                                                                                                   
## [28] "Rapid subsetting of a rast brick"                                                                                                     
## [29] "Build R package vignettes with paged data frames"                                                                                     
## [30] "How to provide an optional fill parameter in ggplot2 graph?"                                                                          
## [31] "Assign values from one data frame to another based on subject numbers [duplicate]"                                                    
## [32] "Multiple variables in one bar plot in R"                                                                                              
## [33] "How to reduce the vertical white space between plots using ggarrange in r"                                                            
## [34] "k-folds cross validation to tune a hyperparameter for classification in R"                                                            
## [35] "Attempt to apply non-function error, keras_predict"                                                                                   
## [36] "rstan packages install from github"                                                                                                   
## [37] "How to solve this R package tidyverse error"                                                                                          
## [38] "How would you create R code to count black and white pixels in an image"                                                              
## [39] "Pivot Wider table dyplr R"                                                                                                            
## [40] "Multi-line time series color-coded by attribute in another dataframe (plotly/ggplot2 on pandas/R)"                                    
## [41] "Cannot figure out why object does not exist"                                                                                          
## [42] "Line segments coming out of the interior of a map of brazil using the ggrepel package"                                                
## [43] "How to extract number within but excluding brackets with str_extract() from package stringr?"                                         
## [44] "R dataset from long to wide - under a specific condition [duplicate]"                                                                 
## [45] "linear programming optimization in R?"                                                                                                
## [46] "Avoiding “Error: Aesthetics must be either length 1 or the same as the data (9): y” in geom_errorbar"                                 
## [47] "Estimate and 95% CI for planned contrasts in R"                                                                                       
## [48] "How to separate outliers and far outliers in boxplot"                                                                                 
## [49] "How to create multiple new columns based of off groups of columns that start with a certain prefix and also contain a certain string?"
## [50] "Error in if statement with NA in data in R [duplicate]"

Scraping dynamic webpages

rvest is only able to scrape static web pages. If you want to scrape dynamic web pages, you will need to control a browser programmatically.

PS: Please note that dynamic scraping is not suitable for use in Shiny applications.

NBA site

PS: Actually, nba.com has an undocumented API; see https://github.com/seemethere/nba_py/wiki/stats.nba.com-Endpoint-Documentation

# Start a chromedriver server on a randomly chosen port.
port <- httpuv::randomPort()
server <- wdman::chrome(port = port, version = "87.0.4280.87", verbose = FALSE)
# remoteDriver() comes from RSelenium, which is not attached by the library()
# calls at the top of this file, so it is namespace-qualified like the other
# helpers used in this chunk.
rd <- RSelenium::remoteDriver(port = port)
rd$open(silent = TRUE)
rd$navigate("https://stats.nba.com/leaders/?SeasonType=Regular%20Season")
# The stats table may not be in the page source immediately after navigating.
# Keep retrying (for up to 5 seconds) while the failure message mentions
# "xml_missing" -- presumably the error raised when html_node() found no
# table yet; confirm against the retry package documentation.
retry::retry({
  rd$getPageSource()[[1]] %>%
    read_html() %>%
    html_node("div.nba-stat-table__overflow table") %>%
    html_table()
  },
  when = "xml_missing",
  timeout = 5
)  
##   #                Player GP  MIN  PTS  FGM  FGA  FG% 3PM  3PA  3P% FTM  FTA
## 1 1          Bradley Beal 30 35.7 33.2 11.7 24.1 48.5 2.4  7.1 33.8 7.4  8.3
## 2 2           Joel Embiid 29 32.8 29.8  9.3 17.9 52.1 1.2  2.9 41.7 9.9 11.6
## 3 3        Damian Lillard 32 35.9 29.6  9.3 20.8 44.5 4.2 11.1 38.0 6.9  7.4
## 4 4         Stephen Curry 34 34.1 29.5  9.7 20.3 47.8 4.8 11.7 41.3 5.3  5.6
## 5 5 Giannis Antetokounmpo 33 33.9 29.2 10.5 19.0 55.5 1.2  4.0 28.6 6.9 10.4
## 6 6           Luka Doncic 32 35.2 28.6 10.0 20.9 47.8 2.7  7.4 35.7 6.0  8.0
## 7 7           Zach LaVine 33 35.6 28.5 10.2 19.6 52.2 3.5  8.1 43.3 4.5  5.3
## 8 8          Nikola Jokic 34 35.9 27.1 10.4 18.4 56.6 1.6  3.8 41.1 4.6  5.2
## 9 9         Kawhi Leonard 29 34.7 26.8  9.7 18.9 51.1 2.0  5.1 38.9 5.5  6.3
##    FT% OREB DREB  REB AST STL BLK TOV  EFF
## 1 89.9  1.4  4.0  5.4 4.8 1.4 0.4 3.3 28.7
## 2 86.0  2.1  9.2 11.3 3.3 1.2 1.3 3.2 33.6
## 3 93.2  0.5  3.9  4.3 8.1 1.0 0.3 3.3 27.9
## 4 93.8  0.5  4.9  5.4 6.4 1.3 0.1 3.1 28.5
## 5 66.3  1.9  9.9 11.8 5.9 1.2 1.4 3.7 33.8
## 6 75.7  0.8  7.6  8.4 9.0 1.0 0.7 4.2 30.7
## 7 85.1  0.7  4.6  5.3 5.0 1.1 0.4 3.9 26.2
## 8 88.2  2.8  8.2 11.0 8.5 1.6 0.7 3.1 37.2
## 9 87.4  1.0  5.3  6.3 4.9 1.6 0.6 1.9 28.2
##  [ reached 'max' / getOption("max.print") -- omitted 41 rows ]

Loop over the table pages by clicking the "next" button.

# Number of result pages to walk through.
n_pages <- 6
# Collect one data frame per page in a preallocated list and bind them once at
# the end, instead of growing `leader` with bind_rows() on every iteration.
pages <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  # Parse the table currently shown in the browser.
  pages[[i]] <- rd$getPageSource()[[1]] %>%
    read_html() %>%
    html_node("div.nba-stat-table__overflow table") %>%
    html_table()
  # Advance to the next page; nothing is left to read after the last one, so
  # skip the click there.
  if (i < n_pages) {
    nextbutton <- rd$findElement("css", "a.stats-table-pagination__next")
    nextbutton$clickElement()
  }
}
leader <- bind_rows(pages)
leader
##   #                Player GP  MIN  PTS  FGM  FGA  FG% 3PM  3PA  3P% FTM  FTA
## 1 1          Bradley Beal 30 35.7 33.2 11.7 24.1 48.5 2.4  7.1 33.8 7.4  8.3
## 2 2           Joel Embiid 29 32.8 29.8  9.3 17.9 52.1 1.2  2.9 41.7 9.9 11.6
## 3 3        Damian Lillard 32 35.9 29.6  9.3 20.8 44.5 4.2 11.1 38.0 6.9  7.4
## 4 4         Stephen Curry 34 34.1 29.5  9.7 20.3 47.8 4.8 11.7 41.3 5.3  5.6
## 5 5 Giannis Antetokounmpo 33 33.9 29.2 10.5 19.0 55.5 1.2  4.0 28.6 6.9 10.4
## 6 6           Luka Doncic 32 35.2 28.6 10.0 20.9 47.8 2.7  7.4 35.7 6.0  8.0
## 7 7           Zach LaVine 33 35.6 28.5 10.2 19.6 52.2 3.5  8.1 43.3 4.5  5.3
## 8 8          Nikola Jokic 34 35.9 27.1 10.4 18.4 56.6 1.6  3.8 41.1 4.6  5.2
## 9 9         Kawhi Leonard 29 34.7 26.8  9.7 18.9 51.1 2.0  5.1 38.9 5.5  6.3
##    FT% OREB DREB  REB AST STL BLK TOV  EFF
## 1 89.9  1.4  4.0  5.4 4.8 1.4 0.4 3.3 28.7
## 2 86.0  2.1  9.2 11.3 3.3 1.2 1.3 3.2 33.6
## 3 93.2  0.5  3.9  4.3 8.1 1.0 0.3 3.3 27.9
## 4 93.8  0.5  4.9  5.4 6.4 1.3 0.1 3.1 28.5
## 5 66.3  1.9  9.9 11.8 5.9 1.2 1.4 3.7 33.8
## 6 75.7  0.8  7.6  8.4 9.0 1.0 0.7 4.2 30.7
## 7 85.1  0.7  4.6  5.3 5.0 1.1 0.4 3.9 26.2
## 8 88.2  2.8  8.2 11.0 8.5 1.6 0.7 3.1 37.2
## 9 87.4  1.0  5.3  6.3 4.9 1.6 0.6 1.9 28.2
##  [ reached 'max' / getOption("max.print") -- omitted 247 rows ]
# Finally, clean up: close the browser session that rd$open() started, then
# stop the chromedriver server itself. Stopping only the server would leave
# the session unclosed.
rd$close()
server$stop()
## [1] TRUE

Advanced topics

  • Headless browser
  • XPaths