# Load necessary packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, janitor, httr2,
               jsonlite, hrbrthemes, timeperiodsR, listviewer, RSelenium, netstat)
Today we’ll be using SelectorGadget, which is a Chrome extension that makes it easy to discover CSS selectors. (Install the extension directly here.) Please note that SelectorGadget is only available for Chrome. If you prefer using Firefox, then you can try ScrapeMate.
There are actually two ways that web content gets rendered in a browser: (1) server-side, where the server sends finished HTML to your browser, and (2) client-side, where JavaScript builds the page inside your browser. rvest handles the former; tools like RSelenium are needed for the latter.
You can read here for more details (including example scripts).
Just because you can scrape data doesn’t mean you should. While US courts have held that scraping public data is legal (per hiQ Labs v. LinkedIn), it’s essential to act responsibly:
Mantra: Be nice to the web.
rvest: A tidyverse-inspired R package for server-side web scraping, similar to Python’s Beautiful Soup.
Key Requirement: Understand CSS selectors to effectively extract data from webpages.
CSS (Cascading Style Sheets): Defines how HTML elements are styled and displayed.
Key Concepts:
- Selectors: patterns that identify which HTML elements a rule applies to (e.g. h1 for headers, h2 for sub-headers).
- SelectorGadget: A tool to identify CSS selectors for isolating desired webpage content.
➡️ Recommended: Review the SelectorGadget vignette before proceeding.
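As a quick illustration of how selectors drive rvest, here is a minimal sketch against example.com (a page that exists purely for demonstrations):

```r
library(rvest)

# Select every <h1> element on the page and extract its text
read_html("https://example.com") %>%
  html_elements("h1") %>%
  html_text()
#> [1] "Example Domain"
```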
Task: Scrape the Wikipedia page on the Men’s 100 metres world record progression.
Steps:
1. Use rvest::read_html() to read the entire page for further analysis.

url <- "http://en.wikipedia.org/wiki/Men%27s_100_metres_world_record_progression"
m100 <- read_html(url)
m100
{html_document}
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body class="skin--responsive skin-vector skin-vector-search-vue mediawik ...
The page is read as an XML document, which includes everything required to render the Wikipedia page—similar to viewing a full LaTeX document when you only need specific tables.
Key Insight:
XML structures provide access to all elements of a webpage, but our goal is to extract only the relevant data (e.g., specific tables).
➡️ Next step: Isolate the tables using CSS selectors or dedicated functions.
Step 1: Identify the Table’s CSS Selector
Use SelectorGadget to pinpoint the unique CSS selector for the desired table. This tool helps isolate the table’s content by visually highlighting the relevant elements on the webpage.
Step 2: Extract the Table in R
Once the CSS selector is identified, use rvest functions to extract the data.
pre_iaaf <-
m100 %>%
html_element("#mw-content-text > div.mw-content-ltr.mw-parser-output > table:nth-child(11)") %>% # selector from SelectorGadget
html_table() %>%          # parse the HTML table into a data frame
clean_names() %>%         # janitor: standardise the column names
mutate(date = mdy(date))  # convert the date strings to proper Dates
pre_iaaf
# A tibble: 21 × 5
time athlete nationality location_of_races date
<dbl> <chr> <chr> <chr> <date>
1 10.8 Luther Cary United States Paris, France 1891-07-04
2 10.8 Cecil Lee United Kingdom Brussels, Belgium 1892-09-25
3 10.8 Étienne De Ré Belgium Brussels, Belgium 1893-08-04
4 10.8 L. Atcherley United Kingdom Frankfurt/Main, Germany 1895-04-13
5 10.8 Harry Beaton United Kingdom Rotterdam, Netherlands 1895-08-28
6 10.8 Harald Anderson-Arbin Sweden Helsingborg, Sweden 1896-08-09
7 10.8 Isaac Westergren Sweden Gävle, Sweden 1898-09-11
8 10.8 Isaac Westergren Sweden Gävle, Sweden 1899-09-10
9 10.8 Frank Jarvis United States Paris, France 1900-07-14
10 10.8 Walter Tewksbury United States Paris, France 1900-07-14
# ℹ 11 more rows
If SelectorGadget isn’t available or feels cumbersome, try using your browser’s “Inspect Element” feature for precise CSS selector identification.
Example (Google Chrome): right-click the element you want, choose Inspect, then in the Elements panel right-click the highlighted node and select Copy → Copy selector.
This method provides a quick and precise way to extract CSS selectors without additional tools.
➡️ Next step: Use ggplot2 to plot the world-record progression over time (a sketch follows).
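A minimal sketch, plotting the pre_iaaf tibble scraped above:

```r
# World records arrive as discrete improvements, so a step plot fits well
pre_iaaf %>%
  ggplot(aes(x = date, y = time)) +
  geom_step() +
  geom_point(alpha = 0.6) +
  labs(
    x = "Date", y = "Time (seconds)",
    title = "Men's 100 metres world record progression (pre-IAAF era)",
    caption = "Source: Wikipedia"
  )
```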
Next, let’s turn to a live e-commerce example: computer speakers sold on Rozetka.

base_url <- "https://rozetka.com.ua/kompyuternie-kolonki/c4671536/seller=rozetka/"
rozetka <- read_html(base_url)
rozetka
{html_document}
<html lang="uk" data-critters-container="">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body style="overflow-anchor: none;">\n<!--nghm-->\n <rz-app-root ...
After iterative clicking with SelectorGadget, the relevant CSS selectors for the desired elements on this page are:
- .goods-tile__title: product title
- .goods-tile__price-value: price

Rather than parsing the matched nodes as a table, we’ll parse them as simple text via html_text(). This will yield a vector of strings, which we’ll re-assign to the same speakers object.
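A sketch of the extraction step that produces the strings below, using the selector string given above:

```r
# Grab title and price nodes in document order, then flatten them to text
speakers <-
  rozetka %>%
  html_elements(".goods-tile__price-value , .goods-tile__title") %>%
  html_text()
head(speakers, 20)
```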
[1] " Акустична система Edifier R1080BT Black "
[2] "3 999₴"
[3] " Акустична система Real-El S-111 Black (EL121100010) "
[4] "650₴"
[5] " Акустична система XTRIKE ME SK-503 RGB "
[6] "485₴"
[7] " Акустична система GamePro RGB (GS290) "
[8] "429₴"
[9] " Акустична система GamePro RGB (GS629) "
[10] "849₴"
[11] " Акустична система XTRIKE ME SK-610 LED "
[12] "649₴"
[13] " Акустична система Edifier R1100 "
[14] "4 199₴"
[15] " Акустична система Trust Gemi RGB 2.0 Black (22948) "
[16] "483₴"
[17] " Акустична система Defender SPK-225 (65220) "
[18] "455₴"
[19] " Акустична система Real-El S-20 Black "
[20] "299₴"
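The tibble below comes from pairing up that alternating title/price vector; a sketch of the reshaping step, mirroring the pattern used in the scraping function further down:

```r
speakers_tbl <-
  speakers %>%
  matrix(nrow = 2) %>% # fill column-wise: title on row 1, price on row 2
  t() %>%              # transpose so each product is one row
  as_tibble()
head(speakers_tbl)
```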
# A tibble: 6 × 2
V1 V2
<chr> <chr>
1 " Акустична система Edifier R1080BT Black " 3 999₴
2 " Акустична система Real-El S-111 Black (EL121100010) " 650₴
3 " Акустична система XTRIKE ME SK-503 RGB " 485₴
4 " Акустична система GamePro RGB (GS290) " 429₴
5 " Акустична система GamePro RGB (GS629) " 849₴
6 " Акустична система XTRIKE ME SK-610 LED " 649₴
Let’s automate the process of extracting data from multiple pages.
speakers_scrape <-
function(x) {
cat("Scraping page", x, "\n")
url = paste0(
'https://rozetka.com.ua/kompyuternie-kolonki/c4671536/',
x,
'sell_status=available;tip-238871=akusticheskaya-sistema'
)
speakers <-
read_html(url) %>%
html_elements(".goods-tile__price-value , .goods-tile__title") %>%
html_text() %>%
matrix(nrow = 2) %>%
t() %>%
as_tibble()
Sys.sleep(1) # be polite to the server :)
return(speakers)
}
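Note that pages isn’t defined in the excerpt; judging by the progress messages printed below, a definition along these lines is assumed:

```r
# Path fragments for the first three result pages (assumed from the log below)
pages <- paste0("page=", 1:3, ";")
```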
speakers_rozetka <-
lapply(pages, speakers_scrape) %>%
bind_rows()
Scraping page page=1;
Scraping page page=2;
Scraping page page=3;
speakers_rozetka_tbl <- speakers_rozetka %>%
rename(title = V1,
price = V2) %>%
mutate(title = title %>% str_replace_all('Акустична система', '') %>% str_trim(),
price = price %>% str_replace('₴', '') %>% str_replace_all('[^ -~]+', "") %>% as.numeric() # remove non-ASCII characters and convert to numeric
)
speakers_rozetka_tbl
# A tibble: 180 × 2
title price
<chr> <dbl>
1 Edifier R1080BT Black 3999
2 Real-El S-111 Black (EL121100010) 650
3 XTRIKE ME SK-503 RGB 485
4 GamePro RGB (GS290) 429
5 Edifier R1100 4199
6 Trust Gemi RGB 2.0 Black (22948) 483
7 Defender SPK-225 (65220) 455
8 Real-El S-20 Black 299
9 F&D T-60X Pro 12599
10 Edifier R1080BT White 3999
# ℹ 170 more rows
APIs (Application Programming Interfaces) are sets of rules that enable different software applications to interact and share data.
For more details, check out An Introduction to APIs by Zapier.
API endpoints are URLs that provide direct access to data from a server’s API database. While they resemble normal website URLs, they return data in formats like JSON or XML instead of rich HTML content.
NYC Open Data provides a wealth of public data from various city agencies. You can access datasets on topics ranging from arrests to city job postings and street trees.
For this example, we’ll download data from the 2015 NYC Street Tree Census:
nyc_trees <-
fromJSON("https://data.cityofnewyork.us/resource/uvpi-gqnh.json") %>%
as_tibble()
nyc_trees
# A tibble: 1,000 × 45
tree_id block_id created_at tree_dbh stump_diam curb_loc status health
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 180683 348711 2015-08-27T00:00… 3 0 OnCurb Alive Fair
2 200540 315986 2015-09-03T00:00… 21 0 OnCurb Alive Fair
3 204026 218365 2015-09-05T00:00… 3 0 OnCurb Alive Good
4 204337 217969 2015-09-05T00:00… 10 0 OnCurb Alive Good
5 189565 223043 2015-08-30T00:00… 21 0 OnCurb Alive Good
6 190422 106099 2015-08-30T00:00… 11 0 OnCurb Alive Good
7 190426 106099 2015-08-30T00:00… 11 0 OnCurb Alive Good
8 208649 103940 2015-09-07T00:00… 9 0 OnCurb Alive Good
9 209610 407443 2015-09-08T00:00… 6 0 OnCurb Alive Good
10 192755 207508 2015-08-31T00:00… 21 0 OffsetF… Alive Fair
# ℹ 990 more rows
# ℹ 37 more variables: spc_latin <chr>, spc_common <chr>, steward <chr>,
# guards <chr>, sidewalk <chr>, user_type <chr>, problems <chr>,
# root_stone <chr>, root_grate <chr>, root_other <chr>, trunk_wire <chr>,
# trnk_light <chr>, trnk_other <chr>, brch_light <chr>, brch_shoe <chr>,
# brch_other <chr>, address <chr>, zipcode <chr>, zip_city <chr>,
# cb_num <chr>, borocode <chr>, boroname <chr>, cncldist <chr>, …
The full NYC Street Tree Census contains nearly 700,000 trees, but the API defaults to returning only 1,000 rows. For our example, we’ve downloaded a small sample.
If you want to access more data, you can override the limit by adding ?$limit=LIMIT to the API endpoint. For example, to read the first 5 rows: ...?$limit=5
Check the API documentation for more options and details on how to fetch larger datasets.
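A sketch of the same request with an explicit limit, using the $limit parameter described above:

```r
# Fetch only the first 5 rows of the tree census
nyc_trees_small <-
  fromJSON("https://data.cityofnewyork.us/resource/uvpi-gqnh.json?$limit=5") %>%
  as_tibble()
```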
nyc_trees %>%
select(longitude, latitude, stump_diam, spc_common, spc_latin, tree_id) %>%
mutate(across(longitude:stump_diam, as.numeric)) %>%
ggplot(aes(x=longitude, y=latitude, size=stump_diam)) +
geom_point(alpha=0.5) +
scale_size_continuous(name = "Stump diameter") +
labs(
x = "Longitude", y = "Latitude",
title = "Sample of New York City trees",
caption = "Source: NYC Open Data"
)
As with all APIs, a good place to start is the FRED API developer docs.
If you read through these, you’d see that the endpoint path we’re interested in is series/observations:
Head over to https://api.stlouisfed.org/fred/series/observations?series_id=GNPCA&api_key=YOUR_API_KEY&file_type=json, replacing “YOUR_API_KEY” with your actual key.
library(httr2)
req <- request("https://api.stlouisfed.org/fred/")
resp <- req %>%
req_url_path_append("series", "observations") %>%
req_url_query(
series_id = "GNPCA",
api_key = Sys.getenv("FRED_API_KEY"), # usethis::edit_r_environ()
file_type = "json"
) %>%
req_perform()
fred <- resp %>%
resp_body_json()
jsonedit(fred, mode = "view")
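The tibble printed below comes from flattening the observations element of the parsed JSON; a sketch of that step, assuming the list structure shown by jsonedit():

```r
fred_tbl <-
  fred$observations %>%
  bind_rows() # each observation is a named list; stack them into a tibble
fred_tbl
```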
# A tibble: 95 × 4
realtime_start realtime_end date value
<chr> <chr> <chr> <chr>
1 2024-11-11 2024-11-11 1929-01-01 1202.659
2 2024-11-11 2024-11-11 1930-01-01 1100.67
3 2024-11-11 2024-11-11 1931-01-01 1029.038
4 2024-11-11 2024-11-11 1932-01-01 895.802
5 2024-11-11 2024-11-11 1933-01-01 883.847
6 2024-11-11 2024-11-11 1934-01-01 978.188
7 2024-11-11 2024-11-11 1935-01-01 1065.716
8 2024-11-11 2024-11-11 1936-01-01 1201.443
9 2024-11-11 2024-11-11 1937-01-01 1264.393
10 2024-11-11 2024-11-11 1938-01-01 1222.966
# ℹ 85 more rows
endpoint <- "https://api.wr-rims-prod.pulselive.com/rugby/v3/rankings/mru?language=en"
rugby <- fromJSON(endpoint)
str(rugby)
List of 3
$ label : chr "Mens Rugby Union"
$ entries :'data.frame': 113 obs. of 5 variables:
..$ team :'data.frame': 113 obs. of 6 variables:
.. ..$ id : chr [1:113] "39" "37" "36" "42" ...
.. ..$ altId : chr [1:113] "a49729a5-a9e7-4a53-b18b-13b81d4752f4" "ec6ede05-0367-4b75-a66d-2d7a58950dcd" "f683c048-abc7-4713-9286-4dd1921a3285" "6f456794-9a4a-4623-abba-c7f93fbaf7d7" ...
.. ..$ name : chr [1:113] "South Africa" "New Zealand" "Ireland" "France" ...
.. ..$ abbreviation: chr [1:113] "RSA" "NZL" "IRE" "FRA" ...
.. ..$ countryCode : chr [1:113] "ZAF" "NZL" "IRL" "FRA" ...
.. ..$ annotations : logi [1:113] NA NA NA NA NA NA ...
..$ pts : num [1:113] 92.5 91.2 90.6 87 85.6 ...
..$ pos : int [1:113] 1 2 3 4 5 6 7 8 9 10 ...
..$ previousPts: num [1:113] 91.8 89.7 92.1 87 84.3 ...
..$ previousPos: int [1:113] 2 3 1 4 6 7 5 9 10 8 ...
$ effective:List of 3
..$ millis : num 1.73e+12
..$ gmtOffset: num 0
..$ label : chr "2024-11-11"
rankings <-
bind_cols(
rugby$entries$team,
rugby$entries %>% select(pts:previousPos)
) %>%
clean_names() %>%
select(-c(id, alt_id, annotations)) %>% ## These columns aren't of much interest
select(pos, pts, everything()) %>% ## Reorder the remaining columns
as_tibble()
rankings
# A tibble: 113 × 7
pos pts name abbreviation country_code previous_pts previous_pos
<int> <dbl> <chr> <chr> <chr> <dbl> <int>
1 1 92.5 South Africa RSA ZAF 91.8 2
2 2 91.2 New Zealand NZL NZL 89.7 3
3 3 90.6 Ireland IRE IRL 92.1 1
4 4 87.0 France FRA FRA 87.0 4
5 5 85.6 Argentina ARG ARG 84.3 6
6 6 82.7 Scotland SCO SCO 83.4 7
7 7 82.6 England ENG ENG 84.4 5
8 8 81.1 Australia AUS AUS 79.3 9
9 9 80.1 Fiji FIJ FJI 79.1 10
10 10 78.7 Italy ITA ITA 80.0 8
# ℹ 103 more rows
start_date <- ymd("2004-01-01")
end_date <- floor_date(today(), unit="years")
dates <- seq(start_date, end_date, by="years")
## Get the nearest Monday to January 1 so it matches the ranking release dates.
dates <- floor_date(dates, "week", week_start = getOption("lubridate.week.start", 1))
dates
[1] "2003-12-29" "2004-12-27" "2005-12-26" "2007-01-01" "2007-12-31"
[6] "2008-12-29" "2009-12-28" "2010-12-27" "2011-12-26" "2012-12-31"
[11] "2013-12-30" "2014-12-29" "2015-12-28" "2016-12-26" "2018-01-01"
[16] "2018-12-31" "2019-12-30" "2020-12-28" "2021-12-27" "2022-12-26"
[21] "2024-01-01"
rugby_scrape <-
function(x) {
# cat("Scraping date", x, "\n") # Uncomment this line to see the progress
endpoint = paste0("https://api.wr-rims-prod.pulselive.com/rugby/v3/rankings/mru?language=en&date=", x)
rugby = fromJSON(endpoint)
rankings =
bind_cols(
rugby$entries$team,
rugby$entries %>% select(pts:previousPos)
) %>%
clean_names() %>%
mutate(date = x) %>%
select(-c(id, alt_id, annotations)) %>%
select(date, pos, pts, everything()) %>%
as_tibble()
Sys.sleep(1)
return(rankings)
}
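The combined history printed below comes from mapping the scraper over the dates vector, the same pattern we used for Rozetka; rankings_history is the name the plotting code further down expects:

```r
rankings_history <-
  lapply(dates, rugby_scrape) %>% # one tibble of rankings per release date
  bind_rows()                     # stack into a single long history
rankings_history
```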
# A tibble: 2,111 × 8
date pos pts name abbreviation country_code previous_pts
<date> <int> <dbl> <chr> <chr> <chr> <dbl>
1 2003-12-29 1 94.0 England ENG ENG 94.0
2 2003-12-29 2 90.1 New Zealand NZL NZL 90.1
3 2003-12-29 3 86.6 Australia AUS AUS 86.6
4 2003-12-29 4 82.7 France FRA FRA 82.7
5 2003-12-29 5 81.2 South Africa RSA ZAF 81.2
6 2003-12-29 6 80.5 Ireland IRE IRL 80.5
7 2003-12-29 7 78.0 Argentina ARG ARG 78.0
8 2003-12-29 8 76.9 Wales WAL WAL 76.9
9 2003-12-29 9 76.4 Scotland SCO SCO 76.4
10 2003-12-29 10 73.5 Samoa SAM WSM 73.5
# ℹ 2,101 more rows
# ℹ 1 more variable: previous_pos <int>
teams <- c("NZL", "IRE", "ENG", "JPN")
team_cols <- c("NZL"="black", "IRE"="#4DAF4A", "ENG"="#377EB8", "JPN" = "red")
rankings_history %>%
ggplot(aes(x=date, y=pts, group=abbreviation)) +
geom_line(col = "grey") +
geom_line(
data = rankings_history %>% filter(abbreviation %in% teams),
aes(col=fct_reorder2(abbreviation, date, pts)),
lwd = 1
) +
scale_color_manual(values = team_cols) +
labs(
x = "Date", y = "Points",
title = "International rugby rankings", caption = "Source: World Rugby"
) +
theme(legend.title = element_blank())