library(XML)
library(stringr)
library(rvest)
library(tidyverse)
library(kableExtra)
This is an example of how to scrape grant information for active research projects from a college’s official website. Please follow the robots.txt rules of the academic institution’s website.
1 Web Scraping
# Scrape the topic and award amount of every project linked from the nine
# listing pages. Results are collected in two parallel vectors indexed by
# `iter`. The vectors are pre-allocated with a hard-coded length of 84
# (presumably the number of projects found at the time of writing -- TODO
# confirm); R extends them automatically if more projects are found.
topics <- rep(NA_character_, 84)
fundings <- rep(NA_real_, 84)
iter <- 0 # running index of the current project across all pages
for (page in 1:9) {
  ## parent webpage
  scrape_url <- paste0("http://XXXXXXXXXXXXXXXXXXXXXX", page)
  html_form_page <- read_html(scrape_url)
  ## find the child web page containing projects' name, total award, topic etc.
  child_url <- html_form_page |>
    html_elements("h3 a[href]") |>
    html_attr("href")
  # seq_along() instead of 1:length(): a page without any project link yields
  # an empty vector, and 1:0 would iterate over c(1, 0) and request an NA URL.
  for (item in seq_along(child_url)) {
    iter <- iter + 1
    child_html_text <- child_url[item] |>
      read_html() |>
      html_elements("div[class='study_wrapper']") |>
      html_text()
    # First word following the "Topic(s): " label.
    topic <- child_html_text |>
      str_extract(pattern = "Topic\\(s\\)\\: [a-zA-Z]+\\b") |>
      str_replace(pattern = "Topic\\(s\\)\\: ", "")
    # First dollar amount in the text. The pattern accepts any number of
    # thousands groups so that "$1,234,567" is not truncated to "$1,234".
    funding <- child_html_text |>
      str_extract(pattern = "\\$\\d+(\\,\\d+)+") |>
      str_replace_all(pattern = "\\$|\\,", "") |>
      as.numeric()
    # `[1]` keeps the first match and yields NA when the wrapper element is
    # missing, instead of failing on a zero-length replacement.
    topics[iter] <- topic[1]
    fundings[iter] <- funding[1]
  }
}
# Combine the scraped vectors into a single data frame with the columns
# `topic` and `funding_amount`, then apply two manual corrections.
dat <- data.frame(topic = topics, funding_amount = fundings) |>
  # Append a "Marijuana" row with a hard-coded amount.
  add_row(topic = "Marijuana", funding_amount = 3743) |>
  # Relabel the scraped "TobaccoMarijuana" topic as "Tobacco".
  mutate(topic = ifelse(topic == "TobaccoMarijuana", "Tobacco", topic))
2 Visualization in ggplot2
## funding per project
# Mean funding amount per topic; `topic` becomes a factor ordered by
# decreasing mean so the bars are drawn from largest to smallest.
dat1 <- dat |>
  group_by(topic) |>
  summarise(funding_amount_mean = mean(funding_amount, na.rm = TRUE)) |>
  mutate(topic = fct_reorder(topic, desc(funding_amount_mean)))

# Bar chart of the per-topic mean, with the y axis labelled in millions ("M").
ggplot(dat1) +
  aes(x = topic, y = funding_amount_mean) +
  geom_col(fill = "darkblue") +
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  labs(
    y = "funding per project",
    title = "Funding for each project during 2019 to 2022"
  ) +
  theme(legend.position = "none", text = element_text(size = 12)) # remove legend
## Total funding amount
# Total funding and number of rows per topic. `topic` is ordered by decreasing
# total; `highlight` is a factor that is "1" for the topic(s) with the largest
# total and "0" otherwise.
dat2 <- dat |>
  group_by(topic) |>
  summarise(
    funding_amount_sum = sum(funding_amount, na.rm = TRUE),
    n = n()
  ) |>
  mutate(
    topic = fct_reorder(topic, desc(funding_amount_sum)),
    highlight = ifelse(funding_amount_sum == max(funding_amount_sum), 1, 0) |>
      as.factor()
  )

# Bar chart of the per-topic total. Each bar carries its total in millions
# (geom_text) and its row count (geom_label); the top topic is drawn in red.
ggplot(dat2) +
  aes(x = topic, y = funding_amount_sum, fill = highlight) +
  geom_col() +
  geom_text(
    aes(label = round(funding_amount_sum / 10^6, 3)),
    vjust = 0.001, size = 5
  ) +
  geom_label(aes(label = n), vjust = 0.999, size = 5, color = "white") +
  # Named values keep the colours tied to the highlight levels even when only
  # one level is present (e.g. a single topic).
  scale_fill_manual(values = c("0" = "darkblue", "1" = "red2")) +
  labs(
    x = "", y = "total amount of funding",
    title = "Total Amount of Funding and Number of Grant Projects during 2019-2022",
    subtitle = "Active research at Health Promotion Center from 2019 to 2022",
    caption = "source: https://healthpromotionresearch.org/Active-Studies/"
  ) +
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  theme(legend.position = "none", text = element_text(size = 10)) # remove legend