In this article, I explore an idea suggested by Floyd Everest, which involves analysing the connections between PhD students through shared supervisors using a graph structure. By treating both students and academic staff as nodes, and supervisory relationships as edges, we can visualise the structure of academic mentorship as a network. This may offer insight into the flow of knowledge, influence, and collaboration within the Department of Econometrics and Business Statistics (EBS) at Monash Business School. Krisanat Anukarnsakulchularp already created an SVG version of this network and gave me some pointers for how I can scrape the data. I share my attempts at visualising this network interactively. I also explore the data further.
Code
library(purrr)
library(dplyr)
library(readr)
library(stringr)
library(visNetwork)
library(ggplot2)
library(tidyr)
library(patchwork)
Code
# URLs of the four result pages of the Monash Business School graduate
# research listing, filtered to Econometrics and Business Statistics.
# paste0() recycles the fixed pieces across the page numbers 1:4.
url_list <- paste0(
  'https://www.monash.edu/business/research/our-researchers/',
  'graduate-research-students-and-supervisors',
  '?queries_degree_query=Econometrics+and+Business+Statistics',
  '&queries_degree_query_posted=1',
  '&result_1619149_result_page=',
  1:4
)
#' Scrape one page of the graduate research student listing.
#'
#' Every rvest function is namespace-qualified because rvest is not attached
#' with `library()` in this script.
#'
#' @param url URL of a single listing page.
#' @return A data frame with columns `name`, `description`, `supervisors`,
#'   `image` and `website`. The blurb fields are joined to the thumbnail
#'   fields on `name` with an inner join, so a student whose blurb name does
#'   not equal the thumbnail `alt` text is dropped.
get_data <- function(url) {
  page <- rvest::read_html(url)

  # Thumbnail links: image source, student name (the image alt text) and the
  # address the thumbnail links to.
  image_links <- page |>
    rvest::html_elements(".box-listing-element__thumb-link") |>
    map(\(x) {
      img <- rvest::html_element(x, 'img')
      list(
        image = rvest::html_attr(img, "src"),
        name = rvest::html_attr(img, "alt"),
        website = rvest::html_attr(x, 'href')
      )
    }) |>
    bind_rows()

  # Blurbs: the first three children are read as name, description and the
  # "Supervisors: ..." line, in that order.
  page |>
    rvest::html_elements(".box-listing-element__blurb") |>
    map(\(x) {
      elements <- rvest::html_children(x)
      list(
        name = elements[[1]] |>
          rvest::html_text(),
        description = elements[[2]] |>
          rvest::html_text() |>
          str_trim(),
        supervisors = elements[[3]] |>
          rvest::html_text() |>
          # Keep only the text before the first CRLF.
          str_split('\r\n') |>
          pluck(1, 1) |>
          str_trim() |>
          str_remove('Supervisors: ') |>
          # Supervisors are separated by commas and/or " and ".
          str_split_1('(,| and )') |>
          str_trim() |>
          # Thank you Krisanat for the following line
          # (removes the first matching academic title from each name).
          str_remove("Prof |A/Prof |Dr |Prof. |Dr. |Assoc Prof. |Professor ")
      )
    }) |>
    reduce(bind_rows) |>
    inner_join(image_links, by = join_by(name))
}
# Scrape the student listing only when no cached copy exists, then always
# load the data from the cache so later chunks see the same thing either way.
if (!file.exists('students.csv')) {
  students <- url_list |>
    map(get_data) |>
    bind_rows() |>
    rename(supervisor = supervisors) |>
    mutate(
      # Collapse spelling variants of the same supervisor onto one name;
      # every other value is kept as scraped.
      supervisor = case_match(
        supervisor,
        c("David Frazier", "David T. Frazier") ~ "David Frazier",
        c("Di Cook", "Dianne Cook") ~ "Dianne Cook",
        c("Farshid Vahid", "Farshid Vahid-Araghi") ~ "Farshid Vahid",
        c("Gael M. Martin", "Gael Martin") ~ "Gael Martin",
        c("Rob Hyndman", "Rob J Hyndman") ~ "Rob Hyndman",
        c("Susan VanderPlas", "Susan Vanderplus") ~ "Susan VanderPlas",
        c("Thiyanga S. Talagala", "Thiyanga Talagala") ~ "Thiyanga Talagala",
        .default = supervisor
      )
    )
  write_csv(students, 'students.csv')
}

# All five columns are read as character.
students <- read_csv(
  'students.csv',
  col_types = cols(
    name = "c",
    description = "c",
    supervisor = "c",
    image = "c",
    website = "c"
  )
)
# Scrape the EBS staff directory only when no cached copy exists, then always
# load the data from the cache.
if (!file.exists('staff.csv')) {
  staff <- 'https://www.monash.edu/business/econometrics-and-business-statistics/our-people/staff-directory' |>
    rvest::read_html() |>
    rvest::html_elements('.group-list') |>
    map(\(group) {
      group |>
        rvest::html_elements('.row') |>
        map(\(entry) {
          # Text of every <li> under the entry's first <ul>.
          items <- entry |>
            rvest::html_element('ul') |>
            rvest::html_elements('li') |>
            rvest::html_text()
          list(
            name = str_trim(
              rvest::html_text(rvest::html_element(entry, 'strong'))
            ),
            website = rvest::html_attr(
              rvest::html_element(entry, 'a'),
              'href'
            ),
            image = rvest::html_attr(
              rvest::html_element(entry, 'img'),
              'src'
            ),
            description = paste0(items, collapse = ', ')
          )
        })
    }) |>
    bind_rows() |>
    # A mailto: link is not a website, so blank it out.
    mutate(website = ifelse(str_detect(website, 'mailto'), '', website))
  write_csv(staff, 'staff.csv')
}

# All four columns are read as character.
staff <- read_csv(
  'staff.csv',
  col_types = cols(
    name = "c",
    website = "c",
    image = "c",
    description = "c"
  )
)
Code
#' Download profile images to `<dir>/<clean name>.jpg`.
#'
#' Compared with calling `download.file()` directly on every row, this:
#' * creates the target directory if it does not exist yet;
#' * skips rows without an image URL instead of failing on `NA`;
#' * skips files that are already present, so re-rendering does not
#'   re-download everything (mirrors the caching of the scraped CSV files);
#' * writes in binary mode, which is required for image files on Windows.
#'
#' @param people Data frame with character columns `name` and `image`.
#' @param dir Directory the images are written to.
#' @return `people`, invisibly.
download_profile_images <- function(people, dir = 'images') {
  dir.create(dir, showWarnings = FALSE, recursive = TRUE)

  targets <- people |>
    filter(!is.na(image)) |>
    distinct(name, image)

  walk2(targets$name, targets$image, \(name, image) {
    destfile <- file.path(
      dir,
      paste0(janitor::make_clean_names(name), '.jpg')
    )
    if (!file.exists(destfile)) {
      download.file(image, destfile = destfile, mode = 'wb')
    }
  })

  invisible(people)
}

download_profile_images(students)
download_profile_images(staff)
Code
# One row per student with the fields shown in the node tooltip.
student_info <- students |>
  select(name, description, website) |>
  distinct()

# Staff from the directory, plus supervisors named on student profiles that
# are not in the directory (these have no description or website).
# The extra rows are reduced to the same three columns as the directory rows
# so that distinct() really yields one row per supervisor; otherwise the
# student's own `supervisor` and `image` columns would be carried along.
staff_info <- staff |>
  select(name, description, website) |>
  bind_rows(
    students |>
      filter(!supervisor %in% staff$name) |>
      mutate(
        name = supervisor,
        description = NA_character_,
        website = NA_character_
      ) |>
      select(name, description, website)
  ) |>
  distinct()

# One edge per student-supervisor pair.
graph_edges <- students |>
  select(
    from = name,
    to = supervisor
  )

# Node table for visNetwork: `id`, `label`, `title` (tooltip HTML), `image`
# and `group` ('student' or 'staff').
all_info <- bind_rows(
  student_info |>
    mutate(group = 'student'),
  staff_info |>
    mutate(group = 'staff')
) |>
  select(name, description, website, group) |>
  mutate(image = glue::glue('images/{janitor::make_clean_names(name, allow_dupes=TRUE)}.jpg')) |>
  # Nodes without a website use the default image.
  # NOTE(review): this keys on `website`, not on whether an image file was
  # actually downloaded - confirm that is the intended rule.
  mutate(image = ifelse(is.na(website), 'images/default.jpg', image)) |>
  mutate(
    id = name,
    label = name,
    # The tooltip links the name to the website when there is one.
    title = case_when(
      is.na(website) ~ glue::glue('{name}<br>{description}', .na = ''),
      TRUE ~ glue::glue('<a target="_blank" href = "{website}">{name}</a><br>{description}', .na = '')
    )
  ) |>
  distinct() |>
  # Keep only people that take part in at least one supervision edge.
  filter(id %in% c(graph_edges$from, graph_edges$to))
#' Build the interactive network widget used throughout the article.
#'
#' @param nodes Node data frame passed to `visNetwork()`.
#' @param edges Edge data frame passed to `visNetwork()`.
#' @param degrees Passed as `degree` to the `highlightNearest` option.
#' @return The widget returned by the `visNetwork()` pipeline.
my_visnetwork <- function(nodes, edges, degrees = 1) {
  # `color` replaces the original `font-color`, which is not a CSS property
  # and was therefore ignored by browsers.
  tooltip_style <- paste0(
    'position: fixed;visibility:hidden;padding: 5px;',
    'font-family: verdana;font-size:14px;color:#000000;background-color: #f5f4ed;',
    '-moz-border-radius: 3px;-webkit-border-radius: 3px;border-radius: 3px;',
    'border: 1px solid #808074;box-shadow: 3px 3px 10px rgba(0, 0, 0, 0.2);',
    'max-width:200px;word-break: normal;'
  )

  visNetwork(nodes, edges, width = "100%") |>
    # Fixed seed so the layout is reproducible between renders.
    visLayout(randomSeed = 42) |>
    visNodes(
      shape = "circularImage",
      size = 20,
      borderWidth = 3,
      shapeProperties = list(useBorderWithImage = TRUE)
    ) |>
    visEdges(smooth = TRUE, labelHighlightBold = FALSE) |>
    visOptions(
      highlightNearest = list(
        enabled = TRUE,
        degree = degrees,
        labelOnly = TRUE,
        hover = FALSE,
        algorithm = "hierarchical"
      )
    ) |>
    visInteraction(
      tooltipStyle = tooltip_style,
      selectConnectedEdges = FALSE
    )
}
# Student-student edges: two students are linked when they share a
# supervisor, whose name becomes the edge title. The self-join yields every
# ordered pair, so `from.x < from.y` keeps each pair once and drops
# self-pairs.
only_students <- graph_edges |>
  inner_join(
    graph_edges,
    by = join_by(to),
    relationship = "many-to-many"
  ) |>
  filter(from.x < from.y) |>
  rename(
    title = to,
    from = from.x,
    to = from.y
  )

# Staff-staff edges: two supervisors are linked when they share a student,
# whose name becomes the edge title.
only_staff <- graph_edges |>
  inner_join(
    graph_edges,
    by = join_by(from),
    relationship = "many-to-many"
  ) |>
  filter(to.x < to.y) |>
  rename(
    title = from,
    from = to.x,
    to = to.y
  )
Code
# Students only, linked through shared supervisors.
p_students <- my_visnetwork(
  all_info |>
    filter(group == 'student') |>
    select(!group),
  edges = only_students,
  degrees = 1
)

# Staff only, linked through shared students.
p_staff <- my_visnetwork(
  all_info |>
    filter(group == 'staff') |>
    select(!group),
  edges = only_staff,
  degrees = 1
)

# Everyone, with the raw student-supervisor edges.
p_all <- my_visnetwork(all_info, graph_edges, degrees = 1)
There are some interesting properties of the student network graph.
- All PhD students at EBS are linked to all other PhD students through the network, but the link is surprisingly weak.
- Students appear to cluster based on primary campus. This makes sense as the location of supervisors informs where the student is based. However, there are some exceptions.
- Students appear to cluster loosely based on research interest e.g. Bayesian analysis, analytics, econometrics. Again, there are some exceptions.
Visualising PhD student website domains
Code
#' Reduce URLs to the last two labels of their host name.
#'
#' Vectorised with base regular expressions instead of `Vectorize()`, so the
#' result is always a plain, unnamed character vector of the same length as
#' `url` (including for zero-length input).
#'
#' @param url Character vector of URLs; `NA` is returned as `NA`.
#' @return Character vector, e.g. `"https://www.a.b.example.com/x"` becomes
#'   `"example.com"`. Hosts with fewer than two dots are returned unchanged
#'   after removing the scheme, a leading `www.` and the path.
get_domain <- function(url) {
  host <- sub("^https?://", "", url)
  host <- sub("^www\\.", "", host)
  # Drop everything from the first "/" onwards.
  host <- sub("/.*$", "", host)
  # Keep only the last two dot-separated labels; hosts with fewer than two
  # dots do not match and pass through as they are.
  sub("^.*\\.([^.]*\\.[^.]*)$", "\\1", host)
}
Code
# One row per student with a linked website, classified by its domain.
# Changes to the regular expressions:
# * literal dots are escaped, so '.' no longer matches any character;
# * 'edu' is matched as a whole label, which also covers 'edu.au' - the
#   value get_domain() returns for '*.edu.au' hosts;
# * the suffix is the whole final label, so TLDs longer than three letters
#   are not truncated.
website_data <- students |>
  distinct(name, website) |>
  mutate(domain = get_domain(website)) |>
  drop_na() |>
  mutate(
    # LinkedIn and university pages are shown under their own domain;
    # everything else counts as a personal website.
    personal_website = case_when(
      str_detect(domain, 'linkedin|(^|\\.)edu(\\.|$)') ~ domain,
      TRUE ~ 'Personal website'
    ),
    # FALSE for the listed hosting/profile domains, TRUE otherwise.
    # `domain` has no missing values here because of drop_na() above.
    custom_domain = !str_detect(
      domain,
      'google\\.com|linkedin\\.com|netlify\\.app|(^|\\.)edu(\\.|$)|github\\.io'
    ),
    suffix = str_extract(domain, '\\.[a-z]+$')
  )

# Number of students without a linked website.
n_na <- students |>
  filter(is.na(website)) |>
  distinct(name) |>
  nrow()
There were 3 students with no website linked. The attributes of the remaining websites linked by student profiles are shown in Figure 1. The majority of students linked to a LinkedIn profile, which is not surprising. The “.com” suffix is the most popular, while “.app” is the second most popular because Netlify is a popular free host. Netlify, Google Sites, and GitHub are the hosts used by people who did not have a custom domain. These may be a good place to start for someone needing to host a website. In fact, this website is hosted using Netlify at the time of writing.
Code
# Left panel: number of students per website category, most frequent first.
p1 <- website_data |>
  count(personal_website) |>
  ggplot(aes(x = forcats::fct_infreq(personal_website, w = n), y = n)) +
  geom_col() +
  labs(x = '', y = 'Count') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Middle panel: domain suffixes of personal websites, split by whether the
# domain is custom or provided by a free host.
p2 <- website_data |>
  filter(personal_website == 'Personal website') |>
  mutate(
    custom_domain = if_else(custom_domain, "Custom domain", "Free domain")
  ) |>
  ggplot(aes(x = forcats::fct_infreq(suffix), fill = custom_domain)) +
  geom_bar() +
  labs(x = "", y = '', fill = "") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = 'top'
  )

# Right panel: hosts behind the personal websites without a custom domain.
p3 <- website_data |>
  filter(personal_website == 'Personal website', !custom_domain) |>
  count(domain) |>
  ggplot(aes(x = forcats::fct_infreq(domain, w = n), y = n)) +
  geom_col() +
  labs(x = '', y = '') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Combine the three panels side by side (patchwork).
p1 + p2 + p3 + plot_layout(widths = c(2, 3, 2))
Most “popular” supervisors
The supervisors with the most students are shown in Table 1.
Code
# Table 1: supervisors with at least three students, most students first.
students |>
  distinct(supervisor, name) |>
  group_by(supervisor) |>
  summarise(
    n = n(),
    # Alphabetical, comma-separated list of the supervisor's students.
    students = paste0(sort(name), collapse = ', ')
  ) |>
  filter(n >= 3) |>
  arrange(desc(n)) |>
  knitr::kable(col.names = c('Supervisor', 'Number of students', 'Students'))
Supervisor | Number of students | Students |
---|---|---|
Dianne Cook | 7 | Harriet Mason, Janith Wanniarachchi, Jayani P.G. Lakshika, Krisanat Anukarnsakulchularp, Tina Rashid Jafari, Weihao Li, Xuan Nhat Minh Nguyen |
Jiti Gao | 7 | Denghuan Ye, Lu Wang, Pubudika Nambukaluarachchige Dona, Renhe Wang, Taiga Saito, Yueheng Hu, Ziyi Deng |
Dan Zhu | 5 | Lan Luo, Wenjun Shen, Yanhong Wan, Zhixiang (Elvis) Yang, Ziyi Deng |
Athanasios Pantelous | 4 | Asela Perera Nawagamuwage, Haizhou Cui, Ruohua Tang, Zixuan Fang |
Bin Peng | 4 | Denghuan Ye, Lu Wang, Wenjun Shen, Zhixiang (Elvis) Yang |
David Frazier | 4 | Chaya Weerasinghe, Floyd Everest, Jianying Shelly Xie, Kenyon Ng |
Rob Hyndman | 4 | Cynthia A. Huang, Mitchell O’Hara-Wild, Nuwani Kodikara Palihawadana, Sixian Tang |
Wei Wei | 4 | Xiaotong Sun, Yanhong Wan, Yueheng Hu, Yuru Sun |
Bonsoo Koo | 3 | Haizhou Cui, Ze-Yu Zhong, Zixuan Fang |
Catherine Forbes | 3 | Cheok Hang Lei, Filip Reierson, Jianying Shelly Xie |
George Athanasopoulos | 3 | Bets Ruscoe, Mitchell O’Hara-Wild, Shakila Mallawathantrige |
Jackie Li | 3 | Himasha Warnakulasooriya, Shakila Mallawathantrige, Sixian Tang |
Michael Lydeamore | 3 | Jarryd Chapman, Jayani P.G. Lakshika, Krisanat Anukarnsakulchularp |
Xibin Zhang | 3 | Jianfeng Liang, Renhe Wang, Xiaotong Sun |
Xueyan Zhao | 3 | Gayani Ishara Rathnayake, Heshani Madigasekara, Nimni Senanayaka |
Most “popular” students
The students with the most supervisors are shown in Table 2.
Code
# Table 2: students with more than two supervisors, most supervisors first.
students |>
  distinct(supervisor, name) |>
  group_by(name) |>
  summarise(
    n = n(),
    # Alphabetical, comma-separated list of the student's supervisors.
    supervisors = paste0(sort(supervisor), collapse = ', ')
  ) |>
  filter(n > 2) |>
  arrange(desc(n)) |>
  knitr::kable(col.names = c('Student', 'Number of supervisors', 'Supervisors'))
Student | Number of supervisors | Supervisors |
---|---|---|
Himasha Warnakulasooriya | 4 | Brett Inder, Hamza Hanbali, Jackie Li, Jessica Leung |
Janith Wanniarachchi | 4 | Dianne Cook, Kate Saunders, Patricia Menendez, Thiyanga Talagala |
Jayani P.G. Lakshika | 4 | Dianne Cook, Michael Lydeamore, Paul Harrison, Thiyanga Talagala |
Weihao Li | 4 | Dianne Cook, Emi Tanaka, Klaus Ackermann, Susan VanderPlas |
Asela Perera Nawagamuwage | 3 | Athanasios Pantelous, Ioannis Kougioumtzoglou, Nikos Kavallaris |
Chaya Weerasinghe | 3 | David Frazier, Gael Martin, Ruben Loaiza-Maya |
Cynthia A. Huang | 3 | Rob Hyndman, Sarah Goodwin, Simon Angus |
Gayani Ishara Rathnayake | 3 | Akanksha Negi, Otavio Bartalotti, Xueyan Zhao |
Harriet Mason | 3 | Dianne Cook, Sarah Goodwin, Susan VanderPlas |
Lu Wang | 3 | Bin Peng, Jiti Gao, Yanrong Yang |
Nuwani Kodikara Palihawadana | 3 | Louise M Ryan, Rob Hyndman, Xiaoqian Wang |
Ruohua Tang | 3 | Athanasios Pantelous, Yulia Merkoulova, Yuxin Xie |
Sixian Tang | 3 | Jackie Li, Leonie Tickle, Rob Hyndman |
Xiaotong Sun | 3 | Heather Anderson, Wei Wei, Xibin Zhang |
Yuru Sun | 3 | Ole Maneesoonthorn, Wei Wei, Yong Song |
Zhixiang (Elvis) Yang | 3 | Bin Peng, Dan Zhu, Farshid Vahid |
Zixuan Fang | 3 | Athanasios Pantelous, Bonsoo Koo, Stavros Stavroglou |
Conclusion
This article explored the connections between PhD students and supervisors at Monash University. The network of students and supervisors was visualised using the visNetwork package, revealing clustering based on campus and research interests. The analysis also highlighted particularly connected supervisors and students within the network. I also briefly explored the domains and hosts people used for their personal websites. I will leave more sophisticated analysis of these networks to others, but I hope someone finds this preliminary exploration interesting.